In [1]:
import tempfile

import pandas as pd
import csv
import numpy as np
import tensorflow as tf


In [2]:
#Load the raw tables, passing explicit dtypes for selected columns

train = pd.read_csv('../../wl2522/project/dataset/train.csv',
                    dtype={'source_system_tab': str})
test = pd.read_csv('../../wl2522/project/dataset/test.csv',
                   dtype={'source_system_tab': str })
members = pd.read_csv('../../wl2522/project/dataset/members.csv',
                      dtype={'msno': str, 'city': str, 'registered_via': str})
songs = pd.read_csv('../../wl2522/project/dataset/songs.csv',
                    dtype={'genre_ids': str, 'language': str, 'song_length': int})

#Reduce the number of features for the baseline models

songs = songs.drop(['composer', 'lyricist'], axis=1)

#Infer a missing value based on other features (the row of songs labelled 605127)

songs.loc[605127, 'language'] = '31.0'

#Impute the remaining missing values in every table with the placeholder 'unknown'

for raw_table in (train, test, members, songs):
    raw_table.fillna(value='unknown', axis=1, inplace=True)

#Convert feature columns related to member registration to datetime format

for date_column in ('expiration_date', 'registration_init_time'):
    members[date_column] = pd.to_datetime(members[date_column], format='%Y%m%d')

#Create a new feature indicating the number of days a member was registered

membership_span = members['expiration_date'] - members['registration_init_time']
members['reg_duration'] = membership_span.dt.days


FileNotFoundError: File b'../../wl2522/project/dataset/train.csv' does not exist

In [4]:
#genres.csv stores the genre IDs of each song across multiple columns (one genre per column).
#It was generated once with the commented-out code below:

#genres = songs['genre_ids'].str.split('|')
#genres = genres.apply(pd.Series).add_prefix('genre_')
#genres.to_csv('dataset/genres.csv', index=False)

#NOTE(review): genres is not referenced again by the visible cells - confirm it is still needed
genres = pd.read_csv('../../wl2522/project/dataset/genres.csv', dtype=str)

#Merge the training and test data with song and member data.
#The training merges are inner joins; the test merges are left joins so every test row is kept.

train_set = train.merge(songs, on='song_id')
train_set = train_set.merge(members, on='msno')
test_set = test.merge(songs, on='song_id', how='left')
test_set = test_set.merge(members, on='msno', how='left')

#Separate the submission ids from the test set

ids = test_set['id']
test_set.drop('id', axis=1, inplace=True)

#Impute missing values in merged training and test sets

train_set.fillna(value='unknown', axis=1, inplace=True)
test_set.fillna(value='unknown', axis=1, inplace=True)

#Impute missing song lengths with an integer value to avoid errors due to conflicting data types

test_set['song_length'] = test_set['song_length'].replace('unknown', -99)

#ID given to values that do not occur in the training vocabulary.
#NOTE(review): training IDs start at 1, so 3 is also the ID of the third distinct training value
#of a column (whenever it has at least three); unseen values are then indistinguishable from it.
#0 is never assigned and may be the safer choice - confirm before changing.
UNSEEN_CATEGORY_ID = 3


def encode_categorical(train_df, test_df, column):
    """Replace the values of ``column`` in both frames with integer IDs.

    The vocabulary is the distinct values of ``train_df[column]`` in order of
    first appearance; the i-th distinct value (0-based) receives ID ``i + 1``.
    Values missing from that vocabulary receive ``UNSEEN_CATEGORY_ID``.
    Both frames are modified in place (the column is reassigned).

    Args:
        train_df: DataFrame that defines the vocabulary.
        test_df: DataFrame encoded with the same vocabulary.
        column: name of the column to encode; must exist in both frames.

    Returns:
        Tuple ``(vocab, id_map)``: the array of distinct training values and
        the dict mapping each of them to its integer ID.
    """
    vocab = train_df[column].unique()
    id_map = {value: index for index, value in enumerate(vocab, start=1)}
    for frame in (train_df, test_df):
        frame[column] = frame[column].map(id_map).fillna(UNSEEN_CATEGORY_ID).astype(int)
    return vocab, id_map


#Encode every categorical feature; the per-column vocabularies and maps keep their original names

msno_vocabs, msno_map = encode_categorical(train_set, test_set, 'msno')
song_vocabs, song_map = encode_categorical(train_set, test_set, 'song_id')
tab_vocabs, tab_map = encode_categorical(train_set, test_set, 'source_system_tab')
screen_vocabs, screen_map = encode_categorical(train_set, test_set, 'source_screen_name')
sctype_vocabs, sctype_map = encode_categorical(train_set, test_set, 'source_type')
gid_vocabs, gid_map = encode_categorical(train_set, test_set, 'genre_ids')
atname_vocabs, atname_map = encode_categorical(train_set, test_set, 'artist_name')
gender_vocabs, gender_map = encode_categorical(train_set, test_set, 'gender')
via_vocabs, via_map = encode_categorical(train_set, test_set, 'registered_via')
city_vocabs, city_map = encode_categorical(train_set, test_set, 'city')
lan_vocabs, lan_map = encode_categorical(train_set, test_set, 'language')

FileNotFoundError: File b'../../wl2522/project/dataset/genres.csv' does not exist

In [5]:
#NOTE(review): this cell repeats the merge and encoding steps that close the previous cell;
#it rebuilds train_set/test_set from train/test/songs/members, so running both gives the same
#result as running this one alone. Confirm whether one of the two copies can be removed.

#Merge the training and test data with song and member data.
#The training merges are inner joins; the test merges are left joins so every test row is kept.

train_set = train.merge(songs, on='song_id')
train_set = train_set.merge(members, on='msno')
test_set = test.merge(songs, on='song_id', how='left')
test_set = test_set.merge(members, on='msno', how='left')

#Separate the submission ids from the test set

ids = test_set['id']
test_set.drop('id', axis=1, inplace=True)

#Impute missing values in merged training and test sets

train_set.fillna(value='unknown', axis=1, inplace=True)
test_set.fillna(value='unknown', axis=1, inplace=True)

#Impute missing song lengths with an integer value to avoid errors due to conflicting data types

test_set['song_length'] = test_set['song_length'].replace('unknown', -99)

#ID given to values that do not occur in the training vocabulary.
#NOTE(review): training IDs start at 1, so 3 is also the ID of the third distinct training value
#of a column (whenever it has at least three); unseen values are then indistinguishable from it.
#0 is never assigned and may be the safer choice - confirm before changing.
UNSEEN_CATEGORY_ID = 3


def encode_categorical(train_df, test_df, column):
    """Replace the values of ``column`` in both frames with integer IDs.

    The vocabulary is the distinct values of ``train_df[column]`` in order of
    first appearance; the i-th distinct value (0-based) receives ID ``i + 1``.
    Values missing from that vocabulary receive ``UNSEEN_CATEGORY_ID``.
    Both frames are modified in place (the column is reassigned).

    Args:
        train_df: DataFrame that defines the vocabulary.
        test_df: DataFrame encoded with the same vocabulary.
        column: name of the column to encode; must exist in both frames.

    Returns:
        Tuple ``(vocab, id_map)``: the array of distinct training values and
        the dict mapping each of them to its integer ID.
    """
    vocab = train_df[column].unique()
    id_map = {value: index for index, value in enumerate(vocab, start=1)}
    for frame in (train_df, test_df):
        frame[column] = frame[column].map(id_map).fillna(UNSEEN_CATEGORY_ID).astype(int)
    return vocab, id_map


#Encode every categorical feature; the per-column vocabularies and maps keep their original names

msno_vocabs, msno_map = encode_categorical(train_set, test_set, 'msno')
song_vocabs, song_map = encode_categorical(train_set, test_set, 'song_id')
tab_vocabs, tab_map = encode_categorical(train_set, test_set, 'source_system_tab')
screen_vocabs, screen_map = encode_categorical(train_set, test_set, 'source_screen_name')
sctype_vocabs, sctype_map = encode_categorical(train_set, test_set, 'source_type')
gid_vocabs, gid_map = encode_categorical(train_set, test_set, 'genre_ids')
atname_vocabs, atname_map = encode_categorical(train_set, test_set, 'artist_name')
gender_vocabs, gender_map = encode_categorical(train_set, test_set, 'gender')
via_vocabs, via_map = encode_categorical(train_set, test_set, 'registered_via')
city_vocabs, city_map = encode_categorical(train_set, test_set, 'city')
lan_vocabs, lan_map = encode_categorical(train_set, test_set, 'language')

NameError: name 'train' is not defined

In [6]:
#Shuffle the data and split off 20% of the training set for use as a validation set

split_ratio = 0.8

#The seed makes the shuffle reproducible
shuffled_set = train_set.sample(frac=1, random_state=6)

#Compute the boundary once, from the full shuffled frame, so both parts use the same row count
split_index = int(split_ratio*shuffled_set.shape[0])
train_part = shuffled_set.iloc[:split_index]
val_part = shuffled_set.iloc[split_index:]

#Separate the labels from the training and validation sets.
#drop() is used without inplace=True: dropping a column in place on a slice of another frame
#raises SettingWithCopyWarning and may operate on a temporary copy.

y_train = train_part['target']
train_set = train_part.drop('target', axis=1)

y_val = val_part['target']
val_set = val_part.drop('target', axis=1)


NameError: name 'train_set' is not defined

In [7]:
#Designate the target feature name and the features to be used in the dataset

FEATURES = ['msno', 'gender', 'city', 'bd', 'registered_via',
            'song_id', 'artist_name', 'song_length', 'language', 'genre_ids',
            'source_system_tab', 'source_screen_name', 'source_type', 'reg_duration']
LABEL = 'target'

#One tf.int32 entry per feature followed by a single tf.float32 entry (15 dtypes in total)
Types = (tf.int32,) * len(FEATURES) + (tf.float32,)


def _vocabulary_column(key):
    """Categorical column whose vocabulary is the distinct integer IDs found in train_set[key].

    IDs outside that vocabulary are mapped to the default value -99.
    """
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key=key,
        vocabulary_list=train_set[key].unique(),
        dtype=tf.int32,
        default_value=-99)


#Use the feature_column module to input each feature column into the model

target = tf.feature_column.categorical_column_with_identity(key='target', num_buckets=2)

length = tf.feature_column.numeric_column(key='song_length', default_value=-1, dtype=tf.int32)
duration = tf.feature_column.numeric_column(key='reg_duration', default_value=-1, dtype=tf.int32)

registered = _vocabulary_column('registered_via')
gender = _vocabulary_column('gender')
city = _vocabulary_column('city')
language = _vocabulary_column('language')
artist = _vocabulary_column('artist_name')
tab = _vocabulary_column('source_system_tab')
screen = _vocabulary_column('source_screen_name')
source = _vocabulary_column('source_type')

#Bucket categorical features with many unique categories using a hash table with a size of approximately (n/0.8)*2

msno = tf.feature_column.categorical_column_with_hash_bucket(
    key='msno', hash_bucket_size=90000, dtype=tf.int32)

song_id = tf.feature_column.categorical_column_with_hash_bucket(
    key='song_id', hash_bucket_size=6000000, dtype=tf.int32)

#genre_ids is declared twice: once with an explicit vocabulary and once hashed
genre = _vocabulary_column('genre_ids')

hashed_genre = tf.feature_column.categorical_column_with_hash_bucket(
    key='genre_ids', hash_bucket_size=3000, dtype=tf.int32)

#Perform one hot encoding on categorical features with few unique values

indicator_registered, indicator_gender, indicator_city, indicator_genre = [
    tf.feature_column.indicator_column(column)
    for column in (registered, gender, city, genre)]
indicator_language, indicator_tab, indicator_screen, indicator_source = [
    tf.feature_column.indicator_column(column)
    for column in (language, tab, screen, source)]

#Embed categorical features into dense vectors (dimensions were chosen as approximately log2(n))

embedded_genre = tf.feature_column.embedding_column(genre, dimension=10)
embedded_song = tf.feature_column.embedding_column(song_id, dimension=22)
embedded_msno = tf.feature_column.embedding_column(msno, dimension=15)
embedded_artist = tf.feature_column.embedding_column(artist, dimension=15)

#Bucket member age into age ranges, with nonsensical values going into the 0-14 or the >80 buckets

age = tf.feature_column.numeric_column(key='bd', default_value=0, dtype=tf.int32)
age_bucket = tf.feature_column.bucketized_column(age, boundaries=[0, 14, 20, 30, 40, 50, 80])

#Assign features to be used in either the wide or the deep model (or both)

wide_columns = []
cross_columns = []

#embedded_artist, length and age_bucket are defined above but currently left out of the deep model
deep_columns = [
    indicator_gender, indicator_city, indicator_language, indicator_tab,
    indicator_screen, indicator_source, indicator_registered,
    embedded_msno, embedded_song, embedded_genre,
    duration,
]


NameError: name 'train_set' is not defined

In [8]:
def build_estimator(model_dir, model_type):
    """Create the TensorFlow estimator for the requested model type.

    Args:
        model_dir: directory handed to the estimator as its ``model_dir``.
        model_type: one of 'wide', 'deep' or 'combined'.

    Returns:
        A ``LinearClassifier`` for 'wide', a ``DNNClassifier`` for 'deep' or a
        ``DNNLinearCombinedClassifier`` for 'combined'.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on return).
    """
    if model_type == 'wide':
        return tf.estimator.LinearClassifier(model_dir=model_dir,
                                             feature_columns=wide_columns + cross_columns)

    if model_type == 'deep':
        return tf.estimator.DNNClassifier(model_dir=model_dir,
                                          feature_columns=deep_columns,
                                          hidden_units=[1024, 512, 256],
                                          optimizer=tf.train.AdamOptimizer(learning_rate=0.001,
                                                                           name='Adam'))

    if model_type == 'combined':
        # NOTE(review): only cross_columns feed the linear part here, whereas the
        # 'wide' model uses wide_columns + cross_columns - confirm this is intended.
        return tf.estimator.DNNLinearCombinedClassifier(model_dir=model_dir,
                                                        linear_feature_columns=cross_columns,
                                                        dnn_feature_columns=deep_columns,
                                                        dnn_hidden_units=[100, 50])

    raise ValueError("model_type must be 'wide', 'deep' or 'combined'; "
                     'got {!r}'.format(model_type))


In [9]:
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Methods to allow pandas.DataFrame."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from tensorflow.python.estimator.inputs.queues import feeding_functions

try:
  # pylint: disable=g-import-not-at-top
  # pylint: disable=unused-import
  import pandas as pd
  HAS_PANDAS = True
except IOError:
  # Pandas writes a temporary file during import. If it fails, don't use pandas.
  HAS_PANDAS = False
except ImportError:
  HAS_PANDAS = False


def pandas_input_fn(x,
                    y=None,
                    batch_size=128,
                    num_epochs=1,
                    shuffle=None,
                    queue_capacity=1000,
                    num_threads=1,
                    target_column='target'):
  """Returns input function that would feed Pandas DataFrame into the model.
  Note: `y`'s index must match `x`'s index.
  Args:
    x: pandas `DataFrame` object. It is copied, so the caller's frame is not
      modified when the target column is attached.
    y: pandas `Series` object. `None` if absent.
    batch_size: int, size of batches to return.
    num_epochs: int, number of epochs to iterate over data. If not `None`,
      read attempts that would exceed this value will raise `OutOfRangeError`.
    shuffle: bool, whether to read the records in random order.
    queue_capacity: int, size of the read queue. If `None`, it will be set
      roughly to the size of `x`.
    num_threads: Integer, number of threads used for reading and enqueueing. In
      order to have predicted and repeatable order of reading and enqueueing,
      such as in prediction and evaluation mode, `num_threads` should be 1.
    target_column: str, name to give the target column `y`.
  Returns:
    Function, that has signature of ()->(dict of `features`, `target`)
  Raises:
    ValueError: if `x` already contains a column with the same name as `y`, or
      if the indexes of `x` and `y` don't match.
    TypeError: `shuffle` is not bool, or pandas is not installed.
  """
  if not HAS_PANDAS:
    raise TypeError(
        'pandas_input_fn should not be called without pandas installed')

  # `shuffle` deliberately has no usable default: callers must choose.
  if not isinstance(shuffle, bool):
    raise TypeError('shuffle must be explicitly set as boolean; '
                    'got {}'.format(shuffle))

  x = x.copy()
  if y is not None:
    if target_column in x:
      raise ValueError(
          'Cannot use name %s for target column: DataFrame already has a '
          'column with that name: %s' % (target_column, x.columns))
    if not np.array_equal(x.index, y.index):
      raise ValueError('Index for x and y are mismatched.\nIndex for x: %s\n'
                       'Index for y: %s\n' % (x.index, y.index))
    # Carry the labels through the queue as one more column of the copy.
    x[target_column] = y

  # TODO(mdan): These are memory copies. We probably don't need 4x slack space.
  # The sizes below are consistent with what I've seen elsewhere.
  if queue_capacity is None:
    if shuffle:
      queue_capacity = 4 * len(x)
    else:
      queue_capacity = len(x)
  min_after_dequeue = max(queue_capacity / 4, 1)

  def input_fn():
    """Pandas input function."""
    queue = feeding_functions._enqueue_data(  # pylint: disable=protected-access
        x,
        queue_capacity,
        shuffle=shuffle,
        min_after_dequeue=min_after_dequeue,
        num_threads=num_threads,
        enqueue_size=batch_size,
        num_epochs=num_epochs)
    if num_epochs is None:
      features = queue.dequeue_many(batch_size)
    else:
      features = queue.dequeue_up_to(batch_size)
    assert len(features) == len(x.columns) + 1, ('Features should have one '
                                                 'extra element for the index.')
    # The first dequeued element is the index; the rest line up with x.columns.
    features = features[1:]
    features = dict(zip(list(x.columns), features))
    if y is not None:
      # The leftover debug print of the feature dict was removed here.
      target = features.pop(target_column)
      return features, target
    return features
  return input_fn


In [10]:
def input_fn(X, y, mode, batch_size):
    """Build the estimator input function for training, evaluation or prediction.

    Args:
        X: DataFrame holding at least the columns listed in FEATURES. Missing
            values are replaced with 'unknown' in place, i.e. the caller's
            frame is modified.
        y: Series of labels aligned with X; not used when mode is 'predict'.
        mode: 'train', 'eval' or 'predict'.
        batch_size: number of rows per batch.

    Returns:
        A zero-argument callable suitable for the ``input_fn`` argument of the
        estimator's train / evaluate / predict methods.

    Raises:
        ValueError: if ``mode`` is not one of the supported names (previously
            the function silently returned None).
    """
    if mode not in ('train', 'eval', 'predict'):
        raise ValueError("mode must be 'train', 'eval' or 'predict'; got {!r}".format(mode))

    X.fillna(value='unknown', axis=1, inplace=True)

    if mode == 'train':
        # Training uses the tf.data pipeline instead of the queue-based pandas_input_fn.
        # batch_size is forwarded explicitly; it used to be dropped, which left the
        # pipeline on its default batch size of 100 regardless of the argument.
        return get_train_input_fn(X, FEATURES, Types, y, batchsize=batch_size)

    # Evaluation and prediction read the feature columns once, in order, on one thread.
    feature_frame = pd.DataFrame({k: X[k].values for k in FEATURES})

    if mode == 'eval':
        return tf.estimator.inputs.pandas_input_fn(x=feature_frame,
                                                   y=pd.Series(y.values),
                                                   batch_size=batch_size,
                                                   num_epochs=1,
                                                   shuffle=False,
                                                   num_threads=1,
                                                   target_column='target')

    # mode == 'predict': no labels are attached
    return tf.estimator.inputs.pandas_input_fn(x=feature_frame,
                                               batch_size=batch_size,
                                               num_epochs=1,
                                               shuffle=False,
                                               num_threads=1)
    

In [11]:
def make_generator(Input):
    """Return a zero-argument generator function that walks ``Input`` row by row.

    ``Input`` is a sequence of equally indexed sequences (one per column).
    Each call of the returned function starts a fresh generator that yields
    tuples holding the i-th element of every column; as with ``zip``,
    iteration ends with the shortest column.
    """
    def _rows():
        yield from zip(*Input)

    return _rows
def get_train_input_fn(X,FEATURES,Types,Y,batchsize = 100):
    """Build a training input function backed by a tf.data generator pipeline.

    Args:
        X: DataFrame holding the columns named in FEATURES.
        FEATURES: list of column names to feed, in order.
        Types: tuple of TensorFlow dtypes, one per feature followed by one for
            the label.
        Y: Series of labels aligned with X.
        batchsize: number of rows per batch.

    Returns:
        A zero-argument callable returning ``(features, labels)`` where
        ``features`` maps each feature name to a tensor of static shape
        ``(batchsize,)`` and ``labels`` has shape ``(batchsize, 1)``.
    """
    def train_input_fn():
        columns = [X[k].values for k in FEATURES]
        columns.append(Y.values)

        dataset = tf.data.Dataset.from_generator(make_generator(columns), Types)
        # Dataset transformations return a new dataset. The result of repeat() used to
        # be discarded, so the data was never repeated. Repeating before batching
        # also keeps every batch full, which the static shapes set below rely on.
        dataset = dataset.repeat()
        dataset = dataset.batch(batchsize)
        iterator = dataset.make_one_shot_iterator()

        tensors = list(iterator.get_next())
        labels = tensors[-1]

        # zip stops after the last feature, leaving the trailing label tensor out
        features = {}
        for name, tensor in zip(FEATURES, tensors):
            tensor.set_shape((batchsize,))
            features[name] = tensor

        labels.set_shape((batchsize,))
        return features, tf.expand_dims(labels, axis=1)
    return train_input_fn

In [12]:
def train_model(model_dir, model_type, train_steps, X_train, y_train, X_test, y_test, batch_size):
    """Build an estimator, train it and evaluate it once on a held-out set.

    Args:
        model_dir: directory for the model; a temporary directory is created
            when this is empty or None.
        model_type: model name understood by build_estimator.
        train_steps: passed to the estimator's train() as max_steps.
        X_train, y_train: training features and labels.
        X_test, y_test: validation features and labels.
        batch_size: batch size used for both training and evaluation.

    Returns:
        The trained estimator.
    """
    # Create a temporary directory to store the model if no model directory argument is given
    if not model_dir:
        model_dir = tempfile.mkdtemp()

    print('build_estimator')
    estimator = build_estimator(model_dir, model_type)

    print('train start')
    training_fn = input_fn(X_train, y_train, mode='train', batch_size=batch_size)
    estimator.train(input_fn=training_fn, max_steps=train_steps)

    # Evaluate the trained model on a separate validation set in n/batch_size steps
    validation_fn = input_fn(X_test, y_test, mode='eval', batch_size=batch_size)
    validation_steps = X_test.shape[0] // batch_size + 1
    estimator.evaluate(input_fn=validation_fn, steps=validation_steps)

    print('end!')

    return estimator
    

In [13]:
#Train the deep model for up to 600000 steps, using model/ as the estimator's model directory
#and the held-out validation split for the evaluation pass

deep_model = train_model(
    model_dir='model/',
    model_type='deep',
    train_steps=600000,
    X_train=train_set,
    y_train=y_train,
    X_test=val_set,
    y_test=y_val,
    batch_size=100,
)


NameError: name 'train_set' is not defined

In [14]:
#Evaluate the trained model on the training rows.
#1476 steps of 1000 rows read at most 1,476,000 rows.
#NOTE(review): confirm that this covers the intended part of train_set

train_eval_fn = input_fn(train_set, y_train, mode='eval', batch_size=1000)
results = deep_model.evaluate(input_fn=train_eval_fn, steps=1476)


NameError: name 'deep_model' is not defined

In [15]:
#Predict on the test set in batches of 10000 rows

predict_fn = input_fn(test_set, None, mode='predict', batch_size=10000)
predictions = deep_model.predict(input_fn=predict_fn)

#Keep the probability of the positive class (index 1) for every test row

submission = [row['probabilities'][1] for row in predictions]

#Pair the probabilities with the submission ids and write the submission file

submission_frame = pd.DataFrame(data={'id': ids, 'target': np.array(submission)})
submission_frame.to_csv('submissions/embedded_deep.csv',
                        header=['id', 'target'],
                        index=False)


NameError: name 'deep_model' is not defined