In [1]:
import tempfile

import pandas as pd
import csv
import numpy as np
import tensorflow as tf


  return f(*args, **kwds)


In [4]:
#Load data and reduce the number of features for the baseline models

train = pd.read_csv('dataset/train.csv', dtype={'source_system_tab': str})
test = pd.read_csv('dataset/test.csv', dtype={'source_system_tab': str })
members = pd.read_csv('dataset/members.csv', dtype={'msno': str, 'city': str, 'registered_via': str})
songs = pd.read_csv('dataset/songs.csv', dtype={'genre_ids': str, 'language': str, 'song_length': int})

songs.drop(['composer', 'lyricist'], axis=1, inplace=True)

#Infer a missing value based on other features

songs.loc[605127, 'language'] = '31.0'

#Impute missing values

train.fillna(value='unknown', axis=1, inplace=True)
test.fillna(value='unknown', axis=1, inplace=True)
members.fillna(value='unknown', axis=1, inplace=True)
songs.fillna(value='unknown', axis=1, inplace=True)


In [3]:
#members['registration_init_time'].sort_values()
#members['expiration_date']

In [5]:
#Create lists of genre ids in the genre id column

#genres = songs['genre_ids'].str.split('|')

#Create a dataframe that stores genre IDs across multiple columns (one genre per column)

#genres = genres.apply(pd.Series).add_prefix('genre_')
#genres.to_csv('dataset/genres.csv', index=False)

genres = pd.read_csv('dataset/genres.csv', dtype=str)


In [6]:
#Merge the training and test data with song and member data

train_set = train.merge(songs, on='song_id')
train_set = train_set.merge(members, on='msno')
test_set = test.merge(songs, on='song_id', how='left')
test_set = test_set.merge(members, on='msno', how='left')

#Separate the submission ids from the test set

ids = test_set['id']
test_set.drop('id', axis=1, inplace=True)

#Impute missing values in merged training and test sets

train_set.fillna(value='unknown', axis=1, inplace=True)
test_set.fillna(value='unknown', axis=1, inplace=True)

#Impute missing song lengths with an integer value to avoid errors due to conflicting data types

test_set['song_length'] = test_set['song_length'].replace('unknown', -99)


In [7]:
#Shuffle the data and split off 20% of the training set for use as a validation set

split_ratio = 0.8

train_set = train_set.sample(frac=1, random_state=6)
val_set = train_set[int(split_ratio*train_set.shape[0]):]
train_set = train_set[:int(split_ratio*train_set.shape[0])]

#Separate the labels from the training and validation sets

y_train = train_set['target']
train_set.drop('target', axis=1, inplace=True)

y_val = val_set['target']
val_set.drop('target', axis=1, inplace=True)


In [8]:
#Designate the target feature name and the features to be used in the dataset

FEATURES = ['msno', 'gender', 'city', 'bd', 'registered_via',
            'song_id', 'artist_name', 'song_length', 'language', 'genre_ids',
            'source_system_tab', 'source_screen_name', 'source_type']

LABEL = 'target'

#Use the feature_column module to input each feature column into the model

target = tf.feature_column.categorical_column_with_identity(key='target', num_buckets=2)

registered = tf.feature_column.categorical_column_with_vocabulary_list(key='registered_via',
                                                                       vocabulary_list=['7', '4', '9', '3', '13', '16'],
                                                                       dtype=tf.string,
                                                                       default_value=-99)

gender = tf.feature_column.categorical_column_with_vocabulary_list(key='gender',
                                                                   vocabulary_list=('female', 'male', 'unknown'),
                                                                   dtype=tf.string,
                                                                   default_value=-99)

city = tf.feature_column.categorical_column_with_vocabulary_list(key='city',
                                                          vocabulary_list=members['city'].unique(),
                                                          dtype=tf.string,
                                                          default_value=-99)

language = tf.feature_column.categorical_column_with_vocabulary_list(key='language',
                                                                     vocabulary_list=songs['language'].unique(),
                                                                     dtype=tf.string,
                                                                     default_value=-99)

artist = tf.feature_column.categorical_column_with_vocabulary_list(key='artist_name',
                                                                   vocabulary_list=songs['artist_name'].unique(),
                                                                   dtype=tf.string,
                                                                   default_value=-99)

tab = tf.feature_column.categorical_column_with_vocabulary_list(key='source_system_tab',
                                                                vocabulary_list=train['source_system_tab'].unique(),
                                                                dtype=tf.string,
                                                                default_value=-99)

screen = tf.feature_column.categorical_column_with_vocabulary_list(key='source_screen_name',
                                                                   vocabulary_list=train['source_screen_name'].unique(),
                                                                   dtype=tf.string,
                                                                   default_value=-99)

source = tf.feature_column.categorical_column_with_vocabulary_list(key='source_type',
                                                                   vocabulary_list=train['source_type'].unique(),
                                                                   dtype=tf.string,
                                                                   default_value=-99)

length = tf.feature_column.numeric_column(key='song_length',
                                          default_value=-1,
                                          dtype=tf.int32)

#Bucket categorical features with many unique categories using a hash table with a size of approximately (n/0.8)*2

msno = tf.feature_column.categorical_column_with_hash_bucket(key='msno',
                                                               hash_bucket_size=90000,
                                                               dtype=tf.string)

song_id = tf.feature_column.categorical_column_with_hash_bucket(key='song_id',
                                                             hash_bucket_size=6000000,
                                                             dtype=tf.string)

genre = tf.feature_column.categorical_column_with_vocabulary_list(key='genre_ids',
                                                                  vocabulary_list=genres['genre_0'].unique(),
                                                                  dtype=tf.string,
                                                                  default_value=-99)

hashed_genre = tf.feature_column.categorical_column_with_hash_bucket(key='genre_ids',
                                                                     hash_bucket_size=3000,
                                                                     dtype=tf.string)

#Perform one hot encoding on categorical features with few unique values

indicator_registered = tf.feature_column.indicator_column(registered)
indicator_gender = tf.feature_column.indicator_column(gender)
indicator_city = tf.feature_column.indicator_column(city)
indicator_genre = tf.feature_column.indicator_column(genre)
indicator_language = tf.feature_column.indicator_column(language)
indicator_tab = tf.feature_column.indicator_column(tab)
indicator_screen = tf.feature_column.indicator_column(screen)
indicator_source = tf.feature_column.indicator_column(source)

#Embed the categorical feature with <100 unique categories into dense vectors with approximately log2(n) dimensions

embedded_genre = tf.feature_column.embedding_column(genre, dimension=10)
embedded_song = tf.feature_column.embedding_column(song_id, dimension=22)
embedded_msno = tf.feature_column.embedding_column(msno, dimension=15)
embedded_artist = tf.feature_column.embedding_column(artist, dimension=18)

#Bucket member age into age ranges, with nonsensical values going into the 0-14 or the >80 buckets

age = tf.feature_column.numeric_column(key='bd',
                                       default_value=0,
                                       dtype=tf.int32)

age_bucket = tf.feature_column.bucketized_column(age, boundaries=[0, 14, 20, 30, 40, 50, 80])

#Assign features to be used in either the wide or the deep model (or both)

wide_columns = []
cross_columns = []
deep_columns = [
                indicator_gender, indicator_city, indicator_language, indicator_tab,
                indicator_screen, indicator_source, indicator_registered,
                embedded_msno, embedded_song, embedded_artist, embedded_genre,
                #length, age_bucket
                ]


In [9]:
def build_estimator(model_dir, model_type):
    if model_type == 'wide':
        model = tf.estimator.LinearClassifier(model_dir=model_dir,
                                              feature_columns=wide_columns + cross_columns)

    elif model_type == 'deep':
        model = tf.estimator.DNNClassifier(model_dir=model_dir,
                                           feature_columns=deep_columns,
                                           hidden_units=[1024, 512, 256],
                                           optimizer=tf.train.AdamOptimizer(learning_rate=0.001,
                                                                            name='Adam'))

    elif model_type == 'combined':
        model = tf.estimator.DNNLinearCombinedClassifier(model_dir=model_dir,
                                                         linear_feature_columns=cross_columns,
                                                         dnn_feature_columns=deep_columns,
                                                         dnn_hidden_units=[100, 50])

    return model


In [10]:
def input_fn(X, y, mode, batch_size):
    print(X.shape)
    X.fillna(value='unknown', axis=1, inplace=True)    

    if mode == 'train':
        return tf.estimator.inputs.pandas_input_fn(x=pd.DataFrame({k: X[k].values for k in FEATURES}),
                                                   y=pd.Series(y.values),
                                                   batch_size=batch_size,
                                                   num_epochs=None,
                                                   shuffle=True,
                                                   num_threads=8,
                                                   target_column='target')
    
    elif mode == 'eval':
        return tf.estimator.inputs.pandas_input_fn(x = pd.DataFrame({k: X[k].values for k in FEATURES}),
                                                   y = pd.Series(y.values),
                                                   batch_size=batch_size,
                                                   num_epochs=1,
                                                   shuffle=False,
                                                   num_threads=1,
                                                   target_column='target')
    
    elif mode == 'predict':
        return tf.estimator.inputs.pandas_input_fn(x=pd.DataFrame({k: X[k].values for k in FEATURES}),
                                                   batch_size=batch_size,
                                                   num_epochs=1,
                                                   shuffle=False,
                                                   num_threads=1)
    

In [11]:
def train_model(model_dir, model_type, train_steps, X_train, y_train, X_test, y_test, batch_size):

#Create a temporary directory to store the model if no model directory argument is given

    model_dir = tempfile.mkdtemp() if not model_dir else model_dir
    
    print('build_estimator')
    model = build_estimator(model_dir, model_type)
    
    print('train start')
    model.train(input_fn=input_fn(X_train, y_train, mode='train', batch_size=batch_size),
                max_steps=train_steps)
    
#Evaluate the trained model on a separate validation set in n/batch_size steps
    
    model.evaluate(input_fn=input_fn(X_test, y_test, mode='eval', batch_size=batch_size),
                        steps=(X_test.shape[0]//batch_size + 1))

    print('end!')
    
    return model
    

In [None]:
deep_model = train_model(model_dir='model/', model_type='deep', train_steps=500000,
                         X_train=train_set, y_train=y_train,
                         X_test=val_set, y_test=y_val,
                         batch_size=100)


build_estimator
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'model/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f36fc1a5400>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
train start
(5901843, 15)
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into model/model.ckpt.
INFO:tensorflow:loss = 69.5449, step = 1
INFO:tensorflow:global_step/sec: 11.1373
INFO:tensorflow:loss = 63.3456, step = 101 (8.980 sec)
INFO:tensorflow:global_step/sec: 11.4524
INFO:tensorflow:loss = 67.5988, step = 201 (8.732 sec)
INFO:tensorflow:global_step/sec: 11.4179
INFO:te

In [11]:
results = deep_model.evaluate(input_fn=input_fn(train_set, y_train, mode='eval', batch_size=1000),
                              steps=1476)


INFO:tensorflow:Starting evaluation at 2017-11-21-09:56:17
INFO:tensorflow:Restoring parameters from model/model.ckpt-600000
INFO:tensorflow:Evaluation [1/1476]
INFO:tensorflow:Evaluation [2/1476]
INFO:tensorflow:Evaluation [3/1476]
INFO:tensorflow:Evaluation [4/1476]
INFO:tensorflow:Evaluation [5/1476]
INFO:tensorflow:Evaluation [6/1476]
INFO:tensorflow:Evaluation [7/1476]
INFO:tensorflow:Evaluation [8/1476]
INFO:tensorflow:Evaluation [9/1476]
INFO:tensorflow:Evaluation [10/1476]
INFO:tensorflow:Evaluation [11/1476]
INFO:tensorflow:Evaluation [12/1476]
INFO:tensorflow:Evaluation [13/1476]
INFO:tensorflow:Evaluation [14/1476]
INFO:tensorflow:Evaluation [15/1476]
INFO:tensorflow:Evaluation [16/1476]
INFO:tensorflow:Evaluation [17/1476]
INFO:tensorflow:Evaluation [18/1476]
INFO:tensorflow:Evaluation [19/1476]
INFO:tensorflow:Evaluation [20/1476]
INFO:tensorflow:Evaluation [21/1476]
INFO:tensorflow:Evaluation [22/1476]
INFO:tensorflow:Evaluation [23/1476]
INFO:tensorflow:Evaluation [24/14

In [12]:
predictions = deep_model.predict(input_fn=input_fn(test_set, None, mode='predict',
                                                   batch_size=10000))

submission = list()

for row in predictions:
    submission.append(row['probabilities'][1])

pd.DataFrame(data={'id': ids,
                   'target': np.array(submission)}).to_csv('submissions/benchmark_deep.csv',
                                                           header=['id', 'target'],
                                                           index=False)


INFO:tensorflow:Restoring parameters from model/model.ckpt-600000
