In [5]:
import pandas as pd

path_to_train = './data/train.csv'
path_to_test = './data/test.csv'

train = pd.read_csv(path_to_train, encoding='utf-8', usecols=['release_date'])
test = pd.read_csv(path_to_test, encoding='utf-8', usecols=['release_date'])

df = pd.concat([train, test], ignore_index=True)
df.tail()

Unnamed: 0,release_date
7578747,20161014
7578748,20161014
7578749,19910101
7578750,19910101
7578751,19910101


In [6]:
from datetime import date
import matplotlib.pyplot as plt
%matplotlib inline

def extract_date(date_int):
    date_str = str(date_int)
    year = int(date_str[0:4])
    month = int(date_str[4:6])
    day = int(date_str[6:8])
    return date(year, month, day)

df['release_date_dt'] = df.apply(lambda row: extract_date(row['release_date']),axis=1)
df.head()

Unnamed: 0,release_date,release_date_dt
0,20040704,2004-07-04
1,20060301,2006-03-01
2,20140714,2014-07-14
3,20001030,2000-10-30
4,20080215,2008-02-15


In [None]:
df.ix[df['release_date'] > 30000000,'release_date_dt'] = date(2000,1,1)
df.ix[df['release_date'] > 30000000,'release_date'] = 20000101

In [None]:
df['release_date_year'] = df.apply(lambda row: row['release_date_dt'].year, axis=1)
df['release_date_month'] = df.apply(lambda row: row['release_date_dt'].month, axis=1)
df['release_date_day'] = df.apply(lambda row: row['release_date_dt'].day, axis=1)

In [None]:
df['release_date_decade'] = df.apply(lambda row: int((row['release_date_dt'].year-1900)/10), axis=1)

In [None]:
df['release_date_decade'].head(10)

In [None]:
path_to_release_date_train = './data/features/release_date.train.txt'
path_to_release_date_test = './data/features/release_date.test.txt'
newcols = ['release_date_year','release_date_month','release_date_day','release_date_decade']

In [None]:
del df['release_date_dt']
del df['release_date']

In [None]:
train_dt = df.loc[range(len(train))]
train_dt.head()

In [None]:
test_dt = df.loc[range(len(train), len(train)+len(test))]
test_dt.head()

In [None]:
train_dt.to_csv(path_to_release_date_train, encoding='utf-8', index=False)

In [None]:
test_dt.to_csv(path_to_release_date_test, encoding='utf-8', index=False)

In [4]:
import validation as v
import pandas as pd

path_to_train = './data/train.csv'

train = pd.read_csv(path_to_train, encoding='utf-8')

train = pd.concat([train, pd.read_csv(path_to_release_date_train, encoding='utf-8')], axis=1)
train.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade
0,25471,1480597215,222606,41774,12,20040704,1,0,223,0,0,9241,55164,29,0,2004,7,4,10
1,25571,1480544735,250467,43941,0,20060301,2,1,171,0,0,16547,55830,30,1,2006,3,1,10
2,16,1479563953,305197,48078,1,20140714,2,1,149,1,1,7665,2704,29,1,2014,7,14,11
3,7,1480152098,900502,71521,0,20001030,0,0,240,0,1,1580,938,30,0,2000,10,30,10
4,7,1478368974,542335,71718,0,20080215,0,0,150,0,1,1812,2939,24,1,2008,2,15,10


In [None]:
path_to_folds = './data/lightgbm'
v.generate_validation_folds(train, path_to_folds)

In [4]:
import model_lgbm as mlgbm

cols = [
        'genre_id', 'media_id', 'album_id', 'context_type',
        'release_date', 'platform_name', 'platform_family', 'media_duration',
        'listen_type', 'user_gender', 'user_id', 'artist_id', 'user_age', 
       ] + newcols

params = {
    'application':'binary',
    'num_leaves': 31,
    'max_depth': 20,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'max_bin': 200,
    'metric': 'auc',
    'verbose': 1
    
}

In [5]:
evs = mlgbm.crossvalidate_model(path_to_folds,cols,params)

[1]	valid_0's auc: 0.558629
[2]	valid_0's auc: 0.556995
[3]	valid_0's auc: 0.558171
[4]	valid_0's auc: 0.560171
[5]	valid_0's auc: 0.560724
[6]	valid_0's auc: 0.558929
[7]	valid_0's auc: 0.561146
[8]	valid_0's auc: 0.561452
[9]	valid_0's auc: 0.564308
[10]	valid_0's auc: 0.564802
[11]	valid_0's auc: 0.565931
[12]	valid_0's auc: 0.567066
[13]	valid_0's auc: 0.567326
[14]	valid_0's auc: 0.566376
[15]	valid_0's auc: 0.566336
[16]	valid_0's auc: 0.566817
[17]	valid_0's auc: 0.566294
[18]	valid_0's auc: 0.566675
[19]	valid_0's auc: 0.567225
[20]	valid_0's auc: 0.567556
[21]	valid_0's auc: 0.568209
[22]	valid_0's auc: 0.568498
[23]	valid_0's auc: 0.569105
[24]	valid_0's auc: 0.570111
[25]	valid_0's auc: 0.570126
[26]	valid_0's auc: 0.569651
[27]	valid_0's auc: 0.570422
[28]	valid_0's auc: 0.570266
[29]	valid_0's auc: 0.570572
[30]	valid_0's auc: 0.57076
[31]	valid_0's auc: 0.571019
[32]	valid_0's auc: 0.571266
[33]	valid_0's auc: 0.571691
[34]	valid_0's auc: 0.571783
[35]	valid_0's auc: 0.57

KeyboardInterrupt: 

In [3]:
import pandas as pd

path_to_sample = './data/sample_submission_kaggle.csv'
path_to_submission = './data/submission.csv'
path_to_test = './data/test.csv'

test = pd.read_csv(path_to_test, encoding='utf-8')
test = pd.concat([test, pd.read_csv(path_to_release_date_test, encoding='utf-8')], axis=1)

test.head()

Unnamed: 0,sample_id,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,release_date_year,release_date_month,release_date_day,release_date_decade
0,0,50,1478104371,683078,82356,1,20021008,0,0,542,1,0,17698,2076,30,,,,
1,1,2744,1479317140,876497,99692,1,19851231,0,0,307,1,0,10525,26,28,,,,
2,2,2744,1479546361,876497,99692,1,19851231,0,0,307,1,0,8716,26,27,,,,
3,3,2744,1478457729,876500,99692,1,19851231,2,1,265,1,0,5443,26,30,,,,
4,4,2744,1480448560,876504,99692,1,19851231,2,1,356,1,0,7600,26,29,,,,
