In [8]:
import pandas as pd

# Locations of the raw train / test tables.
path_to_train = './data/train.csv'
path_to_test = './data/test.csv'

# Only the raw `release_date` column is needed for the date features built below.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8', usecols=['release_date'])
    for csv_path in (path_to_train, path_to_test)
)

# Stack train on top of test with a fresh 0..N-1 index so that both parts
# can later be recovered by position.
df = pd.concat([train, test], ignore_index=True)
df.tail()

Unnamed: 0,release_date
7578747,20161014
7578748,20161014
7578749,19910101
7578750,19910101
7578751,19910101


In [9]:
from datetime import date
import matplotlib.pyplot as plt
%matplotlib inline

def extract_date(date_int):
    """Convert a YYYYMMDD value into a ``datetime.date``.

    The value is stringified first, so both integers (e.g. ``20040704``) and
    digit strings are accepted. Characters 0-3 are the year, 4-5 the month and
    6-7 the day; anything past the eighth character is ignored.

    Raises ``ValueError`` if a piece is not numeric or the resulting
    year/month/day combination is not a valid calendar date.
    """
    digits = str(date_int)
    pieces = (digits[:4], digits[4:6], digits[6:8])
    return date(*map(int, pieces))

# Parse every YYYYMMDD integer into a datetime.date.
# The conversion only needs the single `release_date` column, so map over that
# Series directly instead of materialising a row object per record with
# df.apply(..., axis=1); the resulting column is the same.
df['release_date_dt'] = df['release_date'].map(extract_date)
df.head()

Unnamed: 0,release_date,release_date_dt
0,20040704,2004-07-04
1,20060301,2006-03-01
2,20140714,2014-07-14
3,20001030,2000-10-30
4,20080215,2008-02-15


In [10]:
# Values above 30000000 encode a release year of 3000 or later in the YYYYMMDD
# scheme (presumably bogus placeholders - confirm against the data source).
# Pin them to 2000-01-01 in both the parsed and the raw column.
# `.ix` is deprecated/removed in pandas; `.loc` with a boolean mask is the
# supported equivalent. The mask is computed once, before `release_date` itself
# is overwritten.
bad_release = df['release_date'] > 30000000
df.loc[bad_release, 'release_date_dt'] = date(2000, 1, 1)
df.loc[bad_release, 'release_date'] = 20000101

In [12]:
def _release_date_parts(row):
    """Return (year, month, day, decade index) for one row's `release_date_dt`.

    The decade index is the number of whole decades since 1900, truncated
    towards zero (e.g. 2004 -> 10, 1985 -> 8).
    """
    released = row['release_date_dt']
    decade = int((released.year - 1900) / 10)
    return (released.year, released.month, released.day, decade)


# One tuple of calendar features per row.
rl_f = df.apply(_release_date_parts, axis=1)

In [15]:
# Expand the per-row tuples into four named feature columns and attach them
# side by side to the main frame (both share the same 0..N-1 index).
release_feature_names = [
    'release_date_year',
    'release_date_month',
    'release_date_day',
    'release_date_decade',
]
ftrs = pd.DataFrame(rl_f.tolist(), columns=release_feature_names)
df = pd.concat([df, ftrs], axis=1)
df.tail(10)

Unnamed: 0,release_date,release_date_dt,release_date_year,release_date_month,release_date_day,release_date_decade
0,20040704,2004-07-04,2004,7,4,10
1,20060301,2006-03-01,2006,3,1,10
2,20140714,2014-07-14,2014,7,14,11
3,20001030,2000-10-30,2000,10,30,10
4,20080215,2008-02-15,2008,2,15,10
5,20080215,2008-02-15,2008,2,15,10
6,20080215,2008-02-15,2008,2,15,10
7,20080215,2008-02-15,2008,2,15,10
8,20080215,2008-02-15,2008,2,15,10
9,20080215,2008-02-15,2008,2,15,10


In [2]:
# Output files for the generated release-date features (one per data split).
path_to_release_date_train = './data/features/release_date.train.txt'
path_to_release_date_test = './data/features/release_date.test.txt'

# Feature columns that are written to those files.
newcols = [
    'release_date_year',
    'release_date_month',
    'release_date_day',
    'release_date_decade',
]

In [18]:
# The first len(train) index labels of the stacked frame are the train rows;
# pick those rows and the feature columns in a single .loc call.
train_dt = df.loc[range(len(train)), newcols]
train_dt.head()

Unnamed: 0,release_date_year,release_date_month,release_date_day,release_date_decade
0,2004,7,4,10
1,2006,3,1,10
2,2014,7,14,11
3,2000,10,30,10
4,2008,2,15,10


In [19]:
# The test rows follow directly after the train rows in the stacked frame.
first_test_label = len(train)
test_dt = df.loc[range(first_test_label, first_test_label + len(test)), newcols]
test_dt.head()

Unnamed: 0,release_date_year,release_date_month,release_date_day,release_date_decade
7558834,2002,10,8,10
7558835,1985,12,31,8
7558836,1985,12,31,8
7558837,1985,12,31,8
7558838,1985,12,31,8


In [20]:
# Persist the train-split features as CSV; the index is omitted, so rows are
# matched back to train.csv purely by position.
train_dt.to_csv(path_to_release_date_train, encoding='utf-8', index=False)

In [21]:
# Persist the test-split features as CSV; the index is omitted, so rows are
# matched back to test.csv purely by position.
test_dt.to_csv(path_to_release_date_test, encoding='utf-8', index=False)

------------------

In [4]:
# I also want to generate counts of how many tracks from the same release year
# and the same release decade the user had listened to before.
import pandas as pd

path_to_train = './data/train.csv'
path_to_test = './data/test.csv'

# Train: listening events plus the previously saved release-date features,
# aligned column-wise by row position.
train = pd.concat(
    [
        pd.read_csv(path_to_train, encoding='utf-8',
                    usecols=['user_id', 'ts_listen', 'is_listened']),
        pd.read_csv(path_to_release_date_train, encoding='utf-8'),
    ],
    axis=1
)

# Test has no `is_listened` column in the columns read here; fill it with 0
# so both splits share the same columns.
test = pd.read_csv(path_to_test, encoding='utf-8', usecols=['user_id', 'ts_listen'])
test['is_listened'] = 0
test = pd.concat(
    [test, pd.read_csv(path_to_release_date_test, encoding='utf-8')],
    axis=1
)

# Stack both splits (train first) under a fresh 0..N-1 index.
df = pd.concat([train, test], ignore_index=True)
df.tail()

Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade
7578747,1479664431,12641,0,2016,10,14,11
7578748,1480358099,10055,0,2016,10,14,11
7578749,1480616853,1029,0,1991,1,1,9
7578750,1479644510,4630,0,1991,1,1,9
7578751,1480491885,6467,0,1991,1,1,9


In [25]:
# Order rows per user, then per release decade, then chronologically, so that
# a running count inside each (user, decade) group follows listening time.
decade_sort_keys = ['user_id', 'release_date_decade', 'ts_listen']
df = df.sort_values(decade_sort_keys)
df.head()

Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade,release_date_listened_this_decade
829179,1478333098,0,1,1945,1,1,4,
829180,1478358099,0,1,1945,1,1,4,
829181,1478359301,0,1,1945,1,1,4,
829178,1478359883,0,1,1945,1,1,4,
829182,1478362962,0,1,1945,1,1,4,


In [27]:
# 1-based running position of each row within its (user, release decade) group,
# in the frame's current row order (the frame is expected to have been sorted by
# ts_listen within each group beforehand).
# Note: this counts every row of the group, regardless of the `is_listened` value.
# Grouping directly on the key columns replaces the manual shift/compare/cumsum
# run detection, which only yields per-group counts when equal keys are stored
# contiguously.
df['release_date_listened_this_decade'] = (
    df.groupby(['user_id', 'release_date_decade']).cumcount() + 1
)

df.head(20)

Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade,release_date_listened_this_decade
829179,1478333098,0,1,1945,1,1,4,1
829180,1478358099,0,1,1945,1,1,4,2
829181,1478359301,0,1,1945,1,1,4,3
829178,1478359883,0,1,1945,1,1,4,4
829182,1478362962,0,1,1945,1,1,4,5
829191,1478702580,0,1,1945,1,1,4,6
307318,1478295198,0,1,1958,9,24,5,1
6372249,1478371200,0,1,1958,12,31,5,2
303096,1478798499,0,1,1958,9,24,5,3
1247753,1478848107,0,1,1958,9,24,5,4


In [28]:
# Order rows per user, then per release year, then chronologically, so that
# a running count inside each (user, year) group follows listening time.
year_sort_keys = ['user_id', 'release_date_year', 'ts_listen']
df = df.sort_values(year_sort_keys)
df.head()

Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade,release_date_listened_this_decade
829179,1478333098,0,1,1945,1,1,4,1
829180,1478358099,0,1,1945,1,1,4,2
829181,1478359301,0,1,1945,1,1,4,3
829178,1478359883,0,1,1945,1,1,4,4
829182,1478362962,0,1,1945,1,1,4,5


In [29]:
# 1-based running position of each row within its (user, release year) group,
# in the frame's current row order (the frame is expected to have been sorted by
# ts_listen within each group beforehand).
# Note: this counts every row of the group, regardless of the `is_listened` value.
# Grouping directly on the key columns replaces the manual shift/compare/cumsum
# run detection, which only yields per-group counts when equal keys are stored
# contiguously.
df['release_date_listened_this_year'] = (
    df.groupby(['user_id', 'release_date_year']).cumcount() + 1
)

df.head(20)

Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade,release_date_listened_this_decade,release_date_listened_this_year
829179,1478333098,0,1,1945,1,1,4,1,1
829180,1478358099,0,1,1945,1,1,4,2,2
829181,1478359301,0,1,1945,1,1,4,3,3
829178,1478359883,0,1,1945,1,1,4,4,4
829182,1478362962,0,1,1945,1,1,4,5,5
829191,1478702580,0,1,1945,1,1,4,6,6
307318,1478295198,0,1,1958,9,24,5,1,1
6372249,1478371200,0,1,1958,12,31,5,2,2
303096,1478798499,0,1,1958,9,24,5,3,3
1247753,1478848107,0,1,1958,9,24,5,4,4


In [30]:
# Total number of rows per user (train and test together).
user_count = df.groupby('user_id', as_index=False).agg({'ts_listen': 'count'})
user_count = user_count.rename(columns={'ts_listen': 'count'})

# DataFrame.join(on=...) matches the caller's column against the *index* of the
# other frame. With the default 0..n-1 index this only lines up when user ids
# happen to be exactly 0..n-1, so index the lookup table by the real user id.
# The `user_id` column is kept so the join still produces `user_id_r`, which a
# later cell deletes.
user_count.index = user_count['user_id'].values

df = df.join(user_count, on='user_id', rsuffix='_r')
df.head()

Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade,release_date_listened_this_decade,release_date_listened_this_year,user_id_r,count
0,1480597215,9241,0,2004,7,4,10,78,32,9241,227
1,1480544735,16547,1,2006,3,1,10,4,1,16547,41
2,1479563953,7665,1,2014,7,14,11,157,20,7665,303
3,1480152098,1580,0,2000,10,30,10,42,3,1580,1076
4,1478368974,1812,1,2008,2,15,10,47,5,1812,999


In [31]:
# Express both running counts as a fraction of the user's total number of rows.
for ratio_col in ('release_date_listened_this_decade', 'release_date_listened_this_year'):
    df[ratio_col] = df[ratio_col] / df['count']

# The columns brought in by the per-user count join are no longer needed.
for helper_col in ('user_id_r', 'count'):
    del df[helper_col]

In [32]:
# Undo the per-user sorts: order rows by their index labels again so that
# train rows come first and the label-range split below lines up with the inputs.
df = df.sort_index()

In [33]:
# Spot-check the last rows after normalisation and re-ordering.
df.tail()

Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade,release_date_listened_this_decade,release_date_listened_this_year
7578747,1479664431,12641,0,2016,10,14,11,0.87069,0.525862
7578748,1480358099,10055,0,2016,10,14,11,0.979487,0.779487
7578749,1480616853,1029,0,1991,1,1,9,0.152223,0.001507
7578750,1479644510,4630,0,1991,1,1,9,0.05283,0.001887
7578751,1480491885,6467,0,1991,1,1,9,0.055703,0.002653


In [1]:
# Output files for the release-date features (one per data split).
path_to_release_date_train = './data/features/release_date.train.txt'
path_to_release_date_test = './data/features/release_date.test.txt'

# Calendar features plus the two per-user running-fraction features.
newcols = [
    'release_date_year',
    'release_date_month',
    'release_date_day',
    'release_date_decade',
    'release_date_listened_this_decade',
    'release_date_listened_this_year',
]

In [None]:
# Split the stacked frame back into its train / test parts by index label and
# write the feature columns of each part to its own CSV (index omitted).
n_train = len(train)
n_total = n_train + len(test)

train_dt = df.loc[range(n_train), newcols]
train_dt.to_csv(path_to_release_date_train, encoding='utf-8', index=False)

test_dt = df.loc[range(n_train, n_total), newcols]
test_dt.to_csv(path_to_release_date_test, encoding='utf-8', index=False)

----------------

In [2]:
import validation as v
import pandas as pd

path_to_train = './data/train.csv'

# Full training table with the saved release-date features appended as extra
# columns; the two files are aligned purely by row position.
train = pd.concat(
    [
        pd.read_csv(path_to_train, encoding='utf-8'),
        pd.read_csv(path_to_release_date_train, encoding='utf-8'),
    ],
    axis=1
)
train.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,user_id,artist_id,user_age,is_listened,release_date_year,release_date_month,release_date_day,release_date_decade,release_date_listened_this_decade,release_date_listened_this_year
0,25471,1480597215,222606,41774,12,20040704,1,0,223,0,...,9241,55164,29,0,2004,7,4,10,0.343612,0.140969
1,25571,1480544735,250467,43941,0,20060301,2,1,171,0,...,16547,55830,30,1,2006,3,1,10,0.097561,0.02439
2,16,1479563953,305197,48078,1,20140714,2,1,149,1,...,7665,2704,29,1,2014,7,14,11,0.518152,0.066007
3,7,1480152098,900502,71521,0,20001030,0,0,240,0,...,1580,938,30,0,2000,10,30,10,0.039033,0.002788
4,7,1478368974,542335,71718,0,20080215,0,0,150,0,...,1812,2939,24,1,2008,2,15,10,0.047047,0.005005


In [3]:
# Directory handed to the validation / LightGBM helpers for the fold data.
path_to_folds = './data/lightgbm'

In [4]:
# Build the validation folds from the feature-augmented training frame.
# NOTE(review): generate_validation_folds lives in the project's `validation`
# module (not shown here); presumably it writes the folds under path_to_folds - confirm.
v.generate_validation_folds(train, path_to_folds)

Fold 1


In [5]:
import model_lgbm as mlgbm

# Raw input columns used as model features; the generated release-date
# features (`newcols`) are appended after them.
cols = [
    'genre_id',
    'media_id',
    'album_id',
    'context_type',
    'release_date',
    'platform_name',
    'platform_family',
    'media_duration',
    'listen_type',
    'user_gender',
    'user_id',
    'artist_id',
    'user_age',
] + newcols

# Settings handed to the project's LightGBM helpers (model_lgbm).
params = dict(
    application='binary',
    num_leaves=31,
    max_depth=20,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    max_bin=200,
    metric='auc',
    verbose=1
)

In [6]:
# Cross-validate on the generated folds with the selected columns and parameters.
# NOTE(review): crossvalidate_model is defined in the project's `model_lgbm`
# module (not shown here); the meaning of validate_on_both should be confirmed there.
evs = mlgbm.crossvalidate_model(path_to_folds,cols,params,validate_on_both=True)

[1]	training's auc: 0.672938	valid_1's auc: 0.558629
[2]	training's auc: 0.676314	valid_1's auc: 0.564966
[3]	training's auc: 0.677163	valid_1's auc: 0.56659
[4]	training's auc: 0.678667	valid_1's auc: 0.56105
[5]	training's auc: 0.679728	valid_1's auc: 0.559787
[6]	training's auc: 0.680124	valid_1's auc: 0.56142
[7]	training's auc: 0.681024	valid_1's auc: 0.564091
[8]	training's auc: 0.680954	valid_1's auc: 0.564038
[9]	training's auc: 0.6811	valid_1's auc: 0.56343
[10]	training's auc: 0.681078	valid_1's auc: 0.562148
[11]	training's auc: 0.681641	valid_1's auc: 0.562456
[12]	training's auc: 0.682176	valid_1's auc: 0.563342
[13]	training's auc: 0.68236	valid_1's auc: 0.562852
[14]	training's auc: 0.683178	valid_1's auc: 0.563253
[15]	training's auc: 0.683953	valid_1's auc: 0.564359
[16]	training's auc: 0.684275	valid_1's auc: 0.565012
[17]	training's auc: 0.684884	valid_1's auc: 0.565649
[18]	training's auc: 0.685483	valid_1's auc: 0.566092
[19]	training's auc: 0.686013	valid_1's auc:

KeyboardInterrupt: 

In [8]:
import pandas as pd

# Files involved in producing the submission.
path_to_sample = './data/sample_submission_kaggle.csv'
path_to_submission = './data/submission.csv'
path_to_test = './data/test.csv'

# Full test table with the saved release-date features appended as extra
# columns; the two files are aligned purely by row position.
test = pd.concat(
    [
        pd.read_csv(path_to_test, encoding='utf-8'),
        pd.read_csv(path_to_release_date_test, encoding='utf-8'),
    ],
    axis=1
)

test.head()

Unnamed: 0,sample_id,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,...,user_gender,user_id,artist_id,user_age,release_date_year,release_date_month,release_date_day,release_date_decade,release_date_listened_this_decade,release_date_listened_this_year
0,0,50,1478104371,683078,82356,1,20021008,0,0,542,...,0,17698,2076,30,2002,10,8,10,0.538462,0.115385
1,1,2744,1479317140,876497,99692,1,19851231,0,0,307,...,0,10525,26,28,1985,12,31,8,0.016854,0.011236
2,2,2744,1479546361,876497,99692,1,19851231,0,0,307,...,0,8716,26,27,1985,12,31,8,0.004,0.004
3,3,2744,1478457729,876500,99692,1,19851231,2,1,265,...,0,5443,26,30,1985,12,31,8,0.044053,0.008811
4,4,2744,1480448560,876504,99692,1,19851231,2,1,356,...,0,7600,26,29,1985,12,31,8,0.094771,0.068627


In [10]:
# Produce the submission from the train and test frames.
# NOTE(review): create_submission is defined in the project's `model_lgbm` module
# (not shown here); its return value `fs` is passed to feature_score in a later cell.
fs = mlgbm.create_submission(train, test, path_to_sample, params, cols, path_to_submission)

In [12]:
# Display the per-feature scores for the columns used by the model
# (the cell output lists (column, score) pairs in descending score order).
mlgbm.feature_score(fs,cols)

[('user_id', 6422),
 ('user_age', 3412),
 ('context_type', 3310),
 ('artist_id', 2079),
 ('release_date_listened_this_decade', 1631),
 ('release_date', 1625),
 ('release_date_listened_this_year', 1458),
 ('media_duration', 1284),
 ('media_id', 1283),
 ('genre_id', 1238),
 ('platform_name', 1132),
 ('album_id', 1038),
 ('user_gender', 929),
 ('platform_family', 849),
 ('release_date_day', 784),
 ('listen_type', 762),
 ('release_date_month', 415),
 ('release_date_year', 329),
 ('release_date_decade', 20)]