In [1]:
import pandas as pd

# Locations of the raw listening logs.
path_to_train = './data/train.csv'
path_to_test = './data/test.csv'

# Columns used further down as grouping keys (together with user_id)
# for the cumulative-mean features.
categorical_features = [
    'listen_type',
    'genre_id',
    'album_id',
    'context_type',
    'platform_name',
    'artist_id',
]

train = pd.read_csv(
    path_to_train,
    encoding='utf-8',
    usecols=['user_id', 'ts_listen', 'is_listened'] + categorical_features,
)

# The test file is read without `is_listened`; a constant 0 column is appended
# purely so that train and test can be stacked into one frame below.
# (Author's note: it is only a placeholder -- the preceding values are what get used.)
test = pd.read_csv(
    path_to_test,
    encoding='utf-8',
    usecols=['user_id', 'ts_listen'] + categorical_features,
).assign(is_listened=0)

# Pre-computed time features derived from ts_listen, stored one file per split.
path_to_train_ts_listen = './data/features/ts_listen.train.csv'
path_to_test_ts_listen = './data/features/ts_listen.test.csv'
categorical_features_time = [
    'ts_listen_dayofweek',
    'ts_listen_hour',
]

train_ts_listen = pd.read_csv(
    path_to_train_ts_listen, encoding='utf-8', usecols=categorical_features_time
)
test_ts_listen = pd.read_csv(
    path_to_test_ts_listen, encoding='utf-8', usecols=categorical_features_time
)

# Attach the time features column-wise (aligned on the row index of each split).
train = pd.concat([train, train_ts_listen], axis=1)
test = pd.concat([test, test_ts_listen], axis=1)

# Single frame: all train rows first, then all test rows, with a fresh 0..N-1 index.
df = pd.concat([train, test], ignore_index=True)
df.tail()

Unnamed: 0,genre_id,ts_listen,album_id,context_type,platform_name,listen_type,user_id,artist_id,is_listened,ts_listen_dayofweek,ts_listen_hour
7578747,0,1479664431,14226364,5,0,1,12641,9285528,0,6,17
7578748,0,1480358099,14226364,1,0,1,10055,9285528,0,0,18
7578749,0,1480616853,14581358,1,0,1,1029,129,0,3,18
7578750,0,1479644510,14581358,1,0,1,4630,129,0,6,12
7578751,0,1480491885,14581358,1,1,1,6467,129,0,2,7


In [7]:
def mean_cum_by_category_by_user(df, category):
    """Add a per-user, per-category running mean of past ``is_listened`` values.

    For every row, the new column holds the mean of ``is_listened`` over the
    *earlier* rows (in ``ts_listen`` order) that share the same ``user_id`` and
    the same value of ``category``. The row's own ``is_listened`` is never
    included. Rows that have no earlier row in their (user_id, category) group
    get 0.0.

    Side effects: ``df`` is modified in place -- it is re-sorted by
    ``['user_id', 'ts_listen', category]`` and gains the column
    ``category + "_cumulative_mean"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``ts_listen``, ``is_listened``
        and the column named by ``category``.
    category : str
        Name of the column to group by together with ``user_id``.

    Returns
    -------
    str
        Name of the column that was added to ``df``.
    """
    # Chronological order per user; `category` only breaks ties between equal timestamps.
    df.sort_values(by=['user_id', 'ts_listen', category], axis=0, inplace=True)

    listened_by_group = df.groupby(['user_id', category], sort=False)['is_listened']

    # Statistics over the strictly earlier rows of the same group:
    # running total minus the current row, and the 0-based position in the group.
    prior_sum = listened_by_group.cumsum() - df['is_listened']
    prior_count = listened_by_group.cumcount()

    column_name = category + "_cumulative_mean"

    # After the sort above, rows of different categories are interleaved within
    # a user, so a global shift of the running mean would hand a row the
    # statistic of whichever group the previous row belonged to. Working from
    # per-group prior totals avoids that. A prior count of 0 (first row of a
    # group) is masked to NaN before dividing and then filled with 0.0.
    df[column_name] = (prior_sum / prior_count.where(prior_count > 0)).fillna(0.0)

    return column_name

In [5]:
# One "<category>_cumulative_mean" column per categorical feature, the
# time-derived ones included; `new_cols` keeps the generated column names
# in the order they were created.
new_cols = list(map(
    lambda category: mean_cum_by_category_by_user(df, category),
    categorical_features + categorical_features_time,
))
df.head()

Unnamed: 0,genre_id,ts_listen,album_id,context_type,platform_name,listen_type,user_id,artist_id,is_listened,ts_listen_dayofweek,ts_listen_hour,listen_type_cumulative_mean,genre_id_cumulative_mean,album_id_cumulative_mean,context_type_cumulative_mean,platform_name_cumulative_mean,artist_id_cumulative_mean,ts_listen_dayofweek_cumulative_mean,ts_listen_hour_cumulative_mean
682493,0,1477939775,12811130,0,0,0,0,7508018,1,0,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4454939,0,1477982531,12920172,0,0,0,0,1035779,0,1,6,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
5354523,0,1477982561,13947748,0,0,0,0,4331004,0,1,6,0.5,0.5,0.0,0.5,0.5,0.0,0.0,0.0
5899458,0,1477982582,13976722,0,0,0,0,3469,1,1,6,0.333333,0.333333,0.0,0.333333,0.333333,0.0,0.0,0.0
4716574,0,1477982792,13640610,0,0,0,0,282118,1,1,6,0.5,0.5,0.0,0.5,0.5,0.0,0.333333,0.333333


In [6]:
# mean_cum_by_category_by_user re-sorts `df` by user and time; put the rows
# back in index order (train rows first, then test rows).
df = df.sort_index()
df.tail()

Unnamed: 0,genre_id,ts_listen,album_id,context_type,platform_name,listen_type,user_id,artist_id,is_listened,ts_listen_dayofweek,ts_listen_hour,listen_type_cumulative_mean,genre_id_cumulative_mean,album_id_cumulative_mean,context_type_cumulative_mean,platform_name_cumulative_mean,artist_id_cumulative_mean,ts_listen_dayofweek_cumulative_mean,ts_listen_hour_cumulative_mean
7578747,0,1479664431,14226364,5,0,1,12641,9285528,0,6,17,0.513514,0.176471,0.0,0.0,0.208696,0.0,0.2,0.428571
7578748,0,1480358099,14226364,1,0,1,10055,9285528,0,0,18,0.75,1.0,1.0,1.0,0.967742,1.0,1.0,0.970588
7578749,0,1480616853,14581358,1,0,1,1029,129,0,3,18,0.822097,0.911032,0.0,0.822097,0.875566,0.0,0.843902,0.95
7578750,0,1479644510,14581358,1,0,1,4630,129,0,6,12,0.904372,0.9,0.0,0.84058,0.846914,0.0,0.880952,0.878049
7578751,0,1480491885,14581358,1,1,1,6467,129,0,2,7,0.932203,0.887324,0.0,0.932203,0.925532,0.0,0.928571,0.925926


In [7]:
path_to_test_cum_mean = './data/features/cum_mean.test.csv'
path_to_train_cum_mean = './data/features/cum_mean.train.csv'

# `df` was built as train rows followed by test rows with a fresh 0..N-1 index,
# so the positional split below is only valid in index order and with an
# unchanged row count. mean_cum_by_category_by_user re-sorts `df` in place,
# so the order is restored here instead of relying on another cell having run.
assert len(df) == len(train) + len(test), "df no longer has one row per train/test row"
cum_mean_features = df[new_cols].sort_index()

test_cum_mean = cum_mean_features.iloc[len(train):len(train) + len(test)]
test_cum_mean.to_csv(path_to_test_cum_mean, encoding='utf-8', index=False)

train_cum_mean = cum_mean_features.iloc[:len(train)]
train_cum_mean.to_csv(path_to_train_cum_mean, encoding='utf-8', index=False)

------

More features

In [4]:
import pandas as pd

# Locations of the raw listening logs.
path_to_train = './data/train.csv'
path_to_test = './data/test.csv'

train = pd.read_csv(
    path_to_train,
    encoding='utf-8',
    usecols=['user_id', 'ts_listen', 'media_id', 'is_listened'],
)

# The test file is read without `is_listened`; a constant 0 column is appended
# purely so that train and test can be stacked into one frame below.
# (Author's note: it is only a placeholder -- the preceding values are what get used.)
test = pd.read_csv(
    path_to_test,
    encoding='utf-8',
    usecols=['user_id', 'ts_listen', 'media_id'],
).assign(is_listened=0)

# Pre-computed release-date features, stored one file per split.
path_to_train_release_date = './data/features/release_date.train.txt'
path_to_test_release_date = './data/features/release_date.test.txt'
categorical_features_release_date = [
    'release_date_decade',
    'release_date_year',
]

train_release_date = pd.read_csv(
    path_to_train_release_date, encoding='utf-8', usecols=categorical_features_release_date
)
test_release_date = pd.read_csv(
    path_to_test_release_date, encoding='utf-8', usecols=categorical_features_release_date
)

# Attach the release-date features column-wise (aligned on the row index of each split).
train = pd.concat([train, train_release_date], axis=1)
test = pd.concat([test, test_release_date], axis=1)

# Single frame: all train rows first, then all test rows, with a fresh 0..N-1 index.
df = pd.concat([train, test], ignore_index=True)
df.tail()

Unnamed: 0,ts_listen,media_id,user_id,is_listened,release_date_year,release_date_decade
7578747,1479664431,133549164,12641,0,2016,11
7578748,1480358099,133549164,10055,0,2016,11
7578749,1480616853,136334560,1029,0,1991,9
7578750,1479644510,136334560,4630,0,1991,9
7578751,1480491885,136334560,6467,0,1991,9


In [6]:
# Track info categorical features (ExplicitLyrics, Isrc_Country, Isrc_Label),
# looked up by media id.
# The path gets its own name so that `path_to_test_release_date` keeps pointing
# at the release-date file instead of being overwritten with an unrelated path.
path_to_track_info = './data/features/all_media_info.csv'
track_info = pd.read_csv(path_to_track_info, encoding='utf-8', usecols=['Id', 'ExplicitLyrics', 'Isrc_Country', 'Isrc_Label'])

# A left merge keeps every row of `df` in its original order, but it would
# duplicate rows if `Id` were not unique in track_info. Later cells split `df`
# back into train/test by position, so the row count must not change.
rows_before_merge = len(df)
df = pd.merge(df, track_info, left_on='media_id', right_on='Id', how='left', sort=False)
assert len(df) == rows_before_merge, "track_info has duplicate 'Id' values: the left merge changed the number of rows"

# The join keys are not needed as features.
# NOTE: this makes the cell non-re-runnable without rebuilding `df` first.
del df['Id']
del df['media_id']
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_decade,ExplicitLyrics,Isrc_Country,Isrc_Label
0,1480597215,9241,0,2004,10,False,FR,6V8
1,1480544735,16547,1,2006,10,False,FR,6V8
2,1479563953,7665,1,2014,11,False,FR,6V8
3,1480152098,1580,0,2000,10,False,NL,A20
4,1478368974,1812,1,2008,10,False,US,SM1


In [8]:
# Release-date and track-info columns to turn into cumulative-mean features.
cols = [
    'release_date_year',
    'release_date_decade',
    'ExplicitLyrics',
    'Isrc_Country',
    'Isrc_Label',
]

# One "<category>_cumulative_mean" column per entry of `cols`; `new_cols`
# keeps the generated column names in the order they were created.
new_cols = list(map(
    lambda category: mean_cum_by_category_by_user(df, category),
    cols,
))

df.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_decade,ExplicitLyrics,Isrc_Country,Isrc_Label,release_date_year_cumulative_mean,release_date_decade_cumulative_mean,ExplicitLyrics_cumulative_mean,Isrc_Country_cumulative_mean,Isrc_Label_cumulative_mean
682493,1477939775,0,1,2016,11,False,DK,4YA,0.0,0.0,0.0,0.0,0.0
4454939,1477982531,0,0,2016,11,False,FR,UM7,1.0,1.0,1.0,0.0,0.0
5354523,1477982561,0,0,2016,11,False,US,SM1,0.5,0.5,0.5,0.0,0.0
5899458,1477982582,0,1,2016,11,False,US,RC1,0.333333,0.333333,0.333333,0.0,0.0
4716574,1477982792,0,1,2016,11,False,QM,UY4,0.5,0.5,0.5,0.0,0.0


In [9]:
# mean_cum_by_category_by_user re-sorts `df` by user and time; put the rows
# back in index order (train rows first, then test rows).
df = df.sort_index()
df.tail()

Unnamed: 0,ts_listen,user_id,is_listened,release_date_year,release_date_decade,ExplicitLyrics,Isrc_Country,Isrc_Label,release_date_year_cumulative_mean,release_date_decade_cumulative_mean,ExplicitLyrics_cumulative_mean,Isrc_Country_cumulative_mean,Isrc_Label_cumulative_mean
7578747,1479664431,12641,0,2016,11,False,GB,2LD,0.116667,0.19,0.22449,0.288462,0.0
7578748,1480358099,10055,0,2016,11,False,GB,2LD,1.0,0.968421,0.967568,0.967742,1.0
7578749,1480616853,1029,0,1991,9,False,US,C4R,0.878378,0.868066,0.777778,0.875,0.0
7578750,1479644510,4630,0,1991,9,False,US,C4R,0.0,0.837838,0.875527,0.864198,0.0
7578751,1480491885,6467,0,1991,9,False,US,C4R,0.0,0.913043,0.931751,0.977273,0.0


In [None]:
path_to_test_cum_mean = './data/features/cum_mean2.test.csv'
path_to_train_cum_mean = './data/features/cum_mean2.train.csv'

# `df` holds train rows followed by test rows under a 0..N-1 index, so the
# positional split below is only valid in index order and with an unchanged
# row count. mean_cum_by_category_by_user re-sorts `df` in place, so the order
# is restored here instead of relying on another cell having run.
assert len(df) == len(train) + len(test), "df no longer has one row per train/test row"
cum_mean_features = df[new_cols].sort_index()

test_cum_mean = cum_mean_features.iloc[len(train):len(train) + len(test)]
test_cum_mean.to_csv(path_to_test_cum_mean, encoding='utf-8', index=False)

train_cum_mean = cum_mean_features.iloc[:len(train)]
train_cum_mean.to_csv(path_to_train_cum_mean, encoding='utf-8', index=False)