In [1]:
import pandas as pd
from pathlib import Path
data_root = Path('../kkdata3/')
train_source = pd.read_parquet(data_root / 'label_train_source.parquet')
train_target = pd.read_parquet(data_root / 'label_train_target.parquet')
test_source = pd.read_parquet(data_root / 'label_test_source.parquet')
meta_song = pd.read_parquet(data_root / 'meta_song.parquet')
#meta_song_composer = pd.read_parquet(data_root / 'meta_song_composer.parquet')
#meta_song_genre = pd.read_parquet(data_root / 'meta_song_genre.parquet')
#meta_song_lyricist = pd.read_parquet(data_root / 'meta_song_lyricist.parquet')
#meta_song_producer = pd.read_parquet(data_root / 'meta_song_producer.parquet')
#meta_song_titletext = pd.read_parquet(data_root / 'meta_song_titletext.parquet')


train_source.dtypes

session_id          int64
song_id            object
unix_played_at      int64
play_status         int64
login_type          int64
listening_order     int64
dtype: object

In [2]:
# map song_id to song_index to save memory and speed up 
meta_song['song_index'] = meta_song.index
train_source = train_source.merge(meta_song[['song_id', 'song_index']], on='song_id', how='left')
train_target = train_target.merge(meta_song[['song_id', 'song_index']], on='song_id', how='left')
test_source = test_source.merge(meta_song[['song_id', 'song_index']], on='song_id', how='left')
del train_source['song_id']
del train_target['song_id']
del test_source['song_id']

In [3]:
train_source['previous1_song_index'] = train_source.groupby('session_id')['song_index'].shift(1)
train_source['previous2_song_index'] = train_source.groupby('session_id')['song_index'].shift(2)
test_source['previous1_song_index'] = test_source.groupby('session_id')['song_index'].shift(1)
test_source['previous2_song_index'] = test_source.groupby('session_id')['song_index'].shift(2)
history_fields = ['previous1_song_index', 'previous2_song_index', 'song_index']
history=pd.concat([train_source[history_fields], test_source[history_fields]], ignore_index=True)
history.dropna(inplace=True)
history

Unnamed: 0,previous1_song_index,previous2_song_index,song_index
2,753834.0,283767.0,753834
3,753834.0,753834.0,753834
4,753834.0,753834.0,955400
5,955400.0,753834.0,13609
6,13609.0,955400.0,955400
...,...,...,...
14306455,154541.0,125431.0,408927
14306456,408927.0,154541.0,125430
14306457,125430.0,408927.0,490246
14306458,490246.0,125430.0,482103


In [4]:
gram1 = history[['previous1_song_index', 'song_index']].groupby(['previous1_song_index']).value_counts(normalize=True, sort=True)
gram1 = gram1.reset_index().groupby('previous1_song_index').first().reset_index()
gram1

Unnamed: 0,previous1_song_index,song_index,proportion
0,1.0,1014814,1.000000
1,3.0,87846,0.333333
2,4.0,982833,0.111111
3,5.0,5,0.166667
4,6.0,519864,1.000000
...,...,...,...
687932,1030707.0,76930,0.090909
687933,1030708.0,736280,0.183673
687934,1030709.0,915034,0.049451
687935,1030710.0,1030649,0.953488


In [40]:
gram2 = history[['previous2_song_index', 'previous1_song_index', 'song_index']].groupby(['previous2_song_index', 'previous1_song_index']).value_counts(normalize=True, sort=True)
gram2 = gram2.reset_index().groupby(['previous2_song_index','previous1_song_index']).first().reset_index()
gram2

Unnamed: 0,previous2_song_index,previous1_song_index,song_index,proportion
0,1.0,488417.0,697989,1.0
1,1.0,1014814.0,799780,1.0
2,3.0,87846.0,897082,1.0
3,3.0,200709.0,1021838,1.0
4,3.0,401307.0,318166,1.0
...,...,...,...,...
8269367,1030711.0,643112.0,6835,1.0
8269368,1030711.0,936567.0,1023129,1.0
8269369,1030711.0,946599.0,125355,1.0
8269370,1030711.0,967484.0,609494,1.0


In [5]:
train_Y = train_target.pivot_table(index='session_id', columns='listening_order', values='song_index', aggfunc='first', fill_value=0)

In [6]:
train_X = train_source.pivot_table(index='session_id', columns='listening_order', values='song_index', aggfunc='first', fill_value=0)

In [7]:
df = pd.concat([train_X[[19,20]], train_Y], axis=1)
df

listening_order,19,20,21,22,23,24,25
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,194387,722034,624433,901147,810092,74420,250714
2,523383,672083,52476,249524,514925,495260,130336
3,578482,261314,203739,402134,129395,230525,946149
4,238820,709693,453065,709693,453065,709693,453065
5,77515,246245,186586,338120,186586,338120,130652
...,...,...,...,...,...,...,...
715317,198412,422547,615766,560313,553343,624094,258244
715320,967471,970952,784715,745441,716579,732103,732103
715321,277358,522132,681203,90781,692730,469116,592612
715322,247342,832762,953126,766052,82260,126631,82260


In [9]:
gram1_predict = pd.merge(df, gram1, left_on=[20], right_on=['previous1_song_index'], how='left')

In [12]:
(gram1_predict['song_index'] == gram1_predict[21]).mean()

0.1488591704106008

In [23]:
gram1_predict['p'] = gram1_predict['song_index'].fillna(gram1_predict[20]).astype(int).values

In [24]:
(gram1_predict['p'] == gram1_predict[21]).mean()

0.14922963203724188

In [28]:
gram1_predict[[21,22]] == 

(572259, 2)

In [66]:
import rich
r = gram1_predict[[21,22,23,24,25]] == gram1_predict[['p']].values
r[25] = r[25] & ~(r[[21,22,23,24]].any(axis=1))
r[24] = r[24] & ~(r[[21,22,23]].any(axis=1))
r[23] = r[23] & ~(r[[21,22]].any(axis=1))
r[22] = r[22] & ~(r[[21]].any(axis=1))
w = [1., 0.63, 0.5, 0.43, 0.38]
all_songs = meta_song.shape[0]
dcg = r.mean() @ w
predict_songs = gram1_predict['p'].sample(frac=0.25).nunique()
fill_songs = 4*test_source['session_id'].nunique()
coverage = (predict_songs+fill_songs)/all_songs
rich.print({'dcg':dcg, 'coverage':coverage, 'total':dcg*0.8+coverage*0.2})

In [None]:
for k in ['song_index', 'previous1_song_index', 'proportion']:
    del gram1_predict[k]
gram1_predict

In [52]:
gram2_predict = pd.merge(gram1_predict, gram2, left_on=[19,20], right_on=['previous2_song_index', 'previous1_song_index'], how='left')
for k in ['previous2_song_index', 'previous1_song_index', 'proportion']:
    del gram2_predict[k]
gram2_predict

Unnamed: 0,19,20,21,22,23,24,25,p,song_index
0,194387,722034,624433,901147,810092,74420,250714,722034,
1,523383,672083,52476,249524,514925,495260,130336,672083,364625.0
2,578482,261314,203739,402134,129395,230525,946149,261314,
3,238820,709693,453065,709693,453065,709693,453065,709693,
4,77515,246245,186586,338120,186586,338120,130652,365352,
...,...,...,...,...,...,...,...,...,...
572254,198412,422547,615766,560313,553343,624094,258244,615766,615766.0
572255,967471,970952,784715,745441,716579,732103,732103,745441,967471.0
572256,277358,522132,681203,90781,692730,469116,592612,340530,
572257,247342,832762,953126,766052,82260,126631,82260,716463,953126.0


In [53]:
(gram2_predict['song_index'] == gram2_predict[21]).mean()

0.2187261362425056

In [60]:
gram2_predict['p2'] = gram2_predict['song_index'].fillna(gram2_predict['p']).astype(int)
gram2_predict

Unnamed: 0,19,20,21,22,23,24,25,p,song_index,p2
0,194387,722034,624433,901147,810092,74420,250714,722034,,722034
1,523383,672083,52476,249524,514925,495260,130336,672083,364625.0,364625
2,578482,261314,203739,402134,129395,230525,946149,261314,,261314
3,238820,709693,453065,709693,453065,709693,453065,709693,,709693
4,77515,246245,186586,338120,186586,338120,130652,365352,,365352
...,...,...,...,...,...,...,...,...,...,...
572254,198412,422547,615766,560313,553343,624094,258244,615766,615766.0,615766
572255,967471,970952,784715,745441,716579,732103,732103,745441,967471.0,967471
572256,277358,522132,681203,90781,692730,469116,592612,340530,,340530
572257,247342,832762,953126,766052,82260,126631,82260,716463,953126.0,953126


In [61]:
(gram2_predict['p2'] == gram2_predict[21]).mean()

0.228959264948214

In [65]:
r = gram2_predict[[21,22,23,24,25]] == gram2_predict[['p2']].values
r[25] = r[25] & ~(r[[21,22,23,24]].any(axis=1))
r[24] = r[24] & ~(r[[21,22,23]].any(axis=1))
r[23] = r[23] & ~(r[[21,22]].any(axis=1))
r[22] = r[22] & ~(r[[21]].any(axis=1))
w = [1., 0.63, 0.5, 0.43, 0.38]
all_songs = meta_song.shape[0]
dcg = r.mean() @ w
predict_songs = gram2_predict['p2'].sample(frac=0.25).nunique()
fill_songs = 4*test_source['session_id'].nunique()
coverage = (predict_songs+fill_songs)/all_songs
rich.print({'dcg':dcg, 'coverage':coverage, 'total':dcg*0.8+coverage*0.2})

In [78]:
cond = (gram2_predict[19] == gram2_predict[20])
w = [1., 0.63, 0.5, 0.43, 0.38]
s = 0
for i in range(22, 26):
    x = gram2_predict[20] == gram2_predict[25]
    s+=(x[cond]).mean()*w[i-21]
print(s)

1.1804716981132077


In [80]:
s*cond.mean()*0.8

0.05492732486513975

In [82]:
(gram2_predict['p2']==gram2_predict[20]) [cond].mean()

0.9067720225934383

In [86]:
gram2_predict['p3'] = gram2_predict['p2'].where(~cond, gram2_predict[20])
gram2_predict

Unnamed: 0,19,20,21,22,23,24,25,p,song_index,p2,p3
0,194387,722034,624433,901147,810092,74420,250714,722034,,722034,722034
1,523383,672083,52476,249524,514925,495260,130336,672083,364625.0,364625,364625
2,578482,261314,203739,402134,129395,230525,946149,261314,,261314,261314
3,238820,709693,453065,709693,453065,709693,453065,709693,,709693,709693
4,77515,246245,186586,338120,186586,338120,130652,365352,,365352,365352
...,...,...,...,...,...,...,...,...,...,...,...
572254,198412,422547,615766,560313,553343,624094,258244,615766,615766.0,615766,615766
572255,967471,970952,784715,745441,716579,732103,732103,745441,967471.0,967471,967471
572256,277358,522132,681203,90781,692730,469116,592612,340530,,340530,340530
572257,247342,832762,953126,766052,82260,126631,82260,716463,953126.0,953126,953126


In [88]:
r = gram2_predict[[21,22,23,24,25]] == gram2_predict[['p3']].values
r[25] = r[25] & ~(r[[21,22,23,24]].any(axis=1))
r[24] = r[24] & ~(r[[21,22,23]].any(axis=1))
r[23] = r[23] & ~(r[[21,22]].any(axis=1))
r[22] = r[22] & ~(r[[21]].any(axis=1))
w = [1., 0.63, 0.5, 0.43, 0.38]
all_songs = meta_song.shape[0]
dcg = r.mean() @ w
predict_songs = gram2_predict['p3'].sample(frac=0.25).nunique()
fill_songs = 4*test_source['session_id'].nunique()
coverage = (predict_songs+fill_songs)/all_songs
rich.print({'dcg':dcg, 'coverage':coverage, 'total':dcg*0.8+coverage*0.2})

In [67]:
df2['song_order'] = df2.iloc[::-1].groupby('session_id')['song_index'].cumcount()
Z = df2.pivot_table(index='session_id', columns='song_order', values='song_index', aggfunc='first', fill_value=-1)
Z

song_order,0,1,2,3,4
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,722034,194387,910165,913263,962778
2,672083,523383,307906,415453,659818
3,261314,578482,606328,133500,332935
4,709693,267047,824980,503008,238820
5,246245,77515,482509,589816,7267
...,...,...,...,...,...
715317,422547,198412,614405,505242,506580
715320,970952,967471,918865,758868,604807
715321,522132,277358,832786,791040,278029
715322,825005,722323,880639,1030620,953126


In [68]:
Z.dtypes

song_order
0    int64
1    int64
2    int64
3    int64
4    int64
dtype: object

In [69]:
df3 = pd.concat([Z[[4,3,2,1,0]], df[[21,22,23,24,25]]], axis=1)
df3

Unnamed: 0_level_0,4,3,2,1,0,21,22,23,24,25
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,962778,913263,910165,194387,722034,624433,901147,810092,74420,250714
2,659818,415453,307906,523383,672083,52476,249524,514925,495260,130336
3,332935,133500,606328,578482,261314,203739,402134,129395,230525,946149
4,238820,503008,824980,267047,709693,453065,709693,453065,709693,453065
5,7267,589816,482509,77515,246245,186586,338120,186586,338120,130652
...,...,...,...,...,...,...,...,...,...,...
715317,506580,505242,614405,198412,422547,615766,560313,553343,624094,258244
715320,604807,758868,918865,967471,970952,784715,745441,716579,732103,732103
715321,278029,791040,832786,277358,522132,681203,90781,692730,469116,592612
715322,953126,1030620,880639,722323,825005,953126,766052,82260,126631,82260


In [71]:
df3_ = df3.sample(frac=0.25)
score = {}
all_songs = meta_song.shape[0]
for n in range(1, 6):
    dcg= df3.apply(lambda x: afunc(x.to_numpy(), n), axis=1).mean()
    predict_songs = len(set(df3_[[4,3,2,1,0][5-n:]].values.ravel()))
    fill_songs = (5-n)*test_source['session_id'].nunique()
    coverage = (predict_songs+fill_songs)/all_songs
    score[n] = {'dcg':dcg, 'coverage':coverage, 'total':dcg*0.8+coverage*0.2}
rich.print(score)