In [3]:
import pandas as pd
from pathlib import Path
# Directory that holds the parquet tables, relative to the working directory.
data_root = Path('../kkdata3/')

# Tables used by the rest of the notebook.
train_source = pd.read_parquet(data_root.joinpath('label_train_source.parquet'))
train_target = pd.read_parquet(data_root.joinpath('label_train_target.parquet'))
test_source = pd.read_parquet(data_root.joinpath('label_test_source.parquet'))
meta_song = pd.read_parquet(data_root.joinpath('meta_song.parquet'))

# Further song metadata tables, currently disabled; uncomment a line to load it.
#meta_song_composer = pd.read_parquet(data_root / 'meta_song_composer.parquet')
#meta_song_genre = pd.read_parquet(data_root / 'meta_song_genre.parquet')
#meta_song_lyricist = pd.read_parquet(data_root / 'meta_song_lyricist.parquet')
#meta_song_producer = pd.read_parquet(data_root / 'meta_song_producer.parquet')
#meta_song_titletext = pd.read_parquet(data_root / 'meta_song_titletext.parquet')

# Cell output: the column dtypes of the training source table.
train_source.dtypes

session_id          int64
song_id            object
unix_played_at      int64
play_status         int64
login_type          int64
listening_order     int64
dtype: object

In [10]:
# Map the string song_id to an integer song_index (the row label of meta_song)
# to save memory and speed up later operations.
meta_song['song_index'] = meta_song.index
song_lookup = meta_song[['song_id', 'song_index']]


def _with_song_index(frame):
    """Return `frame` with its song_id column replaced by song_index.

    The lookup is a left merge against meta_song, so a song_id that is absent
    from meta_song ends up with a missing song_index rather than being dropped.

    A frame that no longer has a song_id column is returned unchanged. This
    keeps the cell re-runnable: previously a second execution raised KeyError
    because song_id had already been deleted.
    """
    if 'song_id' not in frame.columns:
        return frame
    mapped = frame.merge(song_lookup, on='song_id', how='left')
    # Drop the string key in place to avoid copying the merged frame again.
    del mapped['song_id']
    return mapped


train_source = _with_song_index(train_source)
train_target = _with_song_index(train_target)
test_source = _with_song_index(test_source)

In [11]:
import rich
# Print each table's column dtypes under a plain-text label.
labelled_frames = (
    ("train_source", train_source),
    ("train_target", train_target),
    ("test_source", test_source),
    ("meta_song", meta_song),
)
for label, frame in labelled_frames:
    print(label)
    rich.print(frame.dtypes)



train_source


train_target


test_source


meta_song


In [14]:
# Wide target table: one row per session_id, one column per listening_order,
# each cell holding the song_index at that position. aggfunc='first' keeps the
# first value if a (session, order) pair repeats; empty cells are filled with 0.
# NOTE(review): song_index comes from meta_song.index, so 0 is presumably also
# a real song — confirm that no cell is actually filled here.
train_Y = train_target.pivot_table(
    values='song_index',
    index='session_id',
    columns='listening_order',
    aggfunc='first',
    fill_value=0,
)

In [15]:
# Wide source table: one row per session_id, one column per listening_order,
# each cell holding the song_index at that position. aggfunc='first' keeps the
# first value if a (session, order) pair repeats; empty cells are filled with 0.
# NOTE(review): song_index comes from meta_song.index, so 0 is presumably also
# a real song — confirm that no cell is actually filled here.
train_X = train_source.pivot_table(
    values='song_index',
    index='session_id',
    columns='listening_order',
    aggfunc='first',
    fill_value=0,
)

In [28]:
# Put source positions 16-20 next to the target columns from train_Y, aligned
# on the session_id index.
last_source_positions = list(range(16, 21))
df = pd.concat([train_X[last_source_positions], train_Y], axis='columns')
df

listening_order,16,17,18,19,20,21,22,23,24,25
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,962778,913263,910165,194387,722034,624433,901147,810092,74420,250714
2,659818,415453,307906,523383,672083,52476,249524,514925,495260,130336
3,332935,133500,606328,578482,261314,203739,402134,129395,230525,946149
4,503008,824980,267047,238820,709693,453065,709693,453065,709693,453065
5,7267,589816,482509,77515,246245,186586,338120,186586,338120,130652
...,...,...,...,...,...,...,...,...,...,...
715317,506580,505242,614405,198412,422547,615766,560313,553343,624094,258244
715320,604807,758868,918865,967471,970952,784715,745441,716579,732103,732103
715321,791040,791040,832786,277358,522132,681203,90781,692730,469116,592612
715322,880639,722323,825005,247342,832762,953126,766052,82260,126631,82260


In [18]:
# Fraction of sessions in which the song at a given source position equals the
# first target column (21).
# NOTE(review): position 16 is in df but is not checked here — confirm that
# this is intended.
first_target = df[21]
for position in (17, 18, 19, 20):
    match_rate = df[position].eq(first_target).mean()
    print(position, match_rate)

17 0.05118311813357239
18 0.047941578900462904
19 0.06867694522934545
20 0.057650469455264135


In [25]:
# Weight per target position (columns 21..25, in order).
# NOTE(review): these look like a rounded 1/log2(rank + 1) — confirm.
# The earlier two-element assignment to `w` was dead code (overwritten on the
# next line) and has been removed.
w = [1.0, 0.63, 0.5, 0.43, 0.38]

# The five source columns do not change inside the loop, so select them once.
recent = df[[16, 17, 18, 19, 20]].to_numpy()

# Reset the column first so that re-running the cell does not accumulate.
df['score'] = 0.
for j in range(21, 26):
    # True where target column j equals any of the five source columns.
    hit = (df[[j]].to_numpy() == recent).any(axis=1)
    df['score'] += hit * w[j-21]

# 0.8 is the weight given to the DCG term in the 'total' score used by later cells.
s5 = df['score'].mean()*0.8
s5

0.21284453368142753

In [55]:
import numpy as np
import numba
@numba.jit(nopython=True)
def afunc(x:np.ndarray, n:int):
    """Score one row by matching its targets against its last `n` candidates.

    x: 1-D array whose first five entries are candidate songs and whose
       entries from index 5 onward are the targets (the first five are read).
    n: how many of the trailing candidate slots (x[5-n:5]) to use.

    Each target that is still among the unused candidates adds that target
    position's weight, and the matched candidate is then consumed, so one
    candidate can be credited at most once. Returns the summed weight.
    """
    weights = [1.0, 0.63, 0.5, 0.43, 0.38]
    unused = list(x[5-n:5])
    targets = x[5:]
    total = 0.
    for rank in range(5):
        song = targets[rank]
        if song in unused:
            unused.remove(song)
            total += weights[rank]
    return total
# afunc reads its row positionally: five history slots, then five targets.
# Select exactly those columns so that any extra column on df (for example the
# 'score' column an earlier cell adds) cannot shift the layout.
history_cols = [16, 17, 18, 19, 20]
target_cols = [21, 22, 23, 24, 25]
eval_frame = df[history_cols + target_cols]

# A 25% sample of sessions is used for the coverage estimate; the seed makes
# the estimate reproducible across runs.
df_ = df.sample(frac=0.25, random_state=0)
score = {}
all_songs = meta_song.shape[0]
# Loop-invariant: number of distinct test sessions.
test_sessions = test_source['session_id'].nunique()
for n in range(1, 6):
    # Mean weighted hit score when predicting the last n history songs.
    dcg = eval_frame.apply(lambda x: afunc(x.to_numpy(), n), axis=1).mean()
    # Distinct songs among the last n history columns of the sample.
    predict_songs = len(set(df_[history_cols[5-n:]].values.ravel()))
    # The remaining (5-n) slots per test session are counted as one song each.
    fill_songs = (5-n)*test_sessions
    coverage = (predict_songs+fill_songs)/all_songs
    score[n] = {'dcg':dcg, 'coverage':coverage, 'total':dcg*0.8+coverage*0.2}
rich.print(score)

In [56]:
# Keep up to five distinct (session_id, song_index) rows per session.
# drop_duplicates keeps the first occurrence of each pair, and tail(5) takes the
# last five remaining rows of each session in the frame's existing row order.
# NOTE(review): presumably rows are already in listening order within a
# session — confirm. A repeated song is ranked by its first play, not its last.
df2 = (
    train_source.loc[:, ['session_id', 'song_index']]
    .drop_duplicates()
    .groupby('session_id')
    .tail(5)
)

In [57]:
# Display the de-duplicated (session_id, song_index) rows kept per session.
df2

Unnamed: 0,session_id,song_index
5,751,13609
7,751,972005
10,751,464634
11,751,870557
16,751,28802
...,...,...
11445163,458622,889765
11445164,458622,779020
11445165,458622,912111
11445166,458622,862546


In [67]:
# Number each session's retained rows from the end: 0 is the session's last
# row, 1 the one before it, and so on. cumcount(ascending=False) gives the same
# numbering as reversing the frame and counting forward. assign() builds a new
# frame instead of writing a column into the groupby/tail result, which may
# trigger pandas' SettingWithCopyWarning.
df2 = df2.assign(song_order=df2.groupby('session_id').cumcount(ascending=False))

# Wide table: one row per session, one column per song_order. Sessions with
# fewer than five retained rows get -1 in the missing slots.
Z = df2.pivot_table(index='session_id', columns='song_order', values='song_index', aggfunc='first', fill_value=-1)
Z

song_order,0,1,2,3,4
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,722034,194387,910165,913263,962778
2,672083,523383,307906,415453,659818
3,261314,578482,606328,133500,332935
4,709693,267047,824980,503008,238820
5,246245,77515,482509,589816,7267
...,...,...,...,...,...
715317,422547,198412,614405,505242,506580
715320,970952,967471,918865,758868,604807
715321,522132,277358,832786,791040,278029
715322,825005,722323,880639,1030620,953126


In [68]:
# Inspect the dtypes of the pivoted song_order columns.
Z.dtypes

song_order
0    int64
1    int64
2    int64
3    int64
4    int64
dtype: object

In [69]:
# Same layout as df, but the five history slots come from Z, ordered from
# song_order 4 down to 0, followed by target columns 21-25 from df.
history_slots = list(range(4, -1, -1))
target_positions = list(range(21, 26))
df3 = pd.concat([Z[history_slots], df[target_positions]], axis='columns')
df3

Unnamed: 0_level_0,4,3,2,1,0,21,22,23,24,25
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,962778,913263,910165,194387,722034,624433,901147,810092,74420,250714
2,659818,415453,307906,523383,672083,52476,249524,514925,495260,130336
3,332935,133500,606328,578482,261314,203739,402134,129395,230525,946149
4,238820,503008,824980,267047,709693,453065,709693,453065,709693,453065
5,7267,589816,482509,77515,246245,186586,338120,186586,338120,130652
...,...,...,...,...,...,...,...,...,...,...
715317,506580,505242,614405,198412,422547,615766,560313,553343,624094,258244
715320,604807,758868,918865,967471,970952,784715,745441,716579,732103,732103
715321,278029,791040,832786,277358,522132,681203,90781,692730,469116,592612
715322,953126,1030620,880639,722323,825005,953126,766052,82260,126631,82260


In [71]:
# History columns of df3, from song_order 4 down to song_order 0.
recent_cols = [4, 3, 2, 1, 0]

# A 25% sample of sessions is used for the coverage estimate; the seed makes
# the estimate reproducible across runs.
df3_ = df3.sample(frac=0.25, random_state=0)
score = {}
all_songs = meta_song.shape[0]
# Loop-invariant: number of distinct test sessions.
test_sessions = test_source['session_id'].nunique()
for n in range(1, 6):
    # Mean weighted hit score when predicting the last n history slots.
    dcg = df3.apply(lambda x: afunc(x.to_numpy(), n), axis=1).mean()
    predicted = set(df3_[recent_cols[5-n:]].values.ravel())
    # -1 is the placeholder Z uses for an empty slot, not a song; do not count it.
    predicted.discard(-1)
    predict_songs = len(predicted)
    # The remaining (5-n) slots per test session are counted as one song each.
    fill_songs = (5-n)*test_sessions
    coverage = (predict_songs+fill_songs)/all_songs
    score[n] = {'dcg':dcg, 'coverage':coverage, 'total':dcg*0.8+coverage*0.2}
rich.print(score)