In [60]:
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from scipy.sparse import *
from collections import Counter
import pandas as pd

In [61]:
org_train = pd.read_json("data/train.json")
org_val = pd.read_json("data/val.json")

## Also add likes

In [84]:
row_train = org_train.loc[:,["tags","like_cnt","songs"]]
row_val = org_val.loc[:,["tags","id","songs", "like_cnt"]]

In [63]:
SONG_TOP_X = 50000
TAG_TOP_X = 29160

## Average num of tags

In [9]:
songs_list = list()

total = 0

for playlist in raw_train.tags.tolist():
    total += len(playlist)

print(total/len(raw_train))

4.139453033344631


## Total number of tags and songs

In [10]:
def count_col(data):
    tmp = dict()
    ret = 0

    for row in (data):
        for song in row:
            if song not in tmp:
                tmp[song] = 0
                ret +=1 
    return ret

In [11]:
count_col(raw_train.songs.tolist())

615142

In [12]:
count_col(raw_train.tags.tolist())

29160

## Calc Portion

In [8]:
def calc_portion_x(cols,x):
    all_cols = []
    for col in cols:
        all_cols += col
        
    cols_cnt = dict(Counter(all_cols))
    
    cols_cnt_list = sorted(cols_cnt.items(), key=lambda t: -t[1])
    
    x_cnt = sum(x[1] for x in cols_cnt_list[:x])
    total = sum(x[1] for x in cols_cnt_list)
    return x_cnt/total

### tag portion

In [9]:
tag_portion = calc_portion_x(raw_train.tags.tolist(),TAG_TOP_X)

In [10]:
tag_portion

0.8914641289355512

### song portion

In [11]:
song_portion = calc_portion_x(raw_train.songs.tolist(),SONG_TOP_X)

In [12]:
song_portion

0.7054667054871373

## Extract TOP_X songs and tags

In [64]:
def get_top_x(cols, x):
    """
    cols : 2D array
    x: int
    ---------------------------
    song_id : cnt
    in descending order
    """
    all_cols = []
    for col in cols:
        all_cols += col
        
    cols_cnt = dict(Counter(all_cols))
    
    cols_cnt_list = sorted(cols_cnt.items(), key=lambda t: -t[1])
    top_cols_cnt = dict()

    for col, cnt in cols_cnt_list[:x]:
        top_cols_cnt[col] = cnt
    return top_cols_cnt

In [65]:
top_songs = get_top_x(raw_train.songs.tolist(), SONG_TOP_X)

assert len(top_songs) == SONG_TOP_X, "top_songs are not extracted correctly"

In [66]:
top_tags = get_top_x(raw_train.tags.tolist(), TAG_TOP_X)

assert len(top_tags) == TAG_TOP_X, "top_tags are not extracted correctly"

## Remove raw data that song_id is not in top_song and change song_id to idx

song_to_idx:
  - key: song_id (from raw data)
  - value: idx [0 : SONG_TOP_X-1]
 
idx_to_song:
   - key: idx [0 : SONG_TOP_X-1]
   - value: song_id (from raw data)

In [67]:
song_to_idx = dict()
idx_to_song = dict()
idx = 0

#make song to idx
#make idx to song
for songs in raw_train.songs.tolist():
    for song in songs:
        if song not in song_to_idx and song in top_songs:
            song_to_idx[song] = idx
            idx_to_song[idx] = song
            idx+=1

#change song id to idx
for i, row in raw_train.iterrows():
    tmp = []
    for songs in raw_train.loc[i,["songs"]]:
        for song in songs:
            if song in top_songs: tmp.append(song_to_idx[song])
    raw_train.at[i,'songs'] = tmp

In [68]:
#change te song id to idx
for i, row in raw_val.iterrows():
    tmp = []
    for songs in raw_val.loc[i,["songs"]]:
        for song in songs:
            if song in top_songs: tmp.append(song_to_idx[song])
    raw_val.at[i,'songs'] = tmp

In [69]:
assert len(song_to_idx) == SONG_TOP_X, "song_to_idx has problem"

## Remove raw data that tag is not in top_tag and change change tags from str to id

tag_to_idx:
  - key: tag_id (from raw data, str)
  - value: idx [SONG_TOP_X : TAG_TOP_X+SONG_TOP_X-1]
 
idx_to_tag:
   - key: idx [SONG_TOP_X : TAG_TOP_X+SONG_TOP_X-1]
   - value: song_id (from raw data)

In [99]:
tag_to_idx = dict()
idx_to_tag = dict()

#make song to idx
#make idx to song
for tags in raw_train.tags.tolist():
    for tag in tags:
        if tag not in tag_to_idx and tag in top_tags:
            tag_to_idx[tag] = idx
            idx_to_tag[idx] = tag
            idx+=1

#change song id to idx
for i, row in raw_train.iterrows():
    tmp = []
    for tags in raw_train.loc[i,["tags"]]:
        for tag in tags:
            if tag in top_tags: tmp.append(tag_to_idx[tag])
    raw_train.at[i,'tags'] = tmp

In [100]:
idx_to_tag

{}

In [71]:
for i, row in raw_val.iterrows():
    tmp = []
    for tags in raw_val.loc[i,["tags"]]:
        for tag in tags:
            if tag in top_tags: tmp.append(tag_to_idx[tag])
    raw_val.at[i,'tags'] = tmp

In [72]:
assert len(tag_to_idx) == TAG_TOP_X, "tag_to_idx has problem"

In [73]:
n_items = len(song_to_idx)

## Make playlist X (songs + tags ids) table

In [74]:
tr_songs = raw_train.songs.tolist()
tr_tags = raw_train.tags.tolist()
te_songs = raw_val.songs.tolist()
te_tags = raw_val.tags.tolist()

tr & te:
  - row: playlist
  - col: {song| tag}_idx (from 0 to SONG_TOP_X + TAG_TOP_X)


In [75]:
tr = []

for songs in tr_songs:
    tr.append(songs)

for i, tags in enumerate(tr_tags):
    tr[i].extend(tags)

In [76]:
te = []
for songs in te_songs:
    te.append(songs)

for i, tags in enumerate(te_tags):
    te[i].extend(tags)

## Make likes

## Change te is doen at above

In [86]:
def lil_to_csr(playlists,playlists2 = [], likes1 = [], likes2 = []):
    """
    playlists: playlist with top songs and tags
    """
    row = []
    col = []
    data = []
    te_row = len(te)
    
    for row_idx, playlist in enumerate(playlists):
        tmp_idx = 0
        for idx in playlist:
            col.append(idx)
            if tmp_idx > n_items:
                data.append(1)
            else:
                data.append(likes1[row_idx])
                tmp_idx +=1
            data.append(likes1[row_idx])
            row.append(row_idx)
    
    for row_idx, playlist in enumerate(playlists2):
        tmp_idx = 0
        for idx in playlist:
            col.append(idx)
            if tmp_idx > n_items:
                data.append(1)
            else:
                data.append(likes2[row_idx])
                tmp_idx +=1
            row.append(te_row + row_idx)
    return row, col, data

In [91]:
csr_row, csr_col, csr_data = lil_to_csr(te, tr, row_val.like_cnt.tolist(), row_train.like_cnt.tolist())

In [93]:
len(csr_data)

5271702

In [95]:
len(csr_row)

4930727

In [96]:
len(csr_col)

4930727

In [92]:
r = csr_matrix((csr_data, (csr_row, csr_col)))

ValueError: row, column, and data array must all be the same length

In [None]:
from scipy import sparse

In [None]:
sparse.save_npz("./data/preprocessed/csr.npz", r)

In [None]:
validation_len = len(te)
te_ids = org_val.id.tolist()
# n_items
# song_top_x

extra = {"v_len" : validation_len,
        "te_ids" : te_ids,
        "n_items": n_items,
        "SONG_TOP_X" : SONG_TOP_X
        }

import pickle
import os
from pathlib import Path
script_dir = os.getcwd()
data_dir = "data/preprocessed"
abs_data_path = Path(os.path.join(script_dir, data_dir))
if not os.path.exists(abs_data_path):
    os.makedirs(abs_data_path)
with open(abs_data_path/"extra.pickle", 'wb') as handle:
    pickle.dump(extra, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(abs_data_path/"song.pickle", "wb") as f:
    pickle.dump(idx_to_song,f)

with open(abs_data_path/"tag.pickle", "wb") as f:
    pickle.dump(idx_to_tag,f)

In [98]:
idx_to_tag

{50000: 615142,
 50001: 615143,
 50002: 615144,
 50003: 615145,
 50004: 615146,
 50005: 615147,
 50006: 615148,
 50007: 615149,
 50008: 615150,
 50009: 615151,
 50010: 615152,
 50011: 615153,
 50012: 615154,
 50013: 615155,
 50014: 615156,
 50015: 615157,
 50016: 615158,
 50017: 615159,
 50018: 615160,
 50019: 615161,
 50020: 615162,
 50021: 615163,
 50022: 615164,
 50023: 615165,
 50024: 615166,
 50025: 615167,
 50026: 615168,
 50027: 615169,
 50028: 615170,
 50029: 615171,
 50030: 615172,
 50031: 615173,
 50032: 615174,
 50033: 615175,
 50034: 615176,
 50035: 615177,
 50036: 615178,
 50037: 615179,
 50038: 615180,
 50039: 615181,
 50040: 615182,
 50041: 615183,
 50042: 615184,
 50043: 615185,
 50044: 615186,
 50045: 615187,
 50046: 615188,
 50047: 615189,
 50048: 615190,
 50049: 615191,
 50050: 615192,
 50051: 615193,
 50052: 615194,
 50053: 615195,
 50054: 615196,
 50055: 615197,
 50056: 615198,
 50057: 615199,
 50058: 615200,
 50059: 615201,
 50060: 615202,
 50061: 615203,
 50062: 