In [1]:
import pandas as pd
import numpy as np
import random
import os

np.random.seed(1)
random.seed(1)
DATA_DIR = "C:\\Users\\Ben\\OneDrive\\ZaloSongComp\\zalo-hit-song-prediction\\csv"
TRAININFO = os.path.join(DATA_DIR, "train_info.tsv")
TRAINRANK =  os.path.join(DATA_DIR, "train_rank.csv")
TESTINFO = os.path.join(DATA_DIR, "test_info.tsv")
SUBMISSION = os.path.join(DATA_DIR, "submission.csv")

# Prepare data
df_i = pd.read_csv(TRAININFO, delimiter='\t',encoding='utf-8')
df_r = pd.read_csv(TRAINRANK)
df_i_train = df_i.merge(df_r, left_on='ID', right_on='ID')
df_i_train["dataset"] = "train"

df_i_test = pd.read_csv(TESTINFO, delimiter='\t',encoding='utf-8')
df_i_test["label"] = np.nan
df_i_test["dataset"] = "test"

df = pd.concat([df_i_train, df_i_test])
df_track_info = pd.read_csv(os.path.join(DATA_DIR, "all_track_info.csv"))
df = df.merge(df_track_info, left_on='ID', right_on='ID')
df_audio_features = pd.read_csv(os.path.join(DATA_DIR, "all_track_audio_features.csv"))
df =df.merge(df_audio_features,left_on="ID",right_on="ID", how="left")

# Sort by ID
df = df.sort_values(by=['ID'])
df= df.reset_index()

df.head(3)

Unnamed: 0,index,ID,title,artist_name,artist_id,composers_name,composers_id,release_time,label,dataset,...,tonal.tuning_frequency,tonal.tuning_nontempered_energy_ratio,tonal.chords_key,tonal.chords_scale,tonal.key_edma.key,tonal.key_edma.scale,tonal.key_krumhansl.key,tonal.key_krumhansl.scale,tonal.key_temperley.key,tonal.key_temperley.scale
0,0,1073748245,Đêm Chôn Dầu Vượt Biển,Như Quỳnh,551,Châu Đình An,5765,2017-10-01 22:07:00,7.0,train,...,440.0,0.601478,D,major,G,major,G,major,G,major
1,1,1073751978,Mùa Thu Trong Mưa,Minh Tuyết,455,Trường Sa,100105,2017-10-01 20:58:00,3.0,train,...,434.193115,0.944516,C,minor,C,minor,C,minor,C,minor
2,2,1073835561,Rồi Ánh Trăng Tan,Lưu Bích,450,Quốc Bảo,4355,2017-11-01 18:16:00,6.0,train,...,434.193115,0.957651,Bb,major,D,minor,D,minor,D,minor


In [None]:
df["artist_id"]

0                                551
1                                455
2                                450
3                                551
4                                513
5                               6515
6                                455
7                                455
8                                455
9                            455.306
10                               455
11                              7928
12                               901
13                              5072
14                               827
15                            828.87
16                               518
17                               918
18                              6517
19                         18241.918
20                              2247
21                              3184
22                             42005
23                             11740
24                              1331
25                               828
26                               455
2

In [None]:
from format_features import format_features
df = format_features(df)



In [10]:
df["album"] = df["album"].fillna("")
df["album"].apply(lambda x: len(x.split(" ")))


0        4
1        8
2        5
3        4
4        3
5        9
6        7
7        7
8        7
9        7
10       7
11       6
12       8
13       7
14       8
15       4
16       4
17       1
18       1
19       1
20       1
21       1
22       1
23       1
24       1
25       9
26       8
27       8
28       8
29       8
        ..
10166    1
10167    1
10168    6
10169    6
10170    6
10171    2
10172    7
10173    5
10174    6
10175    7
10176    5
10177    3
10178    3
10179    1
10180    1
10181    5
10182    1
10183    6
10184    3
10185    3
10186    1
10187    3
10188    3
10189    3
10190    3
10191    3
10192    3
10193    3
10194    3
10195    7
Name: album, Length: 10196, dtype: int64

In [5]:
chosen_features = ["album_right", "istrack11", "no_artist", "no_composer","freq_artist", "freq_composer","year", "month","hour", "day", "len_of_songname", 
                   "isRemix", "isOST", "isBeat", "isVersion", "isCover",  "num_song_release_in_final_month",
                  "length", "genre", "track","album_artist", "islyric", "album_artist_contain_artistname",
                  "len_album_name", "isRemixAlbum", "isOSTAlbum", "isSingleAlbum", "album_name_is_title_name",
                  "isBeatAlbum", "isCoverAlbum", "artist_name","composers_name","copyright" ,
                  "artist_id_min", "composers_id_min_cat",  "artist_id_max_cat", "composers_id_max_cat", 
                   "freq_artist_min", "freq_composer_min","dayofyear","weekday","isHoliday",
                  "num_album_per_min_artist", "num_album_per_min_composer", 
                   "numsongInAlbum","isSingleAlbum_onesong" ]

all_features_in_order = {"album":"category", # album name from mp3 metadata textual
                          "len_album_name":"int64",
                          "isRemixAlbum":"category",
                          "isOSTAlbum":"category",
                          "isSingleAlbum":"category",
                          "isBeatAlbum":"category",
                          "isTopHitAlbum":"category",
                          "isCoverAlbum":"category",
                          "isEPAlbum":"category",
                          "isLienKhucAlbum":"category",
                          "album_name_is_title_name":"category",
                          "artist_name":"category",
                          "composers_name":"category",
                          "copyright":"category" ,
                          "artist_id_min":"category",
                          "artist_id_max":"category", 
                          "composers_id_min_cat":"category", 
                          "composers_id_max_cat":"category",
                          "genre":"category", 
                          "album_artist":"category", # album artist name
                          "album_artist_contain_artistname":"category",
                          "track":"float64", # float between 0 and 1 representing track_num/total_tracks
                          "istrack11":"category", # 1 if first track
                          # "lyric":"string" # Not a trainable feature
                          "islyric":"category",
                          "num_line_lyric":"int64",
                          "no_artist":"int64",
                          "no_composer":"int64",
                          #"datetime":"datetime64", # Not a trainable feature
                          "day":"category",
                          "month":"category",
                          "year":"category",
                          "hour":"category",
                          "day":"category",
                          "dayofyear":"category",
                          "weekday":"category",
                          "isHoliday":"category",
                          "len_of_songname":"int64",
                          "isRemix":"category",
                          "isOST":"category",
                          "isBeat":"category",
                          "isVersion":"category",
                          "isCover":"category",
                          "isLienKhuc":"category",
                          "day_release":"int64", # the specific day of the day across all days (> 365)  
                          #Features those that require "global" knowledge beyond that example
                          "album_right":"category", # a different representaion of "album" based on release time - can be combined with it, and using 
                          "numsongInAlbum":"category",
                          "isSingleAlbum_onesong":"category",
                          "num_song_released_that_week":'int64',
                          "num_song_release_in_final_month":"int64",  
                          "freq_artist":"int64",  # number of times the unique artist string is present in dataset
                          "freq_artist_min":"int64", # number of times the first listed artist is present in dataset
                          "num_album_per_min_artist":"int64",
                          "num_album_per_min_composer":"int64",
}
all_features_in_order_list = list(all_features_in_order.keys())
for feat_name, feat_type in all_features_in_order.items():
    df[feat_name] = df[feat_name].astype(feat_type)

KeyError: 'artist_id_min'

In [2]:
import seaborn as sns
# Fill nan album
print("There is {} ratio is nan album".format(len(df[df["album"].isnull()])/len(df)))
df["album"]  = df["album"].fillna("")
df["len_album_name"] = df["album"].apply(lambda x: len(x.split(" ")))
df["isRemixAlbum"] = [ 1 if "Remix" in t else 0 for t in df["album"]]
df["isOSTAlbum"] = [ 1 if "OST" in t else 0 for t in df["album"]]
df["isSingleAlbum"] = [ 1 if "Single" in t else 0 for t in df["album"]]
df["isBeatAlbum"] = [ 1 if "Beat" in t else 0 for t in df["album"]]
df["isTopHitAlbum"] = [ 1 if "Top Hits" in t else 0 for t in df["album"]]
df["isCoverAlbum"] = [ 1 if "Cover" in t else 0 for t in df["album"]]
df["isEPAlbum"] = [ 1 if "EP" in t else 0 for t in df["album"]]
df["isLienKhucAlbum"] = [ 1 if "Liên Khúc" in t else 0 for t in df["album"]]

df["album_name_is_title_name"]= [1 if r.title in r.album  else 0 for i,r in df.iterrows() ]
df["album"] = df["album"].astype('category')
df["album"] =  df["album"].cat.codes

df["artist_name_cat"] = df["artist_name"].astype('category')
df["artist_name_cat"] =  df["artist_name_cat"].cat.codes
df["composers_name_cat"] = df["composers_name"].astype('category')
df["composers_name_cat"] =  df["composers_name_cat"].cat.codes
df["copyright_cat"] = df["copyright"].astype('category')
df["copyright_cat"] =  df["copyright_cat"].cat.codes

import re
def get_min_artist_id(s):
    ps = re.split(',|\.',s)
    ps = [int(p) for p in ps]
    return np.min(ps)

def get_max_artist_id(s):
    ps = re.split(',|\.',s)
    ps = [int(p) for p in ps]
    return np.max(ps)

df["artist_id_min"]=  df["artist_id"].apply(lambda x: get_min_artist_id(x))
df["artist_id_min_cat"] = df["artist_id_min"].astype('category')
df["artist_id_min_cat"] =  df["artist_id_min_cat"].cat.codes

df["composers_id_min"]=  df["composers_id"].apply(lambda x: get_min_artist_id(x))
df["composers_id_min_cat"] = df["composers_id_min"].astype('category')
df["composers_id_min_cat"] =  df["composers_id_min_cat"].cat.codes

df["artist_id_max"]=  df["artist_id"].apply(lambda x: get_max_artist_id(x))
df["artist_id_max_cat"] = df["artist_id_max"].astype('category')
df["artist_id_max_cat"] =  df["artist_id_max_cat"].cat.codes

df["composers_id_max"]=  df["composers_id"].apply(lambda x: get_max_artist_id(x))
df["composers_id_max_cat"] = df["composers_id_max"].astype('category')
df["composers_id_max_cat"] =  df["composers_id_max_cat"].cat.codes

#New feature
# df["group_album_artist_id_min_cat"] = df.groupby(["album","artist_id_min_cat"]).ngroup()
# df["group_album_artist_id_min_cat"] = df["group_album_artist_id_min_cat"].astype("category").cat.codes
# df["group_album_artist_id_max_cat"] = df.groupby(["album","artist_id_max_cat"]).ngroup()
# df["group_album_artist_id_max_cat"] = df["group_album_artist_id_max_cat"].astype("category").cat.codes


# Fill genre
print("There is {} ratio is nan genre".format(len(df[df["genre"].isnull()])/len(df)))
df["genre"]  = df["genre"].fillna("No genre")
df["genre"] = df["genre"].astype('category')
df["genre"] =  df["genre"].cat.codes

# Fill album_artist
print("There is {} ratio is nan album_artist".format(len(df[df["album_artist"].isnull()])/len(df)))
df["album_artist"]  = df["album_artist"].fillna("No album_artist")
df["album_artist_contain_artistname"]= [1 if r.album_artist in r.artist_name  else 0 for i,r in df.iterrows() ]
df["album_artist"] = df["album_artist"].astype('category')
df["album_artist"] =  df["album_artist"].cat.codes

# Fill track
print("There is {} ratio is nan track".format(len(df[df["track"].isnull()])/len(df)))
df["track"]  = df["track"].fillna("(1, 1)")
df["istrack11"] = df["track"] == "(1, 1)"
def tracknum_to_value(track_num):
    try:
        
        track_num = make_tuple(track_num)
        if track_num[0] is not None:
            return float(track_num[0]) / float(track_num[1])
        else:
            return 1.0
    except:
        return 1.0

df["track"] = df["track"].apply(lambda t: tracknum_to_value(t))


# Fill lyric
print("There is {} ratio is nan lyric".format(len(df[df["lyric"].isnull()])/len(df)))
df["lyric"]  = df["lyric"].fillna("")
df["islyric"] = df["lyric"].apply(lambda x:  True if len(x)  else False)
df["num_line_lyric"] = df["lyric"].apply(lambda x : len(x.split("\r")))


#--------------------------------------------------------
from dateutil import relativedelta
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from ast import literal_eval as make_tuple
df['no_artist'] = df.artist_name.apply(lambda x: len(x.split(",")))
df['no_composer'] = df.composers_name.apply(lambda x: len(x.split(",")))
df["freq_artist"] = df.groupby('artist_id')['artist_id'].transform('count').astype('float')
df["freq_composer"] = df.groupby('composers_id')['composers_id'].transform('count').astype('float')
df["freq_artist_min"] = df.groupby('artist_id_min_cat')['artist_id_min_cat'].transform('count').astype('float')
df["freq_composer_min"] = df.groupby('composers_id_min_cat')['composers_id_min_cat'].transform('count').astype('float')

df["num_album_per_min_artist"] = df.groupby(['artist_id_min_cat','album'])['album'].transform('count').astype('float')
df["num_album_per_min_composer"] = df.groupby(['composers_id_min','album'])['album'].transform('count').astype('float')


df["datetime"] = pd.to_datetime(df.release_time)
df["year"] = df["datetime"].dt.year
df["month"] = df["datetime"].dt.month
df["hour"] = df["datetime"].dt.hour
df["day"] = df["datetime"].dt.day
df["dayofyear"] = df["datetime"].dt.dayofyear
df["weekday"] = df["datetime"].dt.weekday
from datetime import date 
import holidays 

in_holidays = holidays.HolidayBase() 
for i in range(26,32):
    in_holidays.append(str(i)+'-01-2017')
in_holidays.append('01-02-2017')
for i in range(14,21):
    in_holidays.append(str(i)+'-02-2018')
in_holidays.append('30-04-2017')
in_holidays.append('30-04-2018')
in_holidays.append('01-01-2017')
in_holidays.append('01-01-2018')
in_holidays.append('14-02-2017')
in_holidays.append('14-02-2018')
in_holidays.append('08-03-2017')
in_holidays.append('08-03-2018')
in_holidays.append('01-05-2017')
in_holidays.append('01-05-2018')
in_holidays.append('06-04-2017')
in_holidays.append('25-04-2018')
in_holidays.append('01-06-2017')
in_holidays.append('01-06-2018')
in_holidays.append('04-10-2017')
in_holidays.append('24-09-2018')
in_holidays.append('20-10-2017')
in_holidays.append('20-10-2018')
in_holidays.append('20-11-2017')
in_holidays.append('20-11-2018')
in_holidays.append('24-12-2017')
in_holidays.append('24-12-2018')
df['isHoliday'] = df.release_time.apply(lambda x: x in in_holidays)



df["len_of_songname"] = df["title"].apply(lambda x: len(x.split(" ")))
df["isRemix"] = [ 1 if "Remix" in t else 0 for t in df["title"]]
df["isOST"] = [ 1 if "OST" in t else 0 for t in df["title"]]
df["isBeat"] = [ 1 if "Beat" in t else 0 for t in df["title"]]
df["isVersion"] = [ 1 if "Version" in t else 0 for t in df["title"]]
df["isCover"] = [ 1 if "Cover" in t else 0 for t in df["title"]]
df["isLienKhuc"] = [ 1 if "Liên Khúc" in t else 0 for t in df["title"]]



def find_num_song_release_in_final_month(df, day):
    month5th = day + relativedelta.relativedelta(months=5)
    month6th = day + relativedelta.relativedelta(months=6)  
    return len(df.datetime[(df.datetime >= month5th)&(df.datetime<=month6th)])



df["num_song_release_in_final_month"] = df.datetime.apply(lambda d:find_num_song_release_in_final_month(df ,d))

#It seems like all songs on albums release at the same time, so groupby by release_time will create album 
df["album_right"] = df.groupby(df.release_time).ngroup().astype("category").cat.codes
df["day_release"] = df.groupby(["year","dayofyear"]).ngroup().astype("category").cat.codes
df["numsongInAlbum"] = df.groupby("album_right")["album_right"].transform("count")
df["isSingleAlbum_onesong"]= df["isSingleAlbum"] & (df["numsongInAlbum"]==1)

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler


from sklearn.model_selection import train_test_split

chosen_features = ["album_right", "istrack11", "no_artist", "no_composer","freq_artist", "freq_composer","year", "month","hour", "day", "len_of_songname", 
                   "isRemix", "isOST", "isBeat", "isVersion", "isCover",  "num_song_release_in_final_month",
                  "length", "genre", "track","album_artist", "islyric", "album_artist_contain_artistname",
                  "len_album_name", "isRemixAlbum", "isOSTAlbum", "isSingleAlbum", "album_name_is_title_name",
                  "isBeatAlbum", "isCoverAlbum", "artist_name_cat","composers_name_cat","copyright_cat" ,
                  "artist_id_min_cat", "composers_id_min_cat",  "artist_id_max_cat", "composers_id_max_cat", 
                   "freq_artist_min", "freq_composer_min","dayofyear","weekday","isHoliday",
                  "num_album_per_min_artist", "num_album_per_min_composer", 
                   "numsongInAlbum","isSingleAlbum_onesong" ]

df_train = df[df.dataset=="train"]
df_test = df[df.dataset=="test"]

param = {
    'bagging_freq': 20,          
    'bagging_fraction': 0.95,   'boost_from_average':'false',   
    'boost': 'gbdt',             'feature_fraction': 0.1,     'learning_rate': 0.001,
    'max_depth': -1,             'metric':'root_mean_squared_error', 'min_data_in_leaf': 5,   
       'num_leaves': 50,            
    'num_threads': 4,              'tree_learner': 'serial',   'objective': 'regression',
    'reg_alpha': 0.1002650970728192, 'reg_lambda': 0.1003427518866501,'verbosity': 1,
    "seed": 99999
}

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=99999)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
labels= df_train.label
# fig, axes = plt.subplots(5, 1, figsize=(10, 10*5))
# axes = axes.flat
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, df_train.label.values)):
    print("Fold {}".format(fold_))

    trn_data = lgb.Dataset(df_train.iloc[trn_idx][chosen_features], label=labels.iloc[trn_idx])
    val_data = lgb.Dataset(df_train.iloc[val_idx][chosen_features], label=labels.iloc[val_idx])
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 20000)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][chosen_features], num_iteration=clf.best_iteration)
    predictions += clf.predict(df_test[chosen_features], num_iteration=clf.best_iteration) / folds.n_splits



There is 0.24038838760298156 ratio is nan album
There is 0.0017653981953707335 ratio is nan genre
There is 0.24038838760298156 ratio is nan album_artist
There is 0.0007846214201647705 ratio is nan track
There is 0.6722244017261672 ratio is nan lyric
Fold 0
Training until validation scores don't improve for 20000 rounds
[5000]	training's rmse: 1.74708	valid_1's rmse: 1.95719
[10000]	training's rmse: 1.43107	valid_1's rmse: 1.79029
[15000]	training's rmse: 1.24494	valid_1's rmse: 1.71663
[20000]	training's rmse: 1.11329	valid_1's rmse: 1.67337
[25000]	training's rmse: 1.01312	valid_1's rmse: 1.64622
[30000]	training's rmse: 0.932046	valid_1's rmse: 1.62791
[35000]	training's rmse: 0.865129	valid_1's rmse: 1.6149
[40000]	training's rmse: 0.808398	valid_1's rmse: 1.60554
[45000]	training's rmse: 0.759385	valid_1's rmse: 1.5982
[50000]	training's rmse: 0.715781	valid_1's rmse: 1.5933
[55000]	training's rmse: 0.677387	valid_1's rmse: 1.58914
[60000]	training's rmse: 0.642672	valid_1's rmse: 

[135000]	training's rmse: 0.352768	valid_1's rmse: 1.59449
Early stopping, best iteration is:
[117539]	training's rmse: 0.395041	valid_1's rmse: 1.59427
Fold 5
Training until validation scores don't improve for 20000 rounds
[5000]	training's rmse: 1.74416	valid_1's rmse: 1.97861
[10000]	training's rmse: 1.42737	valid_1's rmse: 1.808
[15000]	training's rmse: 1.24048	valid_1's rmse: 1.73443
[20000]	training's rmse: 1.10854	valid_1's rmse: 1.69264
[25000]	training's rmse: 1.00847	valid_1's rmse: 1.66628
[30000]	training's rmse: 0.927733	valid_1's rmse: 1.64903
[35000]	training's rmse: 0.860953	valid_1's rmse: 1.63626
[40000]	training's rmse: 0.804731	valid_1's rmse: 1.62695
[45000]	training's rmse: 0.755754	valid_1's rmse: 1.61981
[50000]	training's rmse: 0.711956	valid_1's rmse: 1.615
[55000]	training's rmse: 0.673466	valid_1's rmse: 1.61074
[60000]	training's rmse: 0.638754	valid_1's rmse: 1.60766
[65000]	training's rmse: 0.607396	valid_1's rmse: 1.60541
[70000]	training's rmse: 0.57903