In [12]:
import pandas as pd
import numpy as np
import random
import re
np.random.seed(1)
random.seed(1)

from base_tokenizer import BaseTokenizer
from utils import load_n_grams
from dict_models import LongMatchingTokenizer

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.svm import SVR

In [18]:
# Locations of the input files; everything lives under one data directory.
DATA_DIR = "../data"

TRAININFO = DATA_DIR + "/train_info.tsv"                  # training-song metadata (TSV)
TRAINRANK = DATA_DIR + "/train_rank.csv"                  # rank labels for the training songs
TESTINFO = DATA_DIR + "/test_info.tsv"                    # test-song metadata (TSV)
Track_info = DATA_DIR + "/all_track_info.csv"             # per-track information
Audio_info = DATA_DIR + "/all_track_audio_features.csv"   # per-track audio features

In [56]:
# Training rows: song metadata joined with its rank label, tagged "train".
df_i = pd.read_csv(TRAININFO, sep='\t', encoding='utf-8')
df_r = pd.read_csv(TRAINRANK)
df_i_train = df_i.merge(df_r, on='ID').assign(dataset="train")

# Test rows carry no label yet; fill it with NaN so both frames share columns.
df_i_test = (
    pd.read_csv(TESTINFO, sep='\t', encoding='utf-8')
    .assign(label=np.nan, dataset="test")
)

# Side tables keyed by the same song ID.
df_track_info = pd.read_csv(Track_info)
df_audio_features = pd.read_csv(Audio_info)

# Stack train and test, inner-join the track info, left-join the audio
# features, then keep only the columns used by the rest of the notebook.
kept_columns = ['ID', 'title', 'artist_name', 'lyric', 'label', 'dataset']
df = (
    pd.concat([df_i_train, df_i_test])
    .merge(df_track_info, on='ID')
    .merge(df_audio_features, on="ID", how="left")
)
df = df[kept_columns]
df.head()

Unnamed: 0,ID,title,artist_name,lyric,label,dataset
0,1073748245,Đêm Chôn Dầu Vượt Biển,Như Quỳnh,Đêm nay anh gánh dầu ra biển anh chôn \r\nAnh ...,7.0,train
1,1073751978,Mùa Thu Trong Mưa,Minh Tuyết,Chiều mưa không có em\r\nbờ đá công viên âm th...,3.0,train
2,1073835561,Rồi Ánh Trăng Tan,Lưu Bích,Rồi ánh trăng cũng đang tan dần\r\nRồi ước mơ ...,6.0,train
3,1073856553,Còn Thương Rau Đắng Mọc Sau Hè,Như Quỳnh,Nắng hạ đi Mây trôi lang thang cho hạ buồn Coi...,2.0,train
4,1073929630,Người Điên Biết Yêu,Như Loan,"Ai trong tình yêu, ai không mơ mộng?\n\nNgu ng...",7.0,train


In [20]:
# Remove, in place, every row of `df` whose `lyric` value is missing, so the
# string operations applied to the lyrics below never see NaN.
df.dropna(subset=['lyric'], inplace=True)

def cleanString(string):
    """Replace every run of non-word characters in `string` with one space.

    "Word characters" are those matched by the regex class ``\\w`` (letters,
    digits and the underscore), so punctuation and whitespace runs, including
    line breaks, collapse to a single space while underscores are kept.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text. Leading/trailing runs also become a single space;
        they are not stripped.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    return re.sub(r'\W+', ' ', string)
# Normalise the lyric text: collapse non-word runs to spaces, then lower-case.
df['lyric'] = df['lyric'].map(cleanString).str.lower()

# Split by origin. Explicit copies make the two frames independent of `df`;
# de-duplicating a bare slice in place is what raised pandas'
# SettingWithCopyWarning.
df_train = df[df.dataset == 'train'].copy()
df_test = df[df.dataset == 'test'].copy()

# Keep only the first training row for each distinct (cleaned) lyric and show
# the shape before and after.
print(df_train.shape)
df_train = df_train.drop_duplicates(subset='lyric', keep='first')
print(df_train.shape)

df_train.head()

(2981, 6)
(2488, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,ID,title,artist_name,lyric,label,dataset
0,1073748245,Đêm Chôn Dầu Vượt Biển,Như Quỳnh,đêm nay anh gánh dầu ra biển anh chôn anh chôn...,7.0,train
1,1073751978,Mùa Thu Trong Mưa,Minh Tuyết,chiều mưa không có em bờ đá công viên âm thầm ...,3.0,train
2,1073835561,Rồi Ánh Trăng Tan,Lưu Bích,rồi ánh trăng cũng đang tan dần rồi ước mơ cũn...,6.0,train
3,1073856553,Còn Thương Rau Đắng Mọc Sau Hè,Như Quỳnh,nắng hạ đi mây trôi lang thang cho hạ buồn coi...,2.0,train
4,1073929630,Người Điên Biết Yêu,Như Loan,ai trong tình yêu ai không mơ mộng ngu ngơ từn...,7.0,train


## TF_IDF vectorizer

In [None]:
# L2-normalised TF-IDF features of the training lyrics, tokenised with the
# project's LongMatchingTokenizer.
vectorizer = TfidfVectorizer(tokenizer=LongMatchingTokenizer().tokenize, norm='l2')
vectors = vectorizer.fit_transform(df_train.lyric)

# Densify to a plain ndarray. `todense()` would return an `np.matrix`, which
# recent scikit-learn estimators refuse as input.
dense = vectors.toarray()

# 10-fold stratified cross-validation. No `random_state` is passed: without
# `shuffle=True` it has no effect on the folds, and recent scikit-learn
# releases reject that combination with a ValueError. Pass `shuffle=True`
# together with a seed if shuffled folds are wanted.
skf = StratifiedKFold(n_splits=10)

# Empty search space: the searches below evaluate only the estimator as
# configured.
parameters = {}


def rmse(targets, predictions):
    """Return the root-mean-squared error between `targets` and `predictions`."""
    return np.sqrt(mean_squared_error(targets, predictions))


# greater_is_better=False makes the scorer report the negated RMSE, so the
# scores printed by the searches are negative.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

## GaussianNB

In [39]:
# Cross-validate a Gaussian naive Bayes model on the dense features.
# `parameters` is the search space defined earlier in the notebook; scores are
# produced by `rmse_scorer`, and train-fold scores are recorded alongside the
# validation ones.
search_options = dict(
    scoring=rmse_scorer,
    cv=skf,
    n_jobs=-1,
    return_train_score=True,
    error_score='raise',
    n_iter=100,
    verbose=10,
)

nb = GaussianNB()
gs_clf = RandomizedSearchCV(nb, parameters, **search_options)
gs_clf.fit(dense, df_train.label)

print(gs_clf.cv_results_)
print(gs_clf.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    5.3s remaining:   12.4s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    6.8s remaining:    6.8s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    7.4s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    8.8s finished


{'mean_fit_time': array([0.66140532]), 'std_fit_time': array([0.12973313]), 'mean_score_time': array([0.20834618]), 'std_score_time': array([0.03620178]), 'params': [{}], 'split0_test_score': array([-3.8422449]), 'split1_test_score': array([-3.72618601]), 'split2_test_score': array([-3.96938282]), 'split3_test_score': array([-4.21805642]), 'split4_test_score': array([-4.07087214]), 'split5_test_score': array([-3.88282181]), 'split6_test_score': array([-3.9904119]), 'split7_test_score': array([-3.79110401]), 'split8_test_score': array([-4.17573164]), 'split9_test_score': array([-4.02644321]), 'mean_test_score': array([-3.96877765]), 'std_test_score': array([0.15271417]), 'rank_test_score': array([1], dtype=int32), 'split0_train_score': array([-1.46135912]), 'split1_train_score': array([-1.45226535]), 'split2_train_score': array([-1.40358917]), 'split3_train_score': array([-1.38484069]), 'split4_train_score': array([-1.42648258]), 'split5_train_score': array([-1.31996048]), 'split6_train

error valid set:-3.968
std valid set: 0.152
error train set:-1.41
std train set: 0.038

## SVR 

In [42]:
# Cross-validate a support-vector regressor (C=4, gamma='scale') on the dense
# features, using the shared folds and scorer.
svr = SVR(gamma='scale', C=4)

gs_clf = RandomizedSearchCV(
    estimator=svr,
    param_distributions=parameters,
    n_iter=100,
    scoring=rmse_scorer,
    n_jobs=-1,
    cv=skf,
    verbose=10,
    error_score='raise',
    return_train_score=True,
)
gs_clf.fit(dense, df_train.label)

# Full per-fold results first, then the best mean validation score.
print(gs_clf.cv_results_)
print(gs_clf.best_score_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  2.7min remaining:  6.4min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  2.9min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  2.9min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.9min finished


{'mean_fit_time': array([80.74698358]), 'std_fit_time': array([5.69428658]), 'mean_score_time': array([8.99076247]), 'std_score_time': array([0.63681479]), 'params': [{}], 'split0_test_score': array([-2.90156741]), 'split1_test_score': array([-2.91793282]), 'split2_test_score': array([-2.89497495]), 'split3_test_score': array([-2.9731734]), 'split4_test_score': array([-2.91293237]), 'split5_test_score': array([-2.87760706]), 'split6_test_score': array([-2.93436199]), 'split7_test_score': array([-2.82274332]), 'split8_test_score': array([-2.9686052]), 'split9_test_score': array([-2.81125501]), 'mean_test_score': array([-2.90164677]), 'std_test_score': array([0.05086039]), 'rank_test_score': array([1], dtype=int32), 'split0_train_score': array([-2.84178944]), 'split1_train_score': array([-2.83683902]), 'split2_train_score': array([-2.83942866]), 'split3_train_score': array([-2.82750552]), 'split4_train_score': array([-2.84059723]), 'split5_train_score': array([-2.84462504]), 'split6_trai

error valid set:-2.901
std valid set: 0.05
error train set:-2.84
std train set: 0.008

# Personalize with artist_name and keep the stopwords

In [57]:
# Remove, in place, every row of `df` whose `lyric` value is missing before
# the lyrics are processed as strings below.
df.dropna(subset=['lyric'], inplace=True)

def cleanString(string):
    """Replace every run of non-word characters in `string` with one space.

    Non-word characters are those matched by the regex class ``\\W``
    (anything other than letters, digits and the underscore).

    Args:
        string: The text to clean.

    Returns:
        The cleaned text; leading/trailing runs become a single space rather
        than being stripped.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    return re.sub(r'\W+', ' ', string)
# Normalise the lyric column in two passes: clean each lyric with
# cleanString, then lower-case the result.
cleaned_lyrics = df['lyric'].apply(cleanString)
df['lyric'] = cleaned_lyrics.apply(str.lower)

def artist_func(string):
    """Turn an artist name into a tag: drop every space and append "_".

    Only the space character is removed; other whitespace is kept as is.
    """
    without_spaces = string.replace(' ', '')
    return without_spaces + "_"
# Replace each artist name with its tag (spaces removed, "_" appended) and
# prepend that tag to the lyric, giving documents of the form "<tag><lyric>".
# NOTE: this overwrites both columns of `df`, so running the cell a second
# time would prefix the lyrics again.
df['artist_name'] = df['artist_name'].map(artist_func)
df['lyric'] = df['artist_name'] + df['lyric']

# Split by origin. Explicit copies make the two frames independent of `df`;
# de-duplicating a bare slice in place is what raised pandas'
# SettingWithCopyWarning.
df_train = df[df.dataset == 'train'].copy()
df_test = df[df.dataset == 'test'].copy()

# Keep only the first training row for each distinct artist-prefixed lyric and
# show the shape before and after.
print(df_train.shape)
df_train = df_train.drop_duplicates(subset='lyric', keep='first')
print(df_train.shape)

df_train.head()

(2981, 6)
(2766, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,ID,title,artist_name,lyric,label,dataset
0,1073748245,Đêm Chôn Dầu Vượt Biển,NhưQuỳnh_,NhưQuỳnh_đêm nay anh gánh dầu ra biển anh chôn...,7.0,train
1,1073751978,Mùa Thu Trong Mưa,MinhTuyết_,MinhTuyết_chiều mưa không có em bờ đá công viê...,3.0,train
2,1073835561,Rồi Ánh Trăng Tan,LưuBích_,LưuBích_rồi ánh trăng cũng đang tan dần rồi ướ...,6.0,train
3,1073856553,Còn Thương Rau Đắng Mọc Sau Hè,NhưQuỳnh_,NhưQuỳnh_nắng hạ đi mây trôi lang thang cho hạ...,2.0,train
4,1073929630,Người Điên Biết Yêu,NhưLoan_,NhưLoan_ai trong tình yêu ai không mơ mộng ngu...,7.0,train


## Hashing vectorizer (used because TF-IDF could not be run with SVR; it failed with "The task could not be sent to the workers as it is too large for `send_bytes`")

In [86]:
def new_tokenize(string):
    """Tokenise an artist-prefixed lyric and tag every token with the artist.

    The text before the first "_" is taken as the artist tag and everything
    after it as the lyric. The lyric is tokenised with LongMatchingTokenizer
    and each token is returned as "<artist>_<token>".

    Args:
        string: A document of the form "<artist>_<lyric>".

    Returns:
        A list of artist-tagged tokens. If `string` contains no "_", the whole
        string is treated as the artist tag and the lyric is empty.
    """
    # Split on the FIRST underscore only. Indexing `split("_")[1]` would cut
    # the lyric off at the next underscore that appears inside the lyric text.
    artist, _, lyric = string.partition("_")
    tokens = LongMatchingTokenizer().tokenize(lyric)
    return [artist + "_" + token for token in tokens]

# Hash the artist-tagged tokens into 2**13 L2-normalised feature columns.
vectorizer = HashingVectorizer(tokenizer=new_tokenize, norm='l2', n_features=2**13)
vectors = vectorizer.fit_transform(df_train.lyric)

# Densify to a plain ndarray. `todense()` would return an `np.matrix`, which
# recent scikit-learn estimators refuse as input.
dense = vectors.toarray()

## SVR

In [87]:
# Cross-validate a support-vector regressor (C=9, gamma='scale') on the
# hashed, artist-tagged features.
svr = SVR(gamma='scale', C=9)

# 10-fold stratified cross-validation. No `random_state` is passed: without
# `shuffle=True` it has no effect on the folds, and recent scikit-learn
# releases reject that combination with a ValueError.
skf = StratifiedKFold(n_splits=10)

svr_clf = RandomizedSearchCV(
    svr,
    parameters,
    scoring=rmse_scorer,
    cv=skf,
    n_jobs=-1,
    return_train_score=True,
    error_score='raise',
    n_iter=100,
    verbose=10,
)

svr_clf.fit(dense, df_train.label)
print(svr_clf.cv_results_)
print(svr_clf.best_score_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  2.5min remaining:  5.7min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  2.6min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  2.6min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.7min finished


{'mean_fit_time': array([72.9785805]), 'std_fit_time': array([5.22883305]), 'mean_score_time': array([8.11016085]), 'std_score_time': array([0.56398312]), 'params': [{}], 'split0_test_score': array([-2.97688916]), 'split1_test_score': array([-2.99255808]), 'split2_test_score': array([-2.8919988]), 'split3_test_score': array([-3.00028577]), 'split4_test_score': array([-2.88902847]), 'split5_test_score': array([-2.71726812]), 'split6_test_score': array([-2.82788564]), 'split7_test_score': array([-2.9044934]), 'split8_test_score': array([-2.95196821]), 'split9_test_score': array([-2.91689452]), 'mean_test_score': array([-2.90718464]), 'std_test_score': array([0.0811742]), 'rank_test_score': array([1], dtype=int32), 'split0_train_score': array([-2.64261658]), 'split1_train_score': array([-2.64103785]), 'split2_train_score': array([-2.67055401]), 'split3_train_score': array([-2.64312366]), 'split4_train_score': array([-2.65842935]), 'split5_train_score': array([-2.68891836]), 'split6_train_

error -2.907

# Personalize with artist_name and remove the stopwords

In [109]:
# Load the stop-word list, one entry per line. The encoding is given
# explicitly so the Vietnamese text is decoded the same way on every platform
# instead of depending on the locale default.
with open("../vietnamese-stopwords.txt", 'r', encoding='utf-8') as f:
    filecontent = f.readlines()

# De-duplicate the stripped entries and join multi-word stop words with "_".
stopwords = ["_".join(word.split(" ")) for word in {line.strip() for line in filecontent}]

# Set copy of `stopwords` for constant-time membership tests in the tokenizer.
stopword_set = frozenset(stopwords)


def new_tokenize2(string):
    """Tokenise an artist-prefixed lyric, drop stop words, tag with the artist.

    The text before the first "_" is taken as the artist tag and everything
    after it as the lyric. The lyric is tokenised with LongMatchingTokenizer,
    tokens found in the stop-word list are discarded, and each remaining token
    is returned as "<artist>_<token>".

    Args:
        string: A document of the form "<artist>_<lyric>".

    Returns:
        A list of artist-tagged, non-stop-word tokens. If `string` contains no
        "_", the whole string is treated as the artist tag and the lyric is
        empty.
    """
    # Split on the FIRST underscore only. Indexing `split("_")[1]` would cut
    # the lyric off at the next underscore that appears inside the lyric text.
    artist, _, lyric = string.partition("_")
    tokens = LongMatchingTokenizer().tokenize(lyric)
    return [artist + "_" + token for token in tokens if token not in stopword_set]

# Hash the artist-tagged, stop-word-filtered tokens into 2**13 L2-normalised
# feature columns.
vectorizer = HashingVectorizer(tokenizer=new_tokenize2, norm='l2', n_features=2**13)
vectors = vectorizer.fit_transform(df_train.lyric)

# Densify to a plain ndarray. `todense()` would return an `np.matrix`, which
# recent scikit-learn estimators refuse as input.
dense = vectors.toarray()

## SVR

In [110]:
# Cross-validate a support-vector regressor (C=9, gamma='scale') on the
# hashed features built without stop words.
svr = SVR(gamma='scale', C=9)

# 10-fold stratified cross-validation. No `random_state` is passed: without
# `shuffle=True` it has no effect on the folds, and recent scikit-learn
# releases reject that combination with a ValueError.
skf = StratifiedKFold(n_splits=10)

svr_clf = RandomizedSearchCV(
    svr,
    parameters,
    scoring=rmse_scorer,
    cv=skf,
    n_jobs=-1,
    return_train_score=True,
    error_score='raise',
    n_iter=100,
    verbose=10,
)

svr_clf.fit(dense, df_train.label)
print(svr_clf.cv_results_)
print(svr_clf.best_score_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  2.5min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  2.6min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  2.6min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.7min finished


{'mean_fit_time': array([72.68586118]), 'std_fit_time': array([5.04923071]), 'mean_score_time': array([8.05854468]), 'std_score_time': array([0.55405814]), 'params': [{}], 'split0_test_score': array([-2.99613118]), 'split1_test_score': array([-3.00596384]), 'split2_test_score': array([-2.94200921]), 'split3_test_score': array([-2.9774999]), 'split4_test_score': array([-2.92702805]), 'split5_test_score': array([-2.78642743]), 'split6_test_score': array([-2.88110816]), 'split7_test_score': array([-2.9376594]), 'split8_test_score': array([-2.96576237]), 'split9_test_score': array([-2.95496778]), 'mean_test_score': array([-2.93766034]), 'std_test_score': array([0.06082552]), 'rank_test_score': array([1], dtype=int32), 'split0_train_score': array([-2.69793226]), 'split1_train_score': array([-2.69682635]), 'split2_train_score': array([-2.71381583]), 'split3_train_score': array([-2.7033087]), 'split4_train_score': array([-2.71031486]), 'split5_train_score': array([-2.73997943]), 'split6_train