# Lyric Models
----

In [1]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import pickle

In [2]:
import re

def clear_text(text):
    #print(text)
    t = re.sub(r"[^a-zA-Z']", " ", text)
    clean = " ".join(t.split())
    return clean

In [3]:
# load data
lyrics_data = pd.read_csv('data/lyrics.csv')
lyrics = lyrics_data[~lyrics_data.text.isin(["Music", "Music Music", "nan", "music", 'np.nan'])]
print(len(lyrics))
lyrics['clean_text'] = [clear_text(x) for x in lyrics['text'].astype(str).str.lower()]
lyrics['clean_text'] = lyrics['clean_text'].str.replace("'", "")
lyrics.head()

356


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['clean_text'] = [clear_text(x) for x in lyrics['text'].astype(str).str.lower()]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['clean_text'] = lyrics['clean_text'].str.replace("'", "")


Unnamed: 0.1,Unnamed: 0,song_id,lyrics,text,clean_text
0,0,2,DecodingResult(audio_features=tensor([[-0.6818...,Can't find their friends to make just mine Pro...,cant find their friends to make just mine prob...
3,3,5,DecodingResult(audio_features=tensor([[-0.5512...,"I ought to leave my main, I ought to leave my ...",i ought to leave my main i ought to leave my m...
8,8,13,DecodingResult(audio_features=tensor([[-0.6676...,Now what do I do? He is dressed like a sifter ...,now what do i do he is dressed like a sifter j...
10,10,18,DecodingResult(audio_features=tensor([[-0.7462...,"I love you, Satan, all the love you take And s...",i love you satan all the love you take and sle...
11,11,19,DecodingResult(audio_features=tensor([[-0.3821...,"I'm thirsty land, wait, we'll never be proud W...",im thirsty land wait well never be proud well ...


In [4]:
annot = pd.read_csv('data/annotations/static_annotations.csv')
annot.head()

Unnamed: 0,song_id,mean_arousal,std_arousal,mean_valence,std_valence
0,2,3.1,0.99443,3.0,0.66667
1,3,3.5,1.8409,3.3,1.7029
2,4,5.7,1.4944,5.5,1.7159
3,5,4.4,2.1187,5.3,1.9465
4,7,5.8,1.5492,6.4,1.7764


In [5]:
# split train test
split_idx = int(len(lyrics) * 0.8)
train_set = lyrics.iloc[:split_idx]
train_text = train_set['clean_text']
test_set = lyrics.iloc[split_idx:]
test_text = test_set['text']

train_val = annot[annot['song_id'].isin(train_set.song_id)]['mean_valence']
test_val = annot[annot['song_id'].isin(test_set.song_id)]['mean_valence']
test_text = test_text.astype(str)

for x in [train_text, train_val, test_text, test_val]:
    print(len(x))

284
284
72
72


In [7]:
# process nltk
stop_words = stopwords.words('english')
vec = TfidfVectorizer(stop_words=stop_words)
    
def tfidf_preprocessing(train_feature_text, test_feature_text):
    train_tfidf = vec.fit_transform(train_feature_text)
    test_tfidf = vec.transform(test_feature_text)
    
    return train_tfidf, test_tfidf

train_tfidf, test_tfidf = tfidf_preprocessing(train_text, test_text)

print(train_tfidf.shape)
print(test_tfidf.shape)

alltext = np.concatenate((train_tfidf.toarray(), test_tfidf.toarray()), axis=0)
print(len(alltext))
allval = pd.concat([train_val, test_val])
print(len(allval))

(284, 1062)
(72, 1062)
356
356


In [8]:
print('--LinReg--')
lr = LinearRegression().fit(train_tfidf, train_val)
preds = lr.predict(test_tfidf)
rmse = mean_squared_error(test_val, preds)**0.5
print(f"RMSE = {rmse}")
print('')

print('--RandomForest--')
lr = RandomForestRegressor().fit(train_tfidf, train_val)
preds = lr.predict(test_tfidf)
rmse = mean_squared_error(test_val, preds)**0.5
print(f"RMSE = {rmse}")
print('')

print('--XGB--')
lr = xgb.XGBRegressor().fit(train_tfidf, train_val)
preds = lr.predict(test_tfidf)
rmse = mean_squared_error(test_val, preds)**0.5
print(f"RMSE = {rmse}")
print('')

--LinReg--
RMSE = 1.780974179115716

--RandomForest--
RMSE = 1.152635067687446

--XGB--
RMSE = 1.201261394727843



In [9]:
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [8, 10], 
    'random_state': [31],
    #'bootstrap': [True],
    #'max_samples': [0.9]
} 

rf_clf = GridSearchCV(RandomForestRegressor(), rf_params, 
                      n_jobs=-1, 
                      cv=5, 
                      refit=False, 
                      verbose=3, 
                      scoring='neg_root_mean_squared_error')
rf_clf.fit(train_tfidf, np.ravel(train_val))

print(f"RMSE = {rf_clf.best_score_} --> {rf_clf.best_params_}")

# Untuned
# RMSE = -1.3162567681740127 --> {'max_depth': None, 'n_estimators': 100, 'random_state': 31}


# RMSE = -1.2813491444004481 --> {'max_depth': 8, 'n_estimators': 100, 'random_state': 31}
# RMSE = -1.2818443240251427 --> {'bootstrap': True, 'max_depth': 10, 'max_samples': 0.9, 'n_estimators': 100, 'random_state': 31}

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 3/5] END max_depth=8, n_estimators=100, random_state=31;, score=-1.402 total time=   1.2s
[CV 1/5] END max_depth=8, n_estimators=100, random_state=31;, score=-1.108 total time=   1.3s
[CV 4/5] END max_depth=8, n_estimators=100, random_state=31;, score=-1.224 total time=   1.4s
[CV 2/5] END max_depth=8, n_estimators=100, random_state=31;, score=-1.304 total time=   1.4s
[CV 5/5] END max_depth=8, n_estimators=100, random_state=31;, score=-1.369 total time=   1.5s
[CV 3/5] END max_depth=8, n_estimators=200, random_state=31;, score=-1.396 total time=   2.8s
[CV 1/5] END max_depth=8, n_estimators=200, random_state=31;, score=-1.115 total time=   3.0s
[CV 2/5] END max_depth=8, n_estimators=200, random_state=31;, score=-1.306 total time=   3.2s
[CV 1/5] END max_depth=10, n_estimators=100, random_state=31;, score=-1.111 total time=   1.7s
[CV 4/5] END max_depth=8, n_estimators=200, random_state=31;, score=-1.223 total time=   3.3s

In [10]:
lr = RandomForestRegressor(max_depth=8, n_estimators=100, random_state=31).fit(alltext, allval)
#pickle.dump(lr, open('audiologic/models/rf_lyric_model.pkl', 'wb'))