In [8]:
%load_ext autoreload
%autoreload 2

In [10]:
from happysadsongs.data import *

In [13]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = get_training_data()

In [11]:
df['clean_text'] = df['text'].apply(clean, rem_punc=True)
emotion_dict = {'happy': 0, 'sad': 1, 'angry': 2}
df['label'] = df.word_label.replace(emotion_dict)

In [4]:
lyrics = get_test_lyrics()

In [15]:
train_df, eval_df = train_test_split(df, test_size=0.2)

In [16]:
vectorizer = TfidfVectorizer(ngram_range = (1,5), max_df=.9, min_df=25, strip_accents="unicode", max_features=10000, dtype=np.float32)

vectorizer = vectorizer.fit(train_df.clean_text)
X_train = vectorizer.transform(train_df.clean_text)
y_train = train_df.label
X_val = vectorizer.transform(eval_df.clean_text)
y_val = eval_df.label
X_test = vectorizer.transform(lyrics['lyrics'].apply(clean, rem_punc=True))
y_test = lyrics['label'].map(emotion_dict)

In [52]:
X_test

<260x3514 sparse matrix of type '<class 'numpy.float32'>'
	with 23059 stored elements in Compressed Sparse Row format>

In [17]:
lr_multi_model = LogisticRegression(C=1.8, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=300, multi_class='auto',
                                          n_jobs=None, penalty='l1',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)

In [18]:
lr_multi_model.fit(X_train, y_train)
lr_multi_model.score(X_train, y_train)

0.8019889372719784

In [248]:
lr_multi_model.score(X_val, y_val)

0.7439096151582911

In [249]:
lr_multi_model.score(X_test, y_test)

0.6230769230769231

## Split lyrics

In [36]:
def get_split(lyric):
    l_total = []
    l_part = []
    if len(lyric.split())//25 >0:
        n = len(lyric.split())//25
    else: 
        n = 1
    for w in range(n):
        if w == 0:
            l_part = lyric.split()[:40]
            l_total.append(" ".join(l_part))
        else:
            l_part = lyric.split()[w*25:w*25 + 40]
            l_total.append(" ".join(l_part))
    return l_total

In [37]:
lyrics['text'] = lyrics['lyrics'].apply(clean, rem_punc=True)
lyrics['split_text'] = lyrics['text'].apply(get_split)

In [38]:
lyrics['nlabel'] = lyrics['label'].map(emotion_dict)

In [39]:
len(lyrics['split_text'][2])

14

In [53]:
def vec_transform(lyrics):
    vecs = vectorizer.transform(lyrics)
    return vecs

In [58]:
def vec_transform_and_predict(lyrics, model):
    vecs = vectorizer.transform(lyrics)
    return model.predict(vecs)


In [60]:
vec_transform_and_predict(lyrics['split_text'][1], lr_multi_model)

array([0, 0, 0, 2, 0, 0, 2])

In [57]:
lr_multi_model.predict(vec_transform(lyrics['split_text'][1]))

array([0, 0, 0, 2, 0, 0, 2])

In [62]:
lyrics['split_preds'] = lyrics['split_text'].apply(vec_transform_and_predict, model=lr_multi_model)

In [63]:
lyrics

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,artist,label,lyrics,text,split_text,nlabel,split_preds
0,0,0,If I Die Young,Naya Rivera,sad,"Text\nIf I die young, bury me in satin\nLay me...",text if i die young bury me in satin lay me do...,[text if i die young bury me in satin lay me d...,1,"[1, 0, 1, 1, 1, 2, 2, 1, 1, 1, 1, 0, 0, 1]"
1,1,1,Angie,The Rolling Stones,sad,"Angie, Angie\nWhen will those clouds all disap...",angie angie when will those clouds all disappe...,[angie angie when will those clouds all disapp...,1,"[0, 0, 0, 2, 0, 0, 2]"
2,2,2,Pretty Sad,XYLØ,sad,"[Intro]\n(Feeling pretty sad, pretty, pretty s...",intro feeling pretty sad pretty pretty sad sad...,[intro feeling pretty sad pretty pretty sad sa...,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,3,3,Tear In Your Hand,Tori Amos,sad,All the world just stopped now_x000D_\nSo you ...,all the world just stopped nowxd so you say yo...,[all the world just stopped nowxd so you say y...,1,"[2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2]"
4,4,4,Canvas,Shane Smith & The Saints,sad,"I had a a brother, who wasn't from my family\n...",i had a a brother who wasnt from my family we ...,[i had a a brother who wasnt from my family we...,1,"[1, 1, 0, 0, 0, 2, 1, 0, 2, 1, 1, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...
255,255,255,Fucking Hostile,Pantera,angry,"ne, two, three, four!\n\nAlmost every day\nI s...",ne two three fouralmost every day i see the sa...,[ne two three fouralmost every day i see the s...,2,"[0, 2, 2]"
256,256,256,Refuse/Resist,Sepultura,angry,A.D.\nTanks on the streets\nConfronting police...,ad tanks on the streets confronting police ble...,[ad tanks on the streets confronting police bl...,2,[2]
257,257,257,Dam That River,Alice in Chains,angry,I broke you in the canyon\nI drowned you in th...,i broke you in the canyon i drowned you in the...,[i broke you in the canyon i drowned you in th...,2,"[2, 2]"
258,258,258,Destroy Everything,Hatebreed,angry,A new life begins\n\nDestroy everything\nDestr...,a new life beginsdestroy everything destroy ev...,[a new life beginsdestroy everything destroy e...,2,"[2, 1, 1]"


In [64]:
from statistics import multimode

In [105]:
def avg_split_preds(preds):
    rounded_avg = (round(np.mean(preds)))
    return int(rounded_avg)
def mode_split_preds(preds):
    pred_mode = sorted(multimode(preds), reverse=True)
    return int(pred_mode[0])

In [106]:
lyrics['split_avg_pred'] = lyrics['split_preds'].apply(avg_split_preds)

In [107]:
lyrics['split_pred_mode'] = lyrics['split_preds'].apply(mode_split_preds)

In [111]:
from sklearn.metrics import classification_report
avg_report = classification_report(lyrics['nlabel'], lyrics['split_avg_pred'], output_dict=True)
mode_report = classification_report(lyrics['nlabel'], lyrics['split_pred_mode'], output_dict=True)

In [113]:
mode_report

{'0': {'precision': 0.7209302325581395,
  'recall': 0.5904761904761905,
  'f1-score': 0.6492146596858638,
  'support': 105},
 '1': {'precision': 0.7083333333333334,
  'recall': 0.51,
  'f1-score': 0.5930232558139535,
  'support': 100},
 '2': {'precision': 0.4117647058823529,
  'recall': 0.7636363636363637,
  'f1-score': 0.5350318471337581,
  'support': 55},
 'accuracy': 0.5961538461538461,
 'macro avg': {'precision': 0.6136760905912753,
  'recall': 0.6213708513708514,
  'f1-score': 0.5924232542111918,
  'support': 260},
 'weighted avg': {'precision': 0.6506848714441054,
  'recall': 0.5961538461538461,
  'f1-score': 0.6034485247721836,
  'support': 260}}