In [97]:
%load_ext autoreload
%autoreload 2

from happysadsongs.data import *

import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
pre_df = get_training_data()
pre_df

Unnamed: 0,text,word_label
0,i am thankful that she continues to feel comfo...,happy
1,i want to do it the right way oh orihime whisp...,happy
2,i was gaining weight getting a lot stronger an...,happy
3,i feel that there is no way to determine if a ...,happy
4,i feel sure that i wouldnt have gained so much...,happy
...,...,...
42480,This hurts to hear because we know it's true. ...,sad
42481,Yep happening to me all the time at my college...,sad
42482,"Shit, I lost the original page. Please forgive...",sad
42483,I want to die,sad


In [4]:
pre_df = get_training_data()
df = pre_df.copy()[pre_df['word_label'] != 'angry']

df['clean_text'] = df['text'].apply(clean, rem_punc=True)
emotion_dict = {'happy': 0, 'sad': 1}
df['label'] = df.word_label.replace(emotion_dict)

lyrics = get_test_lyrics()

train_df, eval_df = train_test_split(df, test_size=0.2)

In [41]:
full_lyrics = get_test_lyrics()

In [43]:
full_dict = emotion_dict = {'happy': 0, 'sad': 1, 'angry': 1}

# Happy / Sad only

In [42]:
lyrics = full_lyrics.where(full_lyrics.label != 'angry').dropna().reset_index(drop=True)

In [37]:
lyrics

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,artist,label,lyrics
0,0.0,0.0,If I Die Young,Naya Rivera,sad,"Text\nIf I die young, bury me in satin\nLay me..."
1,1.0,1.0,Angie,The Rolling Stones,sad,"Angie, Angie\nWhen will those clouds all disap..."
2,2.0,2.0,Pretty Sad,XYLØ,sad,"[Intro]\n(Feeling pretty sad, pretty, pretty s..."
3,3.0,3.0,Tear In Your Hand,Tori Amos,sad,All the world just stopped now_x000D_\nSo you ...
4,4.0,4.0,Canvas,Shane Smith & The Saints,sad,"I had a a brother, who wasn't from my family\n..."
...,...,...,...,...,...,...
200,200.0,200.0,Dog Days Are Over,Florence + The Machine,happy,[Verse 1]\nHappiness hit her like a train on a...
201,201.0,201.0,Good Vibes,Chris Janson,happy,[Verse 1]\nI ain't watching TV today\nBad news...
202,202.0,202.0,High Hopes,Panic! At the Disco,happy,"[Intro]\nHigh, high hopes\n[Chorus]\nHad to ha..."
203,203.0,203.0,"Hey Look Ma, I Made It",Panic! At the Disco,happy,"[Verse 1]\nAll my life, been hustlin' and\nTon..."


In [56]:
vectorizer = TfidfVectorizer(ngram_range = (1,5), max_df=.9, min_df=25, strip_accents="unicode", max_features=10000, dtype=np.float32)

vectorizer = vectorizer.fit(train_df.clean_text)
X_train = vectorizer.transform(train_df.clean_text)
y_train = train_df.label
X_val = vectorizer.transform(eval_df.clean_text)
y_val = eval_df.label
X_test = vectorizer.transform(lyrics['lyrics'].apply(clean, rem_punc=True))
y_test = lyrics['label'].map(emotion_dict)

In [57]:
best_lr = LogisticRegression(C=2.1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=300, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)
best_lr = best_lr.fit(X_train, y_train)
best_lr.score(X_test, y_test)

0.7463414634146341

In [58]:
best_lr.score(X_val, y_val)

0.8298282509638977

In [59]:
best_lr.score(X_test, y_test)

0.7463414634146341

# Combine sad/angry

In [69]:
import pickle

In [113]:
pre_df['clean_text'] = pre_df['text'].apply(clean, rem_punc=True)

In [115]:
pre_df['label'] = pre_df.word_label.replace(full_dict)

In [116]:
train_df, eval_df = train_test_split(pre_df, test_size=.2)

In [119]:
vectorizer = TfidfVectorizer(ngram_range = (1,5), max_df=.9, min_df=25, strip_accents="unicode", max_features=5000, dtype=np.float32)

vectorizer = vectorizer.fit(train_df.clean_text)
X_train = vectorizer.transform(train_df.clean_text)
y_train = train_df.label
X_val = vectorizer.transform(eval_df.clean_text)
y_val = eval_df.label
X_test = vectorizer.transform(full_lyrics['lyrics'].apply(clean, rem_punc=True))
y_test = full_lyrics['label'].map(full_dict)

In [120]:
best_lr = LogisticRegression(C=2.1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=300, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)
best_lr = best_lr.fit(X_train, y_train)
best_lr.score(X_test, y_test)

0.7384615384615385

## Train to split sad/angry

In [136]:
new_df = get_training_data()

In [129]:
sad_angry_dict = {'sad': 1, 'angry': 2}

In [138]:
sad_angry_train = new_df.copy()[new_df['word_label'] != 'happy']

In [140]:
sad_angry_train['clean_text'] = sad_angry_train['text'].apply(clean, rem_punc=True)
sad_angry_train['label'] = sad_angry_train.word_label.replace(sad_angry_dict)

In [141]:
second_train_df, second_eval_df = train_test_split(sad_angry_train, test_size=.2)

In [143]:
test_results = full_lyrics.copy()
test_results['actual_label'] = test_results['label'].map({'happy': 0, 'sad': 1, 'angry': 2})
test_results['sad/angry_actual'] = test_results['label'].map({'happy': 0, 'sad': 1, 'angry': 1})
test_results['sad/angry'] = best_lr.predict(X_test)

In [144]:
first_pass_sad_angry = test_results[test_results['sad/angry'] == 1]

In [145]:
vectorizer = TfidfVectorizer(ngram_range = (1,5), max_df=.9, min_df=25, strip_accents="unicode", max_features=10000, dtype=np.float32)

vectorizer = vectorizer.fit(second_train_df.clean_text)
X_train = vectorizer.transform(second_train_df.clean_text)
y_train = second_train_df.label
X_val = vectorizer.transform(second_eval_df.clean_text)
y_val = second_eval_df.label
X_test = vectorizer.transform(first_pass_sad_angry['lyrics'].apply(clean, rem_punc=True))
y_test = first_pass_sad_angry['actual_label']

In [146]:
second_lr = LogisticRegression(C=2.1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=300, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)
second_lr = second_lr.fit(X_train, y_train)
second_lr.score(X_test, y_test)

0.5396825396825397

# Train without Friends

In [54]:
limited_df = df.copy()[df['source'] != 'MELD-Friends']
limited_df

Unnamed: 0.1,Unnamed: 0,text,word_label,source,clean_text,label
0,0,i am thankful that she continues to feel comfo...,happy,HuggingFace,i am thankful that she continues to feel comfo...,0
1,1,i want to do it the right way oh orihime whisp...,happy,HuggingFace,i want to do it the right way oh orihime whisp...,0
2,2,i was gaining weight getting a lot stronger an...,happy,HuggingFace,i was gaining weight getting a lot stronger an...,0
3,3,i feel that there is no way to determine if a ...,happy,HuggingFace,i feel that there is no way to determine if a ...,0
4,4,i feel sure that i wouldnt have gained so much...,happy,HuggingFace,i feel sure that i wouldnt have gained so much...,0
...,...,...,...,...,...,...
42480,1315,This hurts to hear because we know it's true. ...,sad,Google_GoEmotions,this hurts to hear because we know its true th...,1
42481,1316,Yep happening to me all the time at my college...,sad,Google_GoEmotions,yep happening to me all the time at my college...,1
42482,1317,"Shit, I lost the original page. Please forgive...",sad,Google_GoEmotions,shit i lost the original page please forgive me,1
42483,1318,I want to die,sad,Google_GoEmotions,i want to die,1


In [64]:
train_df, eval_df = train_test_split(limited_df, test_size=0.2)

### Happy or Sad

In [65]:
vectorizer = TfidfVectorizer(ngram_range = (1,5), max_df=.9, min_df=25, strip_accents="unicode", max_features=10000, dtype=np.float32)

vectorizer = vectorizer.fit(train_df.clean_text)
X_train = vectorizer.transform(train_df.clean_text)
y_train = train_df.label
X_val = vectorizer.transform(eval_df.clean_text)
y_val = eval_df.label
X_test = vectorizer.transform(lyrics['lyrics'].apply(clean, rem_punc=True))
y_test = lyrics['label'].map(emotion_dict)

In [66]:
best_lr = LogisticRegression(C=2.1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=300, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)
best_lr = best_lr.fit(X_train, y_train)
best_lr.score(X_test, y_test)

0.6829268292682927

### Happy or Sad/Angry

In [67]:
vectorizer = TfidfVectorizer(ngram_range = (1,5), max_df=.9, min_df=25, strip_accents="unicode", max_features=10000, dtype=np.float32)

vectorizer = vectorizer.fit(train_df.clean_text)
X_train = vectorizer.transform(train_df.clean_text)
y_train = train_df.label
X_val = vectorizer.transform(eval_df.clean_text)
y_val = eval_df.label
X_test = vectorizer.transform(full_lyrics['lyrics'].apply(clean, rem_punc=True))
y_test = full_lyrics['label'].map(full_dict)

In [68]:
best_lr = LogisticRegression(C=2.1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=300, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)
best_lr = best_lr.fit(X_train, y_train)
best_lr.score(X_test, y_test)

0.7076923076923077

# Full train set (no train/val split)

In [89]:
train_data = get_training_data()

In [90]:
full_dict = emotion_dict = {'happy': 0, 'sad': 1, 'angry': 1}

In [91]:
train_data['clean_text'] = train_data['text'].apply(clean, rem_punc=True)
train_data['label'] = train_data['word_label'].map(full_dict)

In [92]:
train_data = train_data.sample(frac=1).reset_index(drop=True)
train_data

Unnamed: 0,text,word_label,clean_text,label
0,i feel really free i feel that i can grow wing...,happy,i feel really free i feel that i can grow wing...,0
1,@praddy06 sir.. will we have the need for umbr...,happy,sir will we have the need for umbrella today ...,0
2,i hunger for anything i feel ferocious like a ...,angry,i hunger for anything i feel ferocious like a ...,1
3,Happy to help,happy,happy to help,0
4,üî•üíÅüèº- its lit having a class with you!...,happy,üî•üíåüèº its lit having a class with you y...,0
...,...,...,...,...
42480,@josefcd904 @Reddou_Kun @deven_luca @supersoni...,angry,he also likes incurring lilys wrath,1
42481,i had lunch with an old friend and it was nice...,happy,i had lunch with an old friend and it was nice...,0
42482,i tend to err on the justice side of things an...,sad,i tend to err on the justice side of things an...,1
42483,I never said it was irrelevant. I am trying to...,angry,i never said it was irrelevant i am trying to ...,1


In [93]:
test_lyrics = get_test_lyrics().sample(frac=1).reset_index(drop=True)
test_lyrics

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,artist,label,lyrics
0,131,131,Here Comes the Sun,The Beatles,happy,"[Chorus]\nHere comes the sun, doo da doo doo\n..."
1,6,6,Haunt You,Social House,sad,[Verse 1: Mikey]\nHow do I measure up to heigh...
2,160,160,Best Night,Justice Crew,happy,"the best, this is the best night\nThis is the ..."
3,144,144,All You Need Is Love,The Beatles,happy,"ve, love, love\nLove, love, love\nLove, love, ..."
4,36,36,My Immortal,Evanescense,sad,[Verse 1]\nI'm so tired of being here\nSuppres...
...,...,...,...,...,...,...
255,43,43,When I Was Your Man,Bruno Mars,sad,[Verse 1]\nSame bed but it feels just a little...
256,209,209,Given Up,Linkin Park,angry,[Verse 1: Chester Bennington]\nWake in a sweat...
257,222,222,Pity Party,Melanie Martinez,angry,Did my invitations disappear?\nWhy'd I put my ...
258,2,2,Pretty Sad,XYLØ,sad,"[Intro]\n(Feeling pretty sad, pretty, pretty s..."


In [94]:
vectorizer = TfidfVectorizer(ngram_range = (1,5), max_df=.9, min_df=25, strip_accents="unicode", max_features=5000, dtype=np.float32)

vectorizer = vectorizer.fit(train_data.clean_text)
X_train = vectorizer.transform(train_data.clean_text)
y_train = train_data.label
X_test = vectorizer.transform(test_lyrics['lyrics'].apply(clean, rem_punc=True))
y_test = test_lyrics['label'].map(full_dict)

In [95]:
full_lr = LogisticRegression(C=2.1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=300, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)
full_lr = full_lr.fit(X_train, y_train)
full_lr.score(X_test, y_test)

0.7576923076923077

In [99]:
pipe = make_pipeline(vectorizer, full_lr)

In [100]:
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipe, file)


In [101]:
my_pipeline = pickle.load(open("pipeline.pkl","rb"))

In [103]:
test_lyrics = get_test_lyrics().sample(frac=1).reset_index(drop=True)
test_lyrics

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,artist,label,lyrics
0,2,2,Pretty Sad,XYLØ,sad,"[Intro]\n(Feeling pretty sad, pretty, pretty s..."
1,142,142,Get On Your Feet,Gloria Estefan,happy,You say I know it's a waste of time\nThere's n...
2,237,237,Fuck You,Damageplan,angry,Fuck you I'm through\nI want more from you\nMy...
3,134,134,Ain't No Stoppin' Us Now,McFadden & Whitehead,happy,Ain't no stoppin' us now\nWe're on the move!\n...
4,221,221,Unstoppable,Gizzle,angry,ntin in us out\nWe hammered it out\nThis time ...
...,...,...,...,...,...,...
255,188,188,Best Day Ever,Sly & The Family Stone,happy,Woke up past 8 today\nMy neighbor paid my mete...
256,84,84,Stay (I Missed You),Lisa Loeb,sad,You say I only hear what I want to\nYou say I ...
257,215,215,Holdin On,Flume,angry,"keep on holdin' on, holdin' on\nHoldin' on, ho..."
258,66,66,Used To Love You,Gwen Stefani,sad,"[Written by Julia Michaels, Justin Tranter, Gw..."


In [104]:
my_pipeline.score(test_lyrics['lyrics'], test_lyrics['label'].map(full_dict))

0.7538461538461538

In [105]:
pipe

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float32'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.9, max_features=5000,
                                 min_df=25, ngram_range=(1, 5), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents='unicode',
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=2.1, class_weight=None, dual=False,
                                    fit_in

# Import Roberta Binary Model

In [153]:
from simpletransformers.classification import ClassificationModel

In [172]:
# args={
#     'train_batch_size': 8,
#     'eval_batch_size': 8,
#     'learning_rate': 1e-5, 
#     'num_train_epochs': 2, 
#     'reprocess_input_data': False,
#     'max_seq_length': 80}

bin_model = ClassificationModel('roberta', '../raw_data/twoclass_roberta_bin', use_cuda=False)


In [173]:
bin_model

<simpletransformers.classification.classification_model.ClassificationModel at 0x1c9e89eb0>

In [156]:
from happysadsongs.predict import *

In [176]:
predict_parts(bin_model, """I want blood because I'm vicious
I want your soul
I wanna own your thought
I want power
I want money
I got needs and nothing can't be bought

Greed and arrogance
Justice not in evidence

Bloodlust

I hate love
There's no profit
I'm up above and like what I see
You got a job
You make nothing
You got it all but there's more to feed

Greed and arrogance
Justice not in evidence

Bloodlust

I want blood
Delicious
I want your wallet
And everything you've got
I lost my mind
I lost my sould
I lost everything that I don't control

Greed and arrogance
Justice not in evidence

Bloodlust""")

  0%|          | 0/5 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1 [00:00<?, ?it/s]

1

In [170]:
song2 = """Well she got her daddy's car
And she cruised through the hamburger stand now
Seems she forgot all about the library
Like she told her old man now
And with the radio blasting
Goes cruising just as fast as she can now

And she'll have fun fun fun
'Til her daddy takes the T-bird away
(Fun fun fun 'til her daddy takes the T-bird away)

Well the girls can't stand her
'Cause she walks looks and drives like an ace now
(You walk like an ace now you walk like an ace)
She makes the Indy 500 look like a Roman chariot race now
(You look like an ace now you look like an ace)
A lotta guys try to catch her
But she leads them on a wild goose chase now
(You drive like an ace now you drive like an ace)

And she'll have fun fun fun
'Til her daddy takes the T-bird away
(Fun fun fun 'til her daddy takes the T-bird away)

Well you knew all along
That your dad was gettin' wise to you now
(You shouldn't have lied now you shouldn't have lied)
And since he took your set of keys
You've been thinking that your fun is all through now
(You shouldn't have lied now you shouldn't have lied)

But you can come along with me
'Cause we gotta a lot of things to do now
(You shouldn't have lied now you shouldn't have lied)

And we'll have fun fun fun now that daddy took the T-bird away
(Fun fun fun now that daddy took the T-bird away)
And we'll have fun fun fun now that daddy took the T-bird away
(Fun fun fun now that daddy took the T-bird away)
(Wo wo wo wo woo woo woo)
(Fun fun now that daddy took the T-bird away)
(Fun fun now that daddy took the T-bird away)
(Fun fun now that daddy took the T-bird away)
(Fun fun now that daddy took the T-bird away)
(Fun fun now that daddy took the T-bird away)
(Fun fun now that daddy took the T-bird away)"""

In [175]:
predict_parts(bin_model, song2)

  0%|          | 0/17 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/3 [00:00<?, ?it/s]

0