# IMPORTS

In [1]:
import re
import pandas as pd
import numpy as np
import requests 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import unicodedata
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import OneHotEncoder
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import gensim.downloader as api
import matplotlib.pyplot as plt  
from sklearn.datasets import make_classification
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split

## IMPORT COMPLETE  TEXT- SENTIIMENT DATABASE

In [2]:
text_df = pd.read_csv(r'/Users/magvera/code/v-chai/happysadsongs/raw_data/emotion_dataset.csv')
text_df

Unnamed: 0.1,Unnamed: 0,text,word_label,source
0,0,i didnt feel humiliated,sad,HuggingFace
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace
3,3,i am feeling grouchy,angry,HuggingFace
4,4,ive been feeling a little burdened lately wasn...,sad,HuggingFace
...,...,...,...,...
41122,5570,There's doing stupid things when you're young....,angry,Google_GoEmotions
41123,5571,So now disabled people don‚Äôt have rights . Y...,angry,Google_GoEmotions
41124,5572,It is. I bet chantex and all the other 'quit s...,angry,Google_GoEmotions
41125,5573,[NAME] was half-blood but it wasn't a smart id...,angry,Google_GoEmotions


## IMPORT COMPLETE LABELED LYRICS DATABASE

In [61]:
lyrics_df = pd.read_csv(r'/Users/magvera/code/v-chai/happysadsongs/raw_data/fragments_songs.csv')
lyrics_df=lyrics_df[:6]
lyrics_df

Unnamed: 0,Artist,Song,lyrics,label
0,Queen,Don't stop me now,Tonight I'm gonna have myself a real good time...,happy
1,eminem,the way I am,"My tank is on empty, no patience is in me\nAnd...",angry
2,rihanna,unfaithful,I don't wanna do this anymore\nI don't wanna b...,sad
3,taylor swift,white horse,"That I'm not a princess, this ain't a fairy ta...",sad
4,metallica,st. Anger,Fuck it all and no regrets\nI hit the lights o...,angry
5,pharrel williams,happy,It might seem crazy what I'm about to say\nSun...,happy


# TEXT - SENTIMENT

## Cleaning Database

In [3]:
# Consider reducing this list further
stop_words = set(stopwords.words('english')) - {
    'into', 'against', 'myself', 'doing', 'own', 'above', 'our', 'now', 'up',
    'down', 'been', 'not', 'no', 'would', 'should', 'again', 'won', 'if',
    'only', 'yours', 'your', 'you', 'ours', 'here', 'there', 'below', 'before'
}


def get_training_data():
    # ADD CODE
    pass

def get_lyrics_data():
    # ADD CODE
    pass

def clean(text):
    # lowercase
    new_text = text.lower()

    # remove twitter handles
    new_text = re.sub(r"@\w+", '', new_text)

    # remove urls
    new_text = re.sub(r'http:\S+', '', new_text)
    new_text = re.sub(r'https:\S+', '', new_text)

    # remove punctuation
    for punctuation in string.punctuation:
        new_text = new_text.replace(punctuation, '')

    # remove numbers
    new_text = ''.join(word for word in new_text if not word.isdigit())

    return new_text


def remove_stopwords(text):
    word_list = [
        word for word in word_tokenize(text) if not word in stop_words
    ]
    return ' '.join(word_list)


def lemma_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    return ' '.join(lemmatized)


# Possibly remove single letter words
def clean_length(text):
    return [word for word in text if len(word) > 2]

In [4]:
text_df['clean_text']=text_df['text'].apply(clean)
text_df

Unnamed: 0.1,Unnamed: 0,text,word_label,source,clean_text
0,0,i didnt feel humiliated,sad,HuggingFace,i didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,i can go from feeling so hopeless to so damned...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing a minute to post i feel greedy wrong
3,3,i am feeling grouchy,angry,HuggingFace,i am feeling grouchy
4,4,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling a little burdened lately wasn...
...,...,...,...,...,...
41122,5570,There's doing stupid things when you're young....,angry,Google_GoEmotions,theres doing stupid things when youre young th...
41123,5571,So now disabled people don‚Äôt have rights . Y...,angry,Google_GoEmotions,so now disabled people don‚äôt have rights yo...
41124,5572,It is. I bet chantex and all the other 'quit s...,angry,Google_GoEmotions,it is i bet chantex and all the other quit smo...
41125,5573,[NAME] was half-blood but it wasn't a smart id...,angry,Google_GoEmotions,name was halfblood but it wasnt a smart idea t...


In [5]:
text_df['clean_text']=text_df['clean_text'].apply(remove_stopwords)
text_df

Unnamed: 0.1,Unnamed: 0,text,word_label,source,clean_text
0,0,i didnt feel humiliated,sad,HuggingFace,didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,go feeling hopeless damned hopeful around some...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing minute post feel greedy wrong
3,3,i am feeling grouchy,angry,HuggingFace,feeling grouchy
4,4,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling little burdened lately wasnt ...
...,...,...,...,...,...
41122,5570,There's doing stupid things when you're young....,angry,Google_GoEmotions,theres doing stupid things youre young theres ...
41123,5571,So now disabled people don‚Äôt have rights . Y...,angry,Google_GoEmotions,now disabled people don‚äôt rights you get worse
41124,5572,It is. I bet chantex and all the other 'quit s...,angry,Google_GoEmotions,bet chantex quit smoking companies hope book d...
41125,5573,[NAME] was half-blood but it wasn't a smart id...,angry,Google_GoEmotions,name halfblood wasnt smart idea bring up father


In [6]:
text_df['clean_text']=text_df['clean_text'].apply(lemma_text)
text_df

Unnamed: 0.1,Unnamed: 0,text,word_label,source,clean_text
0,0,i didnt feel humiliated,sad,HuggingFace,didnt feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sad,HuggingFace,go feeling hopeless damned hopeful around some...
2,2,im grabbing a minute to post i feel greedy wrong,angry,HuggingFace,im grabbing minute post feel greedy wrong
3,3,i am feeling grouchy,angry,HuggingFace,feeling grouchy
4,4,ive been feeling a little burdened lately wasn...,sad,HuggingFace,ive been feeling little burdened lately wasnt ...
...,...,...,...,...,...
41122,5570,There's doing stupid things when you're young....,angry,Google_GoEmotions,there doing stupid thing youre young there doi...
41123,5571,So now disabled people don‚Äôt have rights . Y...,angry,Google_GoEmotions,now disabled people don‚äôt right you get worse
41124,5572,It is. I bet chantex and all the other 'quit s...,angry,Google_GoEmotions,bet chantex quit smoking company hope book doe...
41125,5573,[NAME] was half-blood but it wasn't a smart id...,angry,Google_GoEmotions,name halfblood wasnt smart idea bring up father


## ENCODING SENTIMENTS

In [7]:
text_df.word_label.unique()

array(['sad', 'angry', 'happy'], dtype=object)

In [8]:
#ONE HOT ENCODING THE WORD_LABEL column (into sad, angry and happy)

ohe = OneHotEncoder(sparse = False)
ohe.fit(text_df[['word_label']])

feature_list =ohe.categories_[0].tolist()
text_df[feature_list]=ohe.transform(text_df[['word_label']])
text_df.drop(columns=['word_label'], inplace=True)

text_df.head()

Unnamed: 0.1,Unnamed: 0,text,source,clean_text,angry,happy,sad
0,0,i didnt feel humiliated,HuggingFace,didnt feel humiliated,0.0,0.0,1.0
1,1,i can go from feeling so hopeless to so damned...,HuggingFace,go feeling hopeless damned hopeful around some...,0.0,0.0,1.0
2,2,im grabbing a minute to post i feel greedy wrong,HuggingFace,im grabbing minute post feel greedy wrong,1.0,0.0,0.0
3,3,i am feeling grouchy,HuggingFace,feeling grouchy,1.0,0.0,0.0
4,4,ive been feeling a little burdened lately wasn...,HuggingFace,ive been feeling little burdened lately wasnt ...,0.0,0.0,1.0


In [9]:
## For clean text
df_clean_text=text_df['clean_text'].tolist()
clean_text=[]
for sentence in df_clean_text:
    word_token = word_tokenize(sentence)
    clean_text.append(word_token)

## TRAIN TEST SPLIT

In [10]:
y = text_df[['angry', 'happy', 'sad']]
X= clean_text

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.3, random_state = 0) # Holdout

## TRANSFER LEARNING 

In [11]:
def init_model():
    model = Sequential()
    model.add(layers.Masking())
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=[['accuracy','Precision','Recall']])
    
    return model

model = init_model()

2021-09-08 14:11:19.453137: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
#Global Vector Twitter 200
word2vec_transfer = api.load('glove-twitter-200')

In [47]:
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])
        
    return np.array(embedded_sentence)


def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

X_train_embed_2 = embedding(word2vec_transfer, X_train)
X_test_embed_2 = embedding(word2vec_transfer, X_test)

In [48]:
# Pad the training and test embedded sentences
X_train_pad_2 = pad_sequences(X_train_embed_2, dtype='float32', padding='post', maxlen=500)
X_test_pad_2 = pad_sequences(X_test_embed_2, dtype='float32', padding='post', maxlen=500)

In [49]:
es = EarlyStopping(patience=2, restore_best_weights=True)

model = init_model()

model.fit(X_train_pad_2, y_train, 
          batch_size = 64,
          epochs=20,
          validation_split=0.3,
          
          callbacks=[es]
         )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<tensorflow.python.keras.callbacks.History at 0x16235c3a0>

In [50]:
res = model.evaluate(X_test_pad_2, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')
print(f'The precision evaluated on the test set is of {res[2]*100:.3f}%')
print(f'The recall evaluated on the test set is of {res[3]*100:.3f}%')


The accuracy evaluated on the test set is of 77.462%
The precision evaluated on the test set is of 80.441%
The recall evaluated on the test set is of 73.831%


# LYRICS 

## CLEANING LYRICS DATASET

In [62]:
lyrics_df['clean_lyrics']=lyrics_df['lyrics'].apply(clean)
lyrics_df['clean_lyrics']=lyrics_df['clean_lyrics'].apply(remove_stopwords)
lyrics_df['clean_lyrics']=lyrics_df['clean_lyrics'].apply(lemma_text)
lyrics_df

Unnamed: 0,Artist,Song,lyrics,label,clean_lyrics
0,Queen,Don't stop me now,Tonight I'm gonna have myself a real good time...,happy,tonight im gon na myself real good time feel a...
1,eminem,the way I am,"My tank is on empty, no patience is in me\nAnd...",angry,tank empty no patience if you offend im liftin...
2,rihanna,unfaithful,I don't wanna do this anymore\nI don't wanna b...,sad,dont wan na anymore dont wan na reason every t...
3,taylor swift,white horse,"That I'm not a princess, this ain't a fairy ta...",sad,im not princess aint fairy tale im not one you...
4,metallica,st. Anger,Fuck it all and no regrets\nI hit the lights o...,angry,fuck no regret hit light dark set need voice l...
5,pharrel williams,happy,It might seem crazy what I'm about to say\nSun...,happy,might seem crazy im say sunshine shes here you...


## ENCODING SENTIMENTS

In [63]:
lyrics_df.label.unique()

array(['happy', 'angry', 'sad'], dtype=object)

In [76]:
#ONE HOT ENCODING THE WORD_LABEL column (into sad, angry and happy)

ohe = OneHotEncoder(sparse = False)
ohe.fit(lyrics_df[['label']])

feature_list =ohe.categories_[0].tolist()
lyrics_df[feature_list]=ohe.transform(lyrics_df[['label']])
lyrics_df.drop(columns=['label'], inplace=True)

lyrics_df.head()

KeyError: "None of [Index(['label'], dtype='object')] are in the [columns]"

In [77]:
## For clean text
df_clean_lyrics=lyrics_df['clean_lyrics'].tolist()
clean_lyrics=[]
for sentence in df_clean_lyrics:
    word_token = word_tokenize(sentence)
    clean_lyrics.append(word_token)

## SPLITTING DATASET

In [78]:
y = lyrics_df[['angry', 'happy', 'sad']]
X = clean_lyrics

## EMBEDDING X AND Y

In [79]:
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])
        
    return np.array(embedded_sentence)


def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

X_embed = embedding(word2vec_transfer, X)
X_pad = pad_sequences(X_embed, dtype='float32', padding='post', maxlen=500)

## EVALUATING THE MODEL ON LYRICS

In [80]:
res = model.evaluate(X_pad, y, verbose=0)

In [83]:
print(f'The accuracy evaluated on the lyrics set is of {res[1]*100:.3f}%')
print(f'The precision evaluated on the lyrics set is of {res[2]*100:.3f}%')
print(f'The recall evaluated on the lyrics set is of {res[3]*100:.3f}%')

The accuracy evaluated on the lyrics set is of 50.000%
The precision evaluated on the lyrics set is of 50.000%
The recall evaluated on the lyrics set is of 50.000%
