# Sentiment Analysis
Based on the tutorial: https://towardsdatascience.com/how-to-analyze-emotions-and-words-of-the-lyrics-from-your-favorite-music-artist-bbca10411283

In [None]:
!pip install wordcloud

In [None]:
#download everything
!pip install --upgrade pip

In [None]:
# NOTE(review): the cells below run and import a local helpers.py from the
# working directory, so this PyPI package named "helpers" is presumably
# unrelated to it - confirm and remove this install if it is not needed.
!pip install helpers

In [None]:
%run helpers.py

In [None]:
#libraries used to extract, clean and manipulate the data
from helpers import *
import pandas as pd
import os
#To plot the graphs
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in
# matplotlib 3.6 and the old name is rejected by newer releases. Try the
# current name first and fall back to the legacy one on older installations.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
#library used to count the frequency of words
from sklearn.feature_extraction.text import CountVectorizer
#To create the sentiment analysis model, tokenization and lemmatization
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk.data
# Resources needed by the VADER analyzer and the punkt tokenizer.
nltk.download('vader_lexicon')
nltk.download('punkt')

In [None]:
# Folder that receives the per-year word-count CSV files written further down.
export_path = os.path.join("..", "datasets", "SentimentAnalysis")
# exist_ok=True creates the folder (and any missing parents) when it is absent
# and does nothing when it already exists, so no separate existence check is
# needed.
os.makedirs(export_path, exist_ok=True)

In [None]:
# Load the cleaned lyrics table of every year from 2017 to 2021, one
# DataFrame per year.
df2017, df2018, df2019, df2020, df2021 = (
    pd.read_csv(f'../datasets/CleanData/{year}_cleaned_songs_lyrics.csv')
    for year in range(2017, 2022)
)

In [None]:
df2017

### Key things to note for helpers.py
- cleaning lyrics: changed all words to lowercase, removed extra characters
- lyrics to words function: removed stopwords, lemmatized, removed punctuation

In [None]:
# Give every yearly frame a constant 'Year' column, then discard its leading
# column (presumably the row index saved in the CSV - confirm against the
# files).
df2017 = df2017.assign(Year=2017).iloc[:, 1:]
df2018 = df2018.assign(Year=2018).iloc[:, 1:]
df2019 = df2019.assign(Year=2019).iloc[:, 1:]
df2020 = df2020.assign(Year=2020).iloc[:, 1:]
df2021 = df2021.assign(Year=2021).iloc[:, 1:]

In [None]:
df2019.head()

In [None]:
# Refer to the 2017 frame as `df`; this binds the same DataFrame object as
# df2017 (no copy is made). The second line displays it.
df = df2017
df

In [None]:
# Refer to the 2018 frame as `df1` (same object as df2018, not a copy).
df1 = df2018

In [None]:
# Refer to the 2019 frame as `df2` (same object as df2019, not a copy).
df2 = df2019

In [None]:
# Refer to the 2020 frame as `df3` (same object as df2020, not a copy).
df3 = df2020

In [None]:
# Refer to the 2021 frame as `df4` (same object as df2021, not a copy).
df4 = df2021

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.describe()

In [None]:
df.shape

# Creating unique words and examining them (Year 2017)

In [None]:
# Set the 'Year' column of the 2017 frame to 2017 and display the frame.
# (df2017 was already given the same value when the yearly frames were
# prepared, so this assignment repeats it.)
df['Year'] = 2017
df

In [None]:
def unique(list1):
    """Return the distinct elements of ``list1`` in first-seen order.

    Parameters
    ----------
    list1 : iterable
        Elements to de-duplicate.

    Returns
    -------
    list
        A new list holding each distinct element once, ordered by its first
        occurrence in ``list1``.
    """
    # Materialize once so the fallback below can iterate again even when the
    # caller passed a one-shot iterator.
    items = list(list1)
    try:
        # A dict keeps insertion order, so its keys are the distinct elements
        # in first-seen order, found in linear time.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable elements (e.g. lists) cannot be dict keys; fall back to
        # a membership scan, which only needs equality.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

In [None]:
# For every 2017 song, run its lemmatized lyrics through lyrics_to_words,
# split the result on whitespace and keep each word once; the resulting lists
# go into a new 'words' column.

# Renumber the rows 0..n-1 so that index labels and row positions coincide.
df = df.reset_index(drop=True)

words = [
    unique(lyrics_to_words(lyrics).split())
    for lyrics in df['lemmatized_lyrics'].tolist()
]

df['words'] = words
df.head()

In [None]:
# Count how often each word occurs across the 2017 songs and save the table.
# Every song's 'words' list holds distinct words, so a given word contributes
# at most once per song.

# Long format: one row per (song, word) pair, tagged with the song's year.
set_words = []
set_year = []
for song_words, year in zip(df['words'], df['Year']):
    set_words.extend(song_words)
    set_year.extend([year] * len(song_words))

words_df = pd.DataFrame({'words':set_words,'Year':set_year})

# Each row holds a single word; CountVectorizer (default settings) turns it
# into a row of token counts.
cv = CountVectorizer()
text_cv = cv.fit_transform(words_df['words'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_cv = pd.DataFrame(text_cv.toarray(),columns=feature_names)
data_cv['Year'] = words_df['Year']

# Sum the counts per year and transpose: one row per word, one count column
# per year.
vect_words = data_cv.groupby('Year').sum().T
vect_words = vect_words.reset_index(level=0).rename(columns ={'index':'words'})
vect_words = vect_words.rename_axis(columns='')

#Save the data into a csv file
vect_words.to_csv(os.path.join(export_path, 'words2017.csv'), index=False)
vect_words

# Creating unique words and examining them (Year 2018)

In [None]:
df1

In [None]:
# For every 2018 song, run its lemmatized lyrics through lyrics_to_words,
# split the result on whitespace and keep each word once; the resulting lists
# go into a new 'words' column.

# Renumber the rows 0..n-1 so that index labels and row positions coincide.
df1 = df1.reset_index(drop=True)

words = [
    unique(lyrics_to_words(lyrics).split())
    for lyrics in df1['lemmatized_lyrics'].tolist()
]

df1['words'] = words
df1.head()

In [None]:
# Count how often each word occurs across the 2018 songs and save the table.
# Every song's 'words' list holds distinct words, so a given word contributes
# at most once per song.

# Long format: one row per (song, word) pair, tagged with the song's year.
set_words = []
set_year = []
for song_words, year in zip(df1['words'], df1['Year']):
    set_words.extend(song_words)
    set_year.extend([year] * len(song_words))

words_df1 = pd.DataFrame({'words':set_words,'Year':set_year})

# Each row holds a single word; CountVectorizer (default settings) turns it
# into a row of token counts.
cv = CountVectorizer()
text_cv = cv.fit_transform(words_df1['words'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_cv = pd.DataFrame(text_cv.toarray(),columns=feature_names)
data_cv['Year'] = words_df1['Year']

# Sum the counts per year and transpose: one row per word, one count column
# per year.
vect_words = data_cv.groupby('Year').sum().T
vect_words = vect_words.reset_index(level=0).rename(columns ={'index':'words'})
vect_words = vect_words.rename_axis(columns='')

#Save the data into a csv file
vect_words.to_csv(os.path.join(export_path, 'words2018.csv'), index=False)
vect_words

# Creating unique words and examining them (Year 2019)

In [None]:
# For every 2019 song, run its lemmatized lyrics through lyrics_to_words,
# split the result on whitespace and keep each word once; the resulting lists
# go into a new 'words' column.

# Renumber the rows 0..n-1 so that index labels and row positions coincide.
df2 = df2.reset_index(drop=True)

words = [
    unique(lyrics_to_words(lyrics).split())
    for lyrics in df2['lemmatized_lyrics'].tolist()
]

df2['words'] = words
df2.head()

In [None]:
# Count how often each word occurs across the 2019 songs and save the table.
# Every song's 'words' list holds distinct words, so a given word contributes
# at most once per song.

# Long format: one row per (song, word) pair, tagged with the song's year.
set_words = []
set_year = []
for song_words, year in zip(df2['words'], df2['Year']):
    set_words.extend(song_words)
    set_year.extend([year] * len(song_words))

words_df2 = pd.DataFrame({'words':set_words,'Year':set_year})

# Each row holds a single word; CountVectorizer (default settings) turns it
# into a row of token counts.
cv = CountVectorizer()
text_cv = cv.fit_transform(words_df2['words'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_cv = pd.DataFrame(text_cv.toarray(),columns=feature_names)
data_cv['Year'] = words_df2['Year']

# Sum the counts per year and transpose: one row per word, one count column
# per year.
vect_words = data_cv.groupby('Year').sum().T
vect_words = vect_words.reset_index(level=0).rename(columns ={'index':'words'})
vect_words = vect_words.rename_axis(columns='')

#Save the data into a csv file
vect_words.to_csv(os.path.join(export_path, 'words2019.csv'), index=False)
vect_words

# Creating unique words and examining them (Year 2020)

In [None]:
def unique(list1):
    """Return the distinct elements of ``list1`` in first-seen order.

    Parameters
    ----------
    list1 : iterable
        Elements to de-duplicate.

    Returns
    -------
    list
        A new list holding each distinct element once, ordered by its first
        occurrence in ``list1``.
    """
    # Materialize once so the fallback below can iterate again even when the
    # caller passed a one-shot iterator.
    items = list(list1)
    try:
        # A dict keeps insertion order, so its keys are the distinct elements
        # in first-seen order, found in linear time.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable elements (e.g. lists) cannot be dict keys; fall back to
        # a membership scan, which only needs equality.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

In [None]:
# For every 2020 song, run its lemmatized lyrics through lyrics_to_words,
# split the result on whitespace and keep each word once; the resulting lists
# go into a new 'words' column.

# Renumber the rows 0..n-1 so that index labels and row positions coincide.
df3 = df3.reset_index(drop=True)

words = [
    unique(lyrics_to_words(lyrics).split())
    for lyrics in df3['lemmatized_lyrics'].tolist()
]

df3['words'] = words
df3.head()

In [None]:
# Count how often each word occurs across the 2020 songs and save the table.
# Every song's 'words' list holds distinct words, so a given word contributes
# at most once per song.

# Long format: one row per (song, word) pair, tagged with the song's year.
set_words = []
set_year = []
for song_words, year in zip(df3['words'], df3['Year']):
    set_words.extend(song_words)
    set_year.extend([year] * len(song_words))

words_df3 = pd.DataFrame({'words':set_words,'Year':set_year})

# Each row holds a single word; CountVectorizer (default settings) turns it
# into a row of token counts.
cv = CountVectorizer()
text_cv = cv.fit_transform(words_df3['words'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_cv = pd.DataFrame(text_cv.toarray(),columns=feature_names)
data_cv['Year'] = words_df3['Year']

# Sum the counts per year and transpose: one row per word, one count column
# per year.
vect_words = data_cv.groupby('Year').sum().T
vect_words = vect_words.reset_index(level=0).rename(columns ={'index':'words'})
vect_words = vect_words.rename_axis(columns='')

#Save the data into a csv file
vect_words.to_csv(os.path.join(export_path, 'words2020.csv'), index=False)
vect_words

# Creating unique words and examining them (Year 2021)

In [None]:
def unique(list1):
    """Return the distinct elements of ``list1`` in first-seen order.

    Parameters
    ----------
    list1 : iterable
        Elements to de-duplicate.

    Returns
    -------
    list
        A new list holding each distinct element once, ordered by its first
        occurrence in ``list1``.
    """
    # Materialize once so the fallback below can iterate again even when the
    # caller passed a one-shot iterator.
    items = list(list1)
    try:
        # A dict keeps insertion order, so its keys are the distinct elements
        # in first-seen order, found in linear time.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable elements (e.g. lists) cannot be dict keys; fall back to
        # a membership scan, which only needs equality.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

In [None]:
# For every 2021 song, run its lemmatized lyrics through lyrics_to_words,
# split the result on whitespace and keep each word once; the resulting lists
# go into a new 'words' column.

# Renumber the rows 0..n-1 so that index labels and row positions coincide.
df4 = df4.reset_index(drop=True)

words = [
    unique(lyrics_to_words(lyrics).split())
    for lyrics in df4['lemmatized_lyrics'].tolist()
]

df4['words'] = words
df4.head()

In [None]:
#for you to examine the dataset separately
# df.to_csv('SentAna.csv')

In [None]:
# Count how often each word occurs across the 2021 songs and save the table.
# Every song's 'words' list holds distinct words, so a given word contributes
# at most once per song.

# Long format: one row per (song, word) pair, tagged with the song's year.
set_words = []
set_year = []
for song_words, year in zip(df4['words'], df4['Year']):
    set_words.extend(song_words)
    set_year.extend([year] * len(song_words))

words_df4 = pd.DataFrame({'words':set_words,'Year':set_year})

# Each row holds a single word; CountVectorizer (default settings) turns it
# into a row of token counts.
cv = CountVectorizer()
text_cv = cv.fit_transform(words_df4['words'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_cv = pd.DataFrame(text_cv.toarray(),columns=feature_names)
data_cv['Year'] = words_df4['Year']

# Sum the counts per year and transpose: one row per word, one count column
# per year.
vect_words = data_cv.groupby('Year').sum().T
vect_words = vect_words.reset_index(level=0).rename(columns ={'index':'words'})
vect_words = vect_words.rename_axis(columns='')

#Save the data into a csv file
vect_words.to_csv(os.path.join(export_path, 'words2021.csv'), index=False)
vect_words

# Import the unique words dataframe back

In [None]:
def _load_word_counts(year):
    """Read the saved word-count table of ``year`` with its count column named 'Count'.

    The file ``words<year>.csv`` is read from ``export_path``. Its count
    column is headed by the year itself and is renamed to 'Count' so that the
    tables of different years share the same columns.
    """
    # keep_default_na=False: by default read_csv treats strings such as "nan"
    # or "null" as missing values, which would turn those lyric words into NaN.
    counts = pd.read_csv(
        os.path.join(export_path, f'words{year}.csv'),
        keep_default_na=False,
    )
    return counts.rename(columns={str(year): "Count"})


# One word/Count table per year.
df_unique_2017 = _load_word_counts(2017)
df_unique_2018 = _load_word_counts(2018)
df_unique_2019 = _load_word_counts(2019)
df_unique_2020 = _load_word_counts(2020)
df_unique_2021 = _load_word_counts(2021)

In [None]:
df_unique_2018

In [None]:
# Stack the word-count tables of all five years and discard any row that has
# a missing value.
all_year_tables = [
    df_unique_2017,
    df_unique_2018,
    df_unique_2019,
    df_unique_2020,
    df_unique_2021,
]
dfAll = pd.concat(all_year_tables).dropna()
dfAll

In [None]:
# Stack the word-count tables of the pre-covid years (2017-2019).
pre_covid_tables = [df_unique_2017, df_unique_2018, df_unique_2019]
dfbefcovid = pd.concat(pre_covid_tables)
dfbefcovid

In [None]:
# Stack the word-count tables of the covid years (2020-2021).
covid_tables = [df_unique_2020, df_unique_2021]
dfcovid = pd.concat(covid_tables)
dfcovid

# Word Cloud

In [None]:
dfAll

In [None]:
# Store the counts as integers. Missing counts are replaced by 0 first,
# because casting a column that still contains NaN to int raises an error.
dfAll["Count"]= dfAll["Count"].fillna(0).astype(int)

In [None]:
def plot_wordcloud(dfAll,row,col):
    """Draw one word cloud per frequency column of ``dfAll``.

    Parameters
    ----------
    dfAll : pandas.DataFrame
        Table whose first column is 'words'; every following column is
        treated as a frequency column and gets its own subplot, titled with
        the column name.
    row, col : int
        Shape of the subplot grid; it needs room for every frequency column.
    """
    wc = WordCloud(background_color="white",colormap="Dark2",max_font_size=100,random_state=15)
    plt.figure(figsize=(20,10))

    for index, value in enumerate(dfAll.columns[1:]):
        # A word can occur on several rows (e.g. once per stacked year).
        # Add those rows up: building the dict straight from the rows would
        # keep only the last count seen for each word. Rows whose word is
        # missing are left out by groupby.
        top_dict = dfAll.groupby('words')[value].sum().to_dict()
        wc.generate_from_frequencies(top_dict)
        plt.subplot(row,col,index+1)
        plt.imshow(wc,interpolation="bilinear")
        plt.axis("off")
        plt.title(f"{value}",fontsize=15)

    # Adjust the spacing and show from inside the function, so both calls act
    # on the figure that was just drawn.
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    plt.show()

#Plot the word cloud
plot_wordcloud(dfAll,2,2)

In [None]:
# plot bef covid
def plot_wordcloud(dfbefcovid,row,col):
    """Draw one word cloud per frequency column of ``dfbefcovid``.

    Parameters
    ----------
    dfbefcovid : pandas.DataFrame
        Table whose first column is 'words'; every following column is
        treated as a frequency column and gets its own subplot, titled with
        the column name.
    row, col : int
        Shape of the subplot grid; it needs room for every frequency column.
    """
    wc = WordCloud(background_color="white",colormap="Dark2",max_font_size=100,random_state=15)
    plt.figure(figsize=(20,10))

    for index, value in enumerate(dfbefcovid.columns[1:]):
        # A word can occur on several rows (e.g. once per stacked year).
        # Add those rows up: building the dict straight from the rows would
        # keep only the last count seen for each word. Rows whose word is
        # missing are left out by groupby.
        top_dict = dfbefcovid.groupby('words')[value].sum().to_dict()
        wc.generate_from_frequencies(top_dict)
        plt.subplot(row,col,index+1)
        plt.imshow(wc,interpolation="bilinear")
        plt.axis("off")
        plt.title(f"{value}",fontsize=15)

    # Adjust the spacing and show from inside the function, so both calls act
    # on the figure that was just drawn.
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    plt.show()

#Plot the word cloud
plot_wordcloud(dfbefcovid,2,2)

In [None]:
# plot covid
def plot_wordcloud(dfcovid,row,col):
    """Draw one word cloud per frequency column of ``dfcovid``.

    Parameters
    ----------
    dfcovid : pandas.DataFrame
        Table whose first column is 'words'; every following column is
        treated as a frequency column and gets its own subplot, titled with
        the column name.
    row, col : int
        Shape of the subplot grid; it needs room for every frequency column.
    """
    wc = WordCloud(background_color="white",colormap="Dark2",max_font_size=100,random_state=15)
    plt.figure(figsize=(20,10))

    for index, value in enumerate(dfcovid.columns[1:]):
        # A word can occur on several rows (e.g. once per stacked year).
        # Add those rows up: building the dict straight from the rows would
        # keep only the last count seen for each word. Rows whose word is
        # missing are left out by groupby.
        top_dict = dfcovid.groupby('words')[value].sum().to_dict()
        wc.generate_from_frequencies(top_dict)
        plt.subplot(row,col,index+1)
        plt.imshow(wc,interpolation="bilinear")
        plt.axis("off")
        plt.title(f"{value}",fontsize=15)

    # Adjust the spacing and show from inside the function, so both calls act
    # on the figure that was just drawn.
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    plt.show()

#Plot the word cloud
plot_wordcloud(dfcovid,2,2)

# Sentiment

In [None]:
# Combine the songs of all five years into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# year's labels restart at 0, and code that uses an index label as a row
# position would keep re-reading the first rows of the frame.
df=pd.concat([df2017,df2018,df2019,df2020,df2021], ignore_index=True)
df

In [None]:
# Score the lemmatized lyrics of every song with VADER and attach the four
# scores (negative, neutral, positive, compound) as new columns.

#Initialize the model
sid = SentimentIntensityAnalyzer()

# Walk the lyrics column itself instead of looking rows up through df.index:
# df is a concatenation of the yearly frames, so its index labels can repeat,
# and using a label as a row position would score the wrong song.
song_scores = [sid.polarity_scores(lyrics) for lyrics in df['lemmatized_lyrics']]

negative = [score['neg'] for score in song_scores]
neutral = [score['neu'] for score in song_scores]
positive = [score['pos'] for score in song_scores]
compound = [score['compound'] for score in song_scores]

#Create 4 columns to the main data frame  for each score
df['negative'] = negative
df['neutral'] = neutral
df['positive'] = positive
df['compound'] = compound
# Discard the leading column and display the result.
df = df.iloc[:,1:]
df

In [None]:
# Scatter of every song's positive score against its negative score, one
# colour per year.
# The figure is created before drawing so that its size (in inches) applies
# to this plot; calling plt.figure() after the scatter calls would open a
# second, empty figure instead.
plt.figure(figsize=(10, 8))

for name, group in df.groupby('Year'):
    plt.scatter(group['positive'],group['negative'],label=name,s=3)

# Build the legend once, after all years have been drawn.
plt.legend(fontsize=10)
plt.title("Lyrics Sentiments by Year (Overall)")
plt.xlabel('Positive Valence')
plt.ylabel('Negative  Valence')
plt.show()

In [None]:
# Average the scores per year. numeric_only=True restricts the mean to the
# numeric columns; the frame also holds text columns (such as the lyrics),
# which pandas 2.x refuses to average.
means_df = df.groupby(['Year']).mean(numeric_only=True)
means_df

In [None]:
# One point per year: mean positive score against mean negative score.
ax = plt.gca()
for year, year_means in means_df.groupby('Year'):
    ax.scatter(year_means['positive'], year_means['negative'], label=year)
ax.legend()

ax.set_title("Lyrics Sentiments by Year")
ax.set_xlabel('Positive Valence')
ax.set_ylabel('Negative  Valence')
plt.show()

# Before Covid

In [None]:
# Combine the songs of the pre-covid years (2017-2019) into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# year's labels restart at 0, and code that uses an index label as a row
# position would keep re-reading the first rows of the frame.
df_befcovid=pd.concat([df2017,df2018,df2019], ignore_index=True)
df_befcovid

In [None]:
# Score the lemmatized lyrics of every pre-covid song with VADER and attach
# the four scores (negative, neutral, positive, compound) as new columns.

#Initialize the model
sid = SentimentIntensityAnalyzer()

# Walk the lyrics column itself instead of looking rows up through the index:
# df_befcovid is a concatenation of yearly frames, so its index labels can
# repeat, and using a label as a row position would score the wrong song.
song_scores = [
    sid.polarity_scores(lyrics) for lyrics in df_befcovid['lemmatized_lyrics']
]

negative = [score['neg'] for score in song_scores]
neutral = [score['neu'] for score in song_scores]
positive = [score['pos'] for score in song_scores]
compound = [score['compound'] for score in song_scores]

#Create 4 columns to the main data frame  for each score
df_befcovid['negative'] = negative
df_befcovid['neutral'] = neutral
df_befcovid['positive'] = positive
df_befcovid['compound'] = compound
# Discard the leading column and display the result.
df_befcovid = df_befcovid.iloc[:,1:]
df_befcovid

In [None]:
# Scatter of every pre-covid song's positive score against its negative
# score, one colour per year.
# The figure is created before drawing so that its size (in inches) applies
# to this plot; calling plt.figure() after the scatter calls would open a
# second, empty figure instead, and a 250 x 250 inch canvas is far too large
# to render sensibly.
plt.figure(figsize=(10, 8))

for name, group in df_befcovid.groupby('Year'):
    plt.scatter(group['positive'],group['negative'],label=name, s=3)

# Build the legend once, after all years have been drawn.
plt.legend(fontsize=10)
plt.title("Lyrics Sentiments by Year (2017-2019)")
plt.xlabel('Positive Valence')
plt.ylabel('Negative  Valence')
plt.show()

In [None]:
# Average the pre-covid scores per year. numeric_only=True restricts the mean
# to the numeric columns; the frame also holds text columns (such as the
# lyrics), which pandas 2.x refuses to average.
means_df_bef = df_befcovid.groupby(['Year']).mean(numeric_only=True)
means_df_bef

In [None]:
# One point per pre-covid year: mean positive score against mean negative
# score.
ax = plt.gca()
for year, year_means in means_df_bef.groupby('Year'):
    ax.scatter(year_means['positive'], year_means['negative'], label=year)
ax.legend()

ax.set_title("Lyrics Sentiments: Scatter plot of Positive Score and Negative Plot for means by Year (2017-2019)")
ax.set_xlabel('Positive Valence')
ax.set_ylabel('Negative  Valence')
plt.show()

In [None]:
# Label each pre-covid song with the name of its largest score among the
# negative, neutral and positive columns.
# NOTE(review): VADER's 'neu' share is often the largest of the three, so this
# rule may label most songs 'neutral'; thresholding the compound score is the
# usual alternative - confirm which is intended.
score_columns = ['negative', 'neutral','positive']
prediction = df_befcovid[score_columns].idxmax(axis=1)

In [None]:
# Attach the predicted labels as the 'Sentiment' column. Any existing
# 'Sentiment' column is removed first, so the new one is appended as the last
# column.
df_befcovid = df_befcovid.drop(columns=['Sentiment'], errors='ignore')
df_befcovid['Sentiment'] = list(prediction)

df_befcovid

In [None]:
# Bar chart of how many pre-covid songs fall in each sentiment class.
import seaborn as sns
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="Sentiment", data=df_befcovid)
plt.title("Lyrics Sentiments by Year (2017-2019) - Pre-covid")

# Write each bar's count just above it. Bar heights are floats, so they are
# formatted without decimals; otherwise a count of 12 is printed as "12.0".
for rect in ax.patches:
    height = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2,
        height + 0.75,
        f"{height:.0f}",
        horizontalalignment='center',
        fontsize=11,
    )

In [None]:
# Split the pre-covid songs by predicted sentiment and report the size of
# each class.
sentiment_labels = df_befcovid["Sentiment"]

Positive = df_befcovid[sentiment_labels == 'positive']
Negative = df_befcovid[sentiment_labels == 'negative']
Neutral = df_befcovid[sentiment_labels == 'neutral']

number_of_rows_positive = Positive.shape[0]
number_of_rows_negative = Negative.shape[0]
number_of_rows_neutral = Neutral.shape[0]

print("Number of positive rows:", number_of_rows_positive)
print("Number of negative rows:", number_of_rows_negative)
print("Number of neutral rows:", number_of_rows_neutral)

In [None]:
# Pie chart of the share of pre-covid songs in each sentiment class.
class_sizes = {
    "Positive": number_of_rows_positive,
    "Negative": number_of_rows_negative,
    "Neutral": number_of_rows_neutral,
}
mylabels = list(class_sizes)
rows = list(class_sizes.values())
colors=['#ADD8E6', '#FFC0CB', '#808000']

ax = plt.gca()
ax.pie(
    rows,
    labels=mylabels,
    colors=colors,
    startangle=90,
    explode=[0.1, 0.1, 0.1],  # pull every wedge slightly away from the centre
    autopct='%1.2f%%',
)
ax.set_title("Lyrics Sentiments by Year in Percentage(2017-2019) - Pre-covid")
ax.legend()
# Equal axis scaling keeps the pie circular.
ax.axis('equal')
plt.show()

# Covid

In [None]:
# Combine the songs of the covid years (2020-2021) into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# year's labels restart at 0, and code that uses an index label as a row
# position would keep re-reading the first rows of the frame.
df_covid=pd.concat([df2020,df2021], ignore_index=True)
df_covid

In [None]:
# Score the lemmatized lyrics of every covid-period song with VADER and
# attach the four scores (negative, neutral, positive, compound) as new
# columns.

#Initialize the model
sid = SentimentIntensityAnalyzer()

# Walk the lyrics column itself instead of looking rows up through the index:
# df_covid is a concatenation of yearly frames, so its index labels can
# repeat, and using a label as a row position would score the wrong song.
song_scores = [
    sid.polarity_scores(lyrics) for lyrics in df_covid['lemmatized_lyrics']
]

negative = [score['neg'] for score in song_scores]
neutral = [score['neu'] for score in song_scores]
positive = [score['pos'] for score in song_scores]
compound = [score['compound'] for score in song_scores]

#Create 4 columns to the main data frame  for each score
df_covid['negative'] = negative
df_covid['neutral'] = neutral
df_covid['positive'] = positive
df_covid['compound'] = compound
# Discard the leading column and display the result.
df_covid = df_covid.iloc[:,1:]
df_covid

In [None]:
df_covid

In [None]:
# Scatter of every covid-period song's positive score against its negative
# score, one colour per year.
# The figure is created before drawing so that its size (in inches) applies
# to this plot; calling plt.figure() after the scatter calls would open a
# second, empty figure instead, and a 250 x 250 inch canvas is far too large
# to render sensibly.
plt.figure(figsize=(10, 8))

for name, group in df_covid.groupby('Year'):
    plt.scatter(group['positive'],group['negative'],label=name, s=3)

# Build the legend once, after all years have been drawn.
plt.legend(fontsize=10)
plt.title("Lyrics Sentiments by Year (2020-2021) - Covid")
plt.xlabel('Positive Valence')
plt.ylabel('Negative  Valence')
plt.show()

In [None]:
# Average the covid-period scores per year. numeric_only=True restricts the
# mean to the numeric columns; the frame also holds text columns (such as the
# lyrics), which pandas 2.x refuses to average.
means_df_covid = df_covid.groupby(['Year']).mean(numeric_only=True)
means_df_covid

In [None]:
# One point per covid year: mean positive score against mean negative score.
ax = plt.gca()
for year, year_means in means_df_covid.groupby('Year'):
    ax.scatter(year_means['positive'], year_means['negative'], label=year)
ax.legend()

ax.set_title("Lyrics Sentiments: Scatter plot of Positive Score and Negative Plot for means by Year (2020-2021)")
ax.set_xlabel('Positive Valence')
ax.set_ylabel('Negative  Valence')
plt.show()

In [None]:
# Label each covid-period song with the name of its largest score among the
# negative, neutral and positive columns.
# NOTE(review): VADER's 'neu' share is often the largest of the three, so this
# rule may label most songs 'neutral'; thresholding the compound score is the
# usual alternative - confirm which is intended.
score_columns = ['negative', 'neutral','positive']
predictions = df_covid[score_columns].idxmax(axis=1)

In [None]:
# Attach the predicted labels as the 'Sentiment' column. Any existing
# 'Sentiment' column is removed first, so the new one is appended as the last
# column.
df_covid = df_covid.drop(columns=['Sentiment'], errors='ignore')
df_covid['Sentiment'] = list(predictions)

df_covid

In [None]:
# Bar chart of how many covid-period songs fall in each sentiment class.
import seaborn as sns
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="Sentiment", data=df_covid)
plt.title("Lyrics Sentiments by Year (2020-2021) - Covid")

# Write each bar's count just above it. Bar heights are floats, so they are
# formatted without decimals; otherwise a count of 12 is printed as "12.0".
for rect in ax.patches:
    height = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2,
        height + 0.75,
        f"{height:.0f}",
        horizontalalignment='center',
        fontsize=11,
    )

In [None]:
# Split the covid-period songs by predicted sentiment and report the size of
# each class.
sentiment_labels = df_covid["Sentiment"]

Positive = df_covid[sentiment_labels == 'positive']
Negative = df_covid[sentiment_labels == 'negative']
Neutral = df_covid[sentiment_labels == 'neutral']

number_of_rows_positive = Positive.shape[0]
number_of_rows_negative = Negative.shape[0]
number_of_rows_neutral = Neutral.shape[0]

print("Number of positive rows:", number_of_rows_positive)
print("Number of negative rows:", number_of_rows_negative)
print("Number of neutral rows:", number_of_rows_neutral)

In [None]:
# Pie chart of the share of covid-period songs in each sentiment class.
class_sizes = {
    "Positive": number_of_rows_positive,
    "Negative": number_of_rows_negative,
    "Neutral": number_of_rows_neutral,
}
mylabels = list(class_sizes)
rows = list(class_sizes.values())
colors=['#ADD8E6', '#FFC0CB', '#808000']

ax = plt.gca()
ax.pie(
    rows,
    labels=mylabels,
    colors=colors,
    startangle=90,
    explode=[0.1, 0.1, 0.1],  # pull every wedge slightly away from the centre
    autopct='%1.2f%%',
)
ax.legend()
ax.set_title("Lyrics Sentiments by Year in Percentage (2020-2021) - Covid")
# Equal axis scaling keeps the pie circular.
ax.axis('equal')
plt.show()