In [1]:
import pandas as pd
import re
import librosa
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia
import numpy as np

# Lyrics Analysis

The goal of this notebook is to determine and visually see the sentiment (positive, negative, neutral) of a song based on audio features.

In [2]:
#import and clean data
lyrics = pd.read_csv("../data/lyrics.csv")
lyrics = lyrics.loc[~(lyrics["lyrics"].isnull())]
#clean lyrics to get rid of punctuation, annotations
lyrics["lyrics"] = lyrics["lyrics"].apply(lambda x: re.sub(r"\({1}.+\){1}", "", str(x.lower().replace(",", " ").replace(".", "").replace("  ", " ").replace("[", "").replace("]", "").replace('"', ""))))
lyrics = lyrics.loc[lyrics["cover"]==False]
audiopaths = pd.read_csv("../data/metadata/paths.csv").drop("Unnamed: 0", axis=1)
info = lyrics.merge(audiopaths, left_on="title", right_on="WikiSong")


In [78]:
info.head(2)

Unnamed: 0,title,album,year,lyrics,cover,songwriters_parsed,vocals_parsed,WikiSong,MetaPath,AudioPath
0,Across the Universe,Let It Be,1968,words are flowing out like endless rain into a...,False,"[""John Lennon""]","[""Lennon""]",Across the Universe,Across the Universe_beats.csv,Across The Universe (#2).mp3
1,All I've Got to Do,With the Beatles,1963,whenever i want you around yeah all i gotta do...,False,"[""John Lennon""]","[""Lennon""]",All I've Got to Do,All I've Got To Do_beats.csv,All I've Got To Do.mp3


## Lyrics Analysis

First, we analyse the sentiment scores of each word. In order to do that, we use nltk - vader model.

In [88]:
lyrics = info["lyrics"]

To begin basic analysis of the lyrics, first we look at the actual words.

In [590]:
# eliminate of common words

common = pd.read_csv('../data/stop_words.csv')
stop_words = common["words"].tolist()

In [594]:
from collections import Counter

def most_common(lst):
    data = Counter(lst)
    return data.most_common(1)[0][0]

def stop(lst):
    arr = list(filter(None, lst))
    arr = [word for word in arr if word.strip() not in stop_words]
    return arr

In [600]:
wordlist = lyrics.apply(lambda x: stop(x.replace(";", "").replace("?", "").replace(",", "").split())).tolist()
words = []
for ls in wordlist:
    words+=ls

In [629]:
from collections import OrderedDict
from operator import itemgetter 

wordsdf = pd.DataFrame({"words": words}).reset_index()
wordsdf = wordsdf.groupby("words").count().reset_index().sort_values("index", ascending=False).reset_index(drop=True).head(15)

import plotly.express as px

fig = px.bar(wordsdf, x='words', y='index')
fig.update_layout(title_text='Beatles Most Common Words', yaxis_title="count")
fig.show()


We determine the polarity scores, and only take the compound score as that is what we will be using to quantify sentiment.

In [642]:
hal = sia()
sentiment = []
pos = []
neg = []
for lyric in lyrics:
    ps = hal.polarity_scores(lyric)
    sentiment.append(ps["compound"])
    pos.append(ps["pos"])
    neg.append(ps["neg"])
info["sentiment"] = sentiment
info["pos"] = pos
info["neg"] = neg

In [646]:
import plotly.graph_objects as go
fig = go.Figure(data=go.Scatter(x=info['year'],
                                y=info['sentiment'],
                                mode='markers',
                                marker_color=info['year'],
                                text=info['title']))

fig.update_layout(title='Beatles Sentiment Score')
fig.show()

In [649]:
fig = go.Figure(data=go.Scatter(x=info['pos'],
                                y=info['neg'],
                                mode='markers',
                                marker_color=info['year'],
                                text=info['title']))

fig.update_layout(title='Beatles Positive v. Negative Score', yaxis_title="negative", xaxis_title="positive")
fig.show()

In [651]:
sentiment_info = info[["title", "AudioPath", "sentiment", "year"]]
sentiment_info.head()

Unnamed: 0,title,AudioPath,sentiment,year
0,Across the Universe,Across The Universe (#2).mp3,0.9313,1968
1,All I've Got to Do,All I've Got To Do.mp3,0.9623,1963
2,All My Loving,All My Loving.mp3,0.9888,1963
3,All You Need Is Love,All You Need Is Love.mp3,0.9995,1967
4,And I Love Her,And I Love Her.mp3,0.997,1964


## Loading mp3 Files
Next, we will load the mp3 files in order to begin the analysis. Then, we will extract mfccs and tempo from the audio files using librosa.

In [653]:
mfccs = []
tempos = []
for song in range(sentiment_info.shape[0]):
    load = librosa.load('Beatles/tracks/' + sentiment_info["AudioPath"].iloc[song], duration=50)
    mfcc = librosa.feature.mfcc(y=load[0], sr=load[1], n_mfcc=12)
    mfccs.append(mfcc)
    oenv = librosa.onset.onset_strength(y=load[0], sr=load[1], hop_length=512)
    tempo = librosa.beat.tempo(onset_envelope=oenv, sr=load[1])
    tempos.append(tempo)
sentiment_info["mfccs"] = mfccs
sentiment_info["tempos"] = tempos


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread in


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread in


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread instead.


PySoundFile failed. Trying audioread in

In [654]:
sentiment_info["mfccs"] = sentiment_info["mfccs"].apply(lambda x: x.T)
sentiment_info["tempos"] = sentiment_info["tempos"].apply(lambda x: x[0])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [658]:
fig = go.Figure(data=go.Scatter(x=sentiment_info['tempos'],
                                y=sentiment_info['sentiment'],
                                mode='markers',
                                marker_color=info['year'],
                                text=info['title']))

fig.update_layout(title='Beatles Tempo v. Sentiment', yaxis_title="sentiment", xaxis_title="tempo")
fig.show()

It seems like there's not obvious correlation between tempo and and sentiment. So, we will move forward with our analysis by using the mfcc.

For the sake of this analysis, any sentiment above 0.34 is considered positive, and anything below -0.33 is negative. Any sentiment between -0.33 and 0.34 is considered neutral. 0 will be negative, 1 will be neutral, and 2 will be positive.

In [455]:
sentiment_info["sentiment"] = sentiment_info["sentiment"].apply(classify_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [453]:
def classify_sentiment(x):
    if x <= -0.33:
        return 0
    elif x >= 0.34:
        return 2
    else:
        return 1

## Train Model
We will train our model using scikit learn.

Firs, we will preprocess our data.

In [496]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
import sklearn

In [458]:
def scale_col(arr):
    scale = StandardScaler()
    scale.fit(arr)
    return scale.transform(arr)

In [459]:
sentiment_info["mfccs"] = sentiment_info["mfccs"].apply(scale_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [460]:
X_train, X_test, y_train, y_test = train_test_split(sentiment_info[["tempos", "mfccs"]], sentiment_info["sentiment"], test_size=0.33)

Next, we will fit our model.

In [570]:
def parse_labels(df):
    mfccs = df["mfccs"]
    labels = []
    length = len(df["mfccs"].iloc[0])
    for row in range(len(mfccs)):
        ls = [i * df.iloc[row]["sentiment"] for i in np.ones(length).tolist()]
        labels += ls
    return labels
    

In [576]:
def parse_features(df):
    features = []
    for row in range(len(df["mfccs"])):
        features += df["mfccs"].iloc[row].tolist()
    return features

In [None]:
X_train["sentiment"] = y_train

In [548]:
features = np.asarray(parse_features(X_train))
labels = np.asarray(parse_labels(X_train))

In [None]:
#warning: cell below takes a few hours to run

In [559]:
# model = sklearn.svm.SVC()
# model.fit(features, labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [567]:
X_test["sentiment"] = y_test

In [573]:
test_labels = parse_labels(X_test)

In [577]:
test_features = parse_features(X_test)

In [580]:
pred = model.predict(test_features)

Result is below.

In [639]:
(pred == test_labels).mean()

0.6717466609527891