# Get the MVP - the mood of the song

In [91]:
import pandas as pd
import numpy as np

from textblob import TextBlob

In [92]:
data = pd.read_csv('./data_top10c_more_lyrics.csv')

In [93]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,Position,Streams,Track Name,Artist,ID,Date,Year,Month,Day,Country,Region,Lyrics,Acousticness,Energy,Instrumentalness,Mode,Tempo,Valence
0,0,177,40381,Bye Bye Bye,*NSYNC,4r8lRYnoOGdEi6YyI5OC1o,2017-10-05,2017,10,5,gb,eu,"hey, hey bye bye bye, bye bye bye bye i'm doi...",0.0408,0.928,0.00104,0.0,172.656,0.879
1,1,151,24132,"Merry Christmas, Happy Holidays",*NSYNC,15coTBAzEN1bOeipoNDZAR,2017-12-23,2017,12,23,it,eu,merry christmas and happy holidays merry chris...,0.103,0.939,0.0,1.0,105.003,0.756
2,2,78,49766,"Merry Christmas, Happy Holidays",*NSYNC,15coTBAzEN1bOeipoNDZAR,2017-12-24,2017,12,24,it,eu,merry christmas and happy holidays merry chris...,0.103,0.939,0.0,1.0,105.003,0.756


Filter out the columns that wont change (for each song/one song per row)-> predict mood (happy , neutral, sad) for song.<BR />
The columns will be:
- Track Name (general info)
- Artist (general info)
- ID (general info)
- Lyrics (we need to performe NLP)
- Acousticness (make sure numerical)
- Energy (make sure numerical)
- Instrumentalness (make sure numerical)
- Mode (make sure numerical)
- Tempo (make sure numerical)
- Valence (make sure numerical)

In [94]:
# Drop rows that are duplicates and keep only one row for each song
data_per_song = data.drop_duplicates(subset=['Track Name'], keep='first')

In [95]:
data_per_song.head(3)

Unnamed: 0.1,Unnamed: 0,Position,Streams,Track Name,Artist,ID,Date,Year,Month,Day,Country,Region,Lyrics,Acousticness,Energy,Instrumentalness,Mode,Tempo,Valence
0,0,177,40381,Bye Bye Bye,*NSYNC,4r8lRYnoOGdEi6YyI5OC1o,2017-10-05,2017,10,5,gb,eu,"hey, hey bye bye bye, bye bye bye bye i'm doi...",0.0408,0.928,0.00104,0.0,172.656,0.879
1,1,151,24132,"Merry Christmas, Happy Holidays",*NSYNC,15coTBAzEN1bOeipoNDZAR,2017-12-23,2017,12,23,it,eu,merry christmas and happy holidays merry chris...,0.103,0.939,0.0,1.0,105.003,0.756
43,43,147,43037,Douce Nuit,-M-,4EOJWkvkVDpkZrhC8iTDsI,2017-12-24,2017,12,24,fr,eu,,0.914,0.227,0.163,1.0,81.887,0.0498


In [96]:
# Drop all columns that might change per song
mvp_data = data_per_song.drop(['Unnamed: 0', 'Position', 'Streams', 'Date', 'Year', 'Month', 'Day', 'Country', 'Region'], axis=1)

In [98]:
mvp_data.head(3)

Unnamed: 0,Track Name,Artist,ID,Lyrics,Acousticness,Energy,Instrumentalness,Mode,Tempo,Valence
0,Bye Bye Bye,*NSYNC,4r8lRYnoOGdEi6YyI5OC1o,"hey, hey bye bye bye, bye bye bye bye i'm doi...",0.0408,0.928,0.00104,0.0,172.656,0.879
1,"Merry Christmas, Happy Holidays",*NSYNC,15coTBAzEN1bOeipoNDZAR,merry christmas and happy holidays merry chris...,0.103,0.939,0.0,1.0,105.003,0.756
43,Douce Nuit,-M-,4EOJWkvkVDpkZrhC8iTDsI,,0.914,0.227,0.163,1.0,81.887,0.0498


In [99]:
mvp_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6919 entries, 0 to 578929
Data columns (total 10 columns):
Track Name          6919 non-null object
Artist              6919 non-null object
ID                  6919 non-null object
Lyrics              4190 non-null object
Acousticness        6918 non-null float64
Energy              6918 non-null float64
Instrumentalness    6918 non-null float64
Mode                6918 non-null float64
Tempo               6918 non-null float64
Valence             6918 non-null float64
dtypes: float64(6), object(4)
memory usage: 594.6+ KB


In [None]:
# seem to be the right data type for the last 6 columns

In [100]:
len(mvp_data)

6919

In [101]:
mvp_data['Lyrics'].isnull().sum()

2729

In [102]:
2729/6919

0.39442115912704145

I am missing lyrics for 39% of my songs (I have lyrics for 61% of my songs)

In [103]:
# Turn mvp_data['Lyrics'] into sting
mvp_data['Lyrics'] = mvp_data['Lyrics'].astype(str)

Source: https://stackoverflow.com/questions/43485469/apply-textblob-in-for-each-row-of-a-dataframe

In [104]:
def sentiment_calc(text):
    try:
        return TextBlob(text).sentiment
    except:
        return None

mvp_data['pol_sub'] = mvp_data['Lyrics'].apply(sentiment_calc)

In [105]:
# look at the new column with Polarity and Subjectivity from TextBlob
# Polarity (-1 to +1) (negative, positive)
# Subjectivity (0 to 1) (objective, subjective)
mvp_data['pol_sub'].head(3)

0     (-0.044454619454619454, 0.5908017908017905)
1        (0.5831501831501833, 0.6706959706959708)
43                                     (0.0, 0.0)
Name: pol_sub, dtype: object

In [106]:
mvp_data.head(3)

Unnamed: 0,Track Name,Artist,ID,Lyrics,Acousticness,Energy,Instrumentalness,Mode,Tempo,Valence,pol_sub
0,Bye Bye Bye,*NSYNC,4r8lRYnoOGdEi6YyI5OC1o,"hey, hey bye bye bye, bye bye bye bye i'm doi...",0.0408,0.928,0.00104,0.0,172.656,0.879,"(-0.044454619454619454, 0.5908017908017905)"
1,"Merry Christmas, Happy Holidays",*NSYNC,15coTBAzEN1bOeipoNDZAR,merry christmas and happy holidays merry chris...,0.103,0.939,0.0,1.0,105.003,0.756,"(0.5831501831501833, 0.6706959706959708)"
43,Douce Nuit,-M-,4EOJWkvkVDpkZrhC8iTDsI,,0.914,0.227,0.163,1.0,81.887,0.0498,"(0.0, 0.0)"


In [107]:
# Change order of the columns
mvp_data = mvp_data[['Track Name', 'Artist', 'ID', 'Lyrics', 'pol_sub', 'Acousticness', 'Energy', 'Instrumentalness',
                     'Mode', 'Tempo', 'Valence']]

In [108]:
mvp_data.head(3)

Unnamed: 0,Track Name,Artist,ID,Lyrics,pol_sub,Acousticness,Energy,Instrumentalness,Mode,Tempo,Valence
0,Bye Bye Bye,*NSYNC,4r8lRYnoOGdEi6YyI5OC1o,"hey, hey bye bye bye, bye bye bye bye i'm doi...","(-0.044454619454619454, 0.5908017908017905)",0.0408,0.928,0.00104,0.0,172.656,0.879
1,"Merry Christmas, Happy Holidays",*NSYNC,15coTBAzEN1bOeipoNDZAR,merry christmas and happy holidays merry chris...,"(0.5831501831501833, 0.6706959706959708)",0.103,0.939,0.0,1.0,105.003,0.756
43,Douce Nuit,-M-,4EOJWkvkVDpkZrhC8iTDsI,,"(0.0, 0.0)",0.914,0.227,0.163,1.0,81.887,0.0498


In [None]:
mvp_data['pol_sub'] = mvp_data['pol_sub'].to_string()

In [None]:
mvp_data['pol_sub'] = mvp_data['pol_sub'].str.replace('(', '')

In [None]:
mvp_data['pol_sub'] = mvp_data['pol_sub'].str.replace(',', '')

In [90]:
# the computer ran out of memory here......
# Split the pol_sub column into 2 columns
# df['A'], df['B'] = 
mvp_data['pol_sub'][0].str.split(' ')

0         [0, , , , , , , , , , , -0.044454619454619454,...
1         [0, , , , , , , , , , , -0.044454619454619454,...
43        [0, , , , , , , , , , , -0.044454619454619454,...
44        [0, , , , , , , , , , , -0.044454619454619454,...
47        [0, , , , , , , , , , , -0.044454619454619454,...
48        [0, , , , , , , , , , , -0.044454619454619454,...
50        [0, , , , , , , , , , , -0.044454619454619454,...
64        [0, , , , , , , , , , , -0.044454619454619454,...
71        [0, , , , , , , , , , , -0.044454619454619454,...
97        [0, , , , , , , , , , , -0.044454619454619454,...
101       [0, , , , , , , , , , , -0.044454619454619454,...
162       [0, , , , , , , , , , , -0.044454619454619454,...
199       [0, , , , , , , , , , , -0.044454619454619454,...
236       [0, , , , , , , , , , , -0.044454619454619454,...
271       [0, , , , , , , , , , , -0.044454619454619454,...
343       [0, , , , , , , , , , , -0.044454619454619454,...
367       [0, , , , , , , , , , , -0.044