In [108]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.multioutput import ClassifierChain

In [109]:
songs_dataset = pd.read_json('MasterSongList.json')

In [110]:
audio_feature_list = []
for audio_feature in songs_dataset['audio_features']:
    audio_feature_list.append(audio_feature)

In [111]:
audio_feature_header = ['key','energy','liveliness','tempo','speechiness','acousticness','instrumentalness','time_signature'
                         ,'duration','loudness','valence','danceability','mode','time_signature_confidence','tempo_confidence'
                         ,'key_confidence','mode_confidence']

In [112]:
df_feature =  pd.DataFrame(audio_feature_list, columns = audio_feature_header)
df_feature.loc[:,].dropna(axis=0,how='any',inplace=True)

In [113]:
df_feature.head()
df_feature.shape

(30296, 17)

In [114]:
df_feature['lyrics_features'] = songs_dataset['lyrics_features']
df_feature['moods'] = songs_dataset['moods']

Clearn Lyrics

In [115]:
lyrics_list = []
lyrics_column = df_feature['lyrics_features']
for lyrics in lyrics_column:
    lyrics_list.append(' '.join(lyrics))

#checked
print(len(lyrics_list))

30296


In [116]:
# Clean word function from lab3
import string
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

translator = str.maketrans('', '', punctuation)
stemmer = SnowballStemmer('english')

def clean_text(raw_text):
    clean_words = []
    raw_text.lower()
    raw_text = raw_text.translate(translator)
    split_words = raw_text.split()
    
    for word in split_words:
        if word not in ENGLISH_STOP_WORDS:
            word = stemmer.stem(word)
            clean_words.append(word)
            
    return ' '.join(clean_words)

In [117]:
lyrics_list_cleaned = []
for lyrics in lyrics_list:
    lyrics_list_cleaned.append(clean_text(lyrics))

In [118]:
df_feature['lyrics_features'] = lyrics_list_cleaned

In [119]:
df_feature

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,lyrics_features,moods
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,0.000001,0.0,4.0,218.30667,-3.890,0.752186,0.726920,0.552,0.541,1.000,1.000,oppa gangnam style gangnam style najeneun ttas...,"[energetic, motivational]"
1,6.0,0.745704,0.119955,100.008,0.046255,0.026230,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.000,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,5.0,0.709932,0.231455,130.030,0.121741,0.036662,0.000000,0.0,4.0,232.46104,-5.150,0.374390,0.704729,0.565,0.565,0.743,1.000,parti rock yeah woo let s parti rock hous toni...,"[happy, celebratory, rowdy]"
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.000000,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.000,0.742,alagamun lan weh wakun heya hanun gon alagamun...,"[happy, energetic, celebratory]"
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.860,0.585630,0.730711,0.271,0.324,0.822,1.000,j lo s new generat mr worldwid parti peopl flo...,[energetic]
5,8.0,0.733856,0.093043,174.952,0.058137,0.293325,0.000000,0.0,4.0,188.94281,-4.312,0.953424,0.785102,0.177,0.404,0.826,1.000,today don t feel like do just wanna lay bed do...,"[happy, sprightly]"
6,8.0,0.777375,0.054104,104.946,0.029302,0.130350,0.000000,1.0,4.0,228.29333,-5.112,0.525632,0.729051,0.817,0.672,0.394,0.963,s start heart reach fever pitch s bring dark f...,[warm]
7,7.0,0.585564,0.108297,120.014,0.038924,0.011707,0.000005,1.0,4.0,193.57333,-6.583,0.622176,0.781822,0.970,0.861,0.792,1.000,threw wish don t ask ll tell look fell way tra...,"[energetic, motivational]"
8,0.0,0.418212,0.105322,129.054,0.045461,0.548385,0.000120,0.0,4.0,248.28000,-7.284,0.440790,0.841824,0.011,0.303,0.843,1.000,think like said felt happi die told right felt...,"[seductive, nocturnal]"
9,4.0,0.814030,0.079196,124.991,0.072440,0.005355,0.000000,1.0,4.0,198.25333,-1.985,0.838512,0.700860,0.429,0.375,0.740,1.000,don t know turn head walk o don t need make co...,"[happy, celebratory]"


Lyrics features and audio features

# Audio features moods

In [120]:
y = df_feature['moods']
df_audio_moods = df_feature.drop(['lyrics_features', 'moods'], axis = 1)
df_audio_moods.reset_index(drop = True, inplace = True)
X = df_audio_moods
X.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0


In [121]:
y

0                             [energetic, motivational]
1                                               [happy]
2                           [happy, celebratory, rowdy]
3                       [happy, energetic, celebratory]
4                                           [energetic]
5                                    [happy, sprightly]
6                                                [warm]
7                             [energetic, motivational]
8                                [seductive, nocturnal]
9                                  [happy, celebratory]
10                                          [seductive]
11                             [energetic, celebratory]
12                          [happy, celebratory, campy]
13                            [energetic, motivational]
14                                          [energetic]
15                                 [happy, celebratory]
16                                              [happy]
17                                           [so

In [122]:
mlb = MultiLabelBinarizer()

In [123]:
y_audio_moods = mlb.fit_transform(y)

In [124]:
print(y_audio_moods)
print(y_audio_moods.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(30296, 32)


In [125]:
print(mlb.classes_)
print(y_audio_moods[0])
print(y.iloc[0])

['aggressive' 'angsty' 'atmospheric' 'campy' 'celebratory' 'classy'
 'cocky' 'cold' 'earthy' 'energetic' 'funky' 'gloomy' 'happy' 'hypnotic'
 'introspective' 'lush' 'mellow' 'motivational' 'nocturnal' 'raw' 'rowdy'
 'sad' 'seductive' 'sexual' 'soothing' 'spacey' 'sprightly' 'sweet'
 'trashy' 'trippy' 'visceral' 'warm']
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
['energetic', 'motivational']


In [126]:
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)

In [127]:
y = y_audio_moods
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [128]:
model_rf = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model_rf.fit(X_train, y_train)
model_rf_predictions = model_rf.predict(X_test)
print(classification_report(y_test, model_rf_predictions))

             precision    recall  f1-score   support

          0       0.65      0.22      0.33       396
          1       0.56      0.06      0.10       268
          2       0.28      0.02      0.03       392
          3       0.00      0.00      0.00       181
          4       0.32      0.03      0.06       351
          5       0.50      0.01      0.01       277
          6       0.11      0.01      0.01       316
          7       0.18      0.01      0.03       277
          8       0.56      0.02      0.03       284
          9       0.43      0.05      0.10       591
         10       0.49      0.08      0.14       624
         11       1.00      0.00      0.01       230
         12       0.14      0.01      0.01       459
         13       0.50      0.01      0.03       217
         14       0.20      0.00      0.01       354
         15       0.36      0.01      0.02       411
         16       0.31      0.02      0.04       867
         17       0.10      0.00      0.01   

  'precision', 'predicted', average, warn_for)


Need Optimize

# Lyrics features moods

In [129]:
lyrics_features = df_feature['lyrics_features']
moods =  df_feature['moods']
df_lyrics_moods = pd.concat([lyrics_features, moods], axis=1)

In [130]:
df_lyrics_moods.dropna(how='any', inplace = True)
df_lyrics_moods.reset_index(drop = True, inplace = True)

In [131]:
df_lyrics_moods.head()

Unnamed: 0,lyrics_features,moods
0,oppa gangnam style gangnam style najeneun ttas...,"[energetic, motivational]"
1,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,parti rock yeah woo let s parti rock hous toni...,"[happy, celebratory, rowdy]"
3,alagamun lan weh wakun heya hanun gon alagamun...,"[happy, energetic, celebratory]"
4,j lo s new generat mr worldwid parti peopl flo...,[energetic]


In [132]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_lyrics_moods['moods'])

In [133]:
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [134]:
print(mlb.classes_)
print(y[3])

['aggressive' 'angsty' 'atmospheric' 'campy' 'celebratory' 'classy'
 'cocky' 'cold' 'earthy' 'energetic' 'funky' 'gloomy' 'happy' 'hypnotic'
 'introspective' 'lush' 'mellow' 'motivational' 'nocturnal' 'raw' 'rowdy'
 'sad' 'seductive' 'sexual' 'soothing' 'spacey' 'sprightly' 'sweet'
 'trashy' 'trippy' 'visceral' 'warm']
[0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [135]:
X = df_lyrics_moods['lyrics_features']

In [136]:
tf_idf_vect = TfidfVectorizer()
tf_idf = tf_idf_vect.fit_transform(X)
print(tf_idf)

  (0, 41100)	0.22904668582491755
  (0, 22918)	0.34712706604442706
  (0, 54691)	0.16079154686002542
  (0, 38612)	0.04083847835816788
  (0, 58705)	0.04083847835816788
  (0, 28489)	0.02041923917908394
  (0, 63440)	0.15269779054994503
  (0, 31135)	0.04083847835816788
  (0, 25582)	0.02041923917908394
  (0, 63469)	0.02041923917908394
  (0, 2725)	0.05597523195428372
  (0, 45739)	0.02041923917908394
  (0, 28583)	0.04083847835816788
  (0, 5040)	0.03817444763748626
  (0, 40850)	0.04083847835816788
  (0, 52027)	0.03928012028579997
  (0, 58728)	0.02041923917908394
  (0, 23530)	0.09820030071449994
  (0, 5120)	0.02041923917908394
  (0, 38737)	0.10984826869535502
  (0, 49707)	0.20419239179083942
  (0, 39211)	0.02041923917908394
  (0, 51948)	0.02041923917908394
  (0, 29782)	0.018308044782559173
  (0, 62728)	0.02041923917908394
  :	:
  (30281, 37474)	0.06666541138815388
  (30281, 2308)	0.8515629629375522
  (30291, 26342)	0.7403726090459047
  (30291, 4559)	0.057497533147698704
  (30291, 60460)	0.0260677

In [137]:
# model_rf = RandomForestClassifier(n_estimators=5,min_samples_split=2, max_features='log2')
X_train, X_test, y_train, y_test = train_test_split(tf_idf, y, test_size=0.2)
# model_rf.fit(X_train, y_train)
# model_rf_predictions = model_rf.predict(X_test)

# print(classification_report(y_test, model_rf_predictions))

Need optimize

In [138]:
#try to use ClassifierChain + RFC

In [139]:
chain = ClassifierChain(RandomForestClassifier())

In [140]:
chain.fit(X_train, y_train)
predictions = chain.predict(X_test)

In [141]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.30      0.01      0.02       385
          1       0.00      0.00      0.00       276
          2       0.14      0.00      0.00       419
          3       0.00      0.00      0.00       183
          4       0.33      0.01      0.02       359
          5       0.60      0.01      0.02       289
          6       0.29      0.02      0.04       305
          7       0.10      0.00      0.01       283
          8       0.25      0.00      0.01       311
          9       0.42      0.02      0.04       605
         10       0.50      0.02      0.04       598
         11       0.00      0.00      0.00       244
         12       0.25      0.01      0.02       438
         13       0.00      0.00      0.00       183
         14       0.00      0.00      0.00       323
         15       0.33      0.01      0.01       384
         16       0.30      0.01      0.02       827
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


In [142]:
pickle.dump(chain, open('moods_features.pickle', 'wb'))

In [143]:
#testing 
test = 1
print(mlb.classes_)
print(chain.predict(tf_idf[test]))
pd.DataFrame(data = chain.predict(tf_idf[test]),columns = mlb.classes_)

['aggressive' 'angsty' 'atmospheric' 'campy' 'celebratory' 'classy'
 'cocky' 'cold' 'earthy' 'energetic' 'funky' 'gloomy' 'happy' 'hypnotic'
 'introspective' 'lush' 'mellow' 'motivational' 'nocturnal' 'raw' 'rowdy'
 'sad' 'seductive' 'sexual' 'soothing' 'spacey' 'sprightly' 'sweet'
 'trashy' 'trippy' 'visceral' 'warm']
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]]


Unnamed: 0,aggressive,angsty,atmospheric,campy,celebratory,classy,cocky,cold,earthy,energetic,...,seductive,sexual,soothing,spacey,sprightly,sweet,trashy,trippy,visceral,warm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
