In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import datetime
from datetime import date
import re
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier



In [2]:
spotify_lyrics = pd.read_csv('./lyricspotify.csv')
missed = pd.read_csv('./lastfmmissed.csv')
lastfm = pd.read_csv('./lastfm.csv')

In [3]:
missed = missed[missed.columns[-8:]]

In [4]:
lastfm = lastfm[lastfm.columns[1:]]

In [5]:
last_fm_total = pd.concat([lastfm,missed])

In [6]:
last_fm_total = last_fm_total.rename(columns={'track-title':'song'})

In [7]:
last_fm_total['artist'] = last_fm_total['artist'].apply(lambda x: x.lower())
last_fm_total['artist'] = last_fm_total['artist'].apply(lambda x: x.replace(' ', '-'))

In [8]:
spotify_lyrics.drop_duplicates(subset=['song'], keep='first', inplace=True)

In [9]:
final = spotify_lyrics.merge(last_fm_total, on=['song', 'artist'])

In [10]:
final['playcount_percentage'] = final.groupby('album')['playcount'].apply(lambda x: x/(x.sum()))

In [11]:
final['is_hit'] = final['playcount_percentage'] >= .15

In [12]:
final = final[~final['danceability'].isna()]
final = final[~final['lyrics'].isna()]

In [13]:
def count_unique_words(lyrics):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(lyrics) 
    no_stop_words_lyrics = [w for w in word_tokens if not w in stop_words] 
    unique = set(no_stop_words_lyrics)
    return len(unique)
    

In [14]:
for idx, row in final.iterrows():
  final.loc[idx, 'unique-words'] = count_unique_words(row['lyrics'])

In [15]:
final = final[~final.album.str.contains("Hits")]
final = final[~final.artist.str.contains("the-beatles")]
final = final[~final.artist.str.contains('michael-jackson')]

In [16]:
#GET RID OF NULL DATES
final = final[final.release_date.notnull()]
for idx, row in final.iterrows():
    final['today']=datetime.datetime.today().strftime('%Y-%m-%d')

In [17]:
final['age']=np.nan
for idx, row in final.iterrows():
    date=None
    today=None
    try:
        date = datetime.datetime.strptime(final.loc[idx,'release_date'],"%Y-%m-%d")
        today = datetime.datetime.strptime(final.loc[idx,'today'],"%Y-%m-%d")
        final.loc[idx,'age']=abs((today-date).days)
    except:
        pass

In [18]:
final['list_day']=np.nan
for idx,row in final.iterrows():
    days=None
    listnr=None
    days=final.loc[idx,'age']
    listnr=final.loc[idx, 'listeners']
    final.loc[idx,'list_day']=listnr/days

In [19]:
final = final[~final['age'].isna()]

In [20]:
feature_nums = final.drop(columns=['Unnamed: 0', 'artist', 'album', 'song','features', 
                                   'lyrics', 'isrc', 'release_date', 'age',
                                   'single_release', 'is_hit', 'today', 'listeners', 'playcount', 'list_day', 'playcount_percentage'])


In [21]:
target = final['is_hit']

In [22]:
#Creating a correlation matrix
corr_matrix = feature_nums.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool))

#Creating a list of columns to drop
to_drop = [column for column in upper.columns if any(upper[column]>0.95)]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(feature_nums, target, random_state=42)

In [24]:
forest = RandomForestClassifier(n_estimators=100, max_depth= 5)
forest.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
forest.score(X_test, y_test)

0.8626609442060086

In [26]:
forest.estimators_[0].feature_importances_

array([0.25895269, 0.        , 0.0397694 , 0.05007362, 0.        ,
       0.14768945, 0.10083111, 0.14829326, 0.1717046 , 0.08268586])

In [28]:
names = list(X_train.columns)

In [29]:
list(zip(names, forest.estimators_[0].feature_importances_))

[('danceability', 0.25895269241316027),
 ('energy', 0.0),
 ('loudness', 0.039769404161927785),
 ('speechiness', 0.050073615313070606),
 ('liveness', 0.0),
 ('tempo', 0.14768944767910916),
 ('valence', 0.10083110914070047),
 ('duration', 0.1482932643016889),
 ('track_no', 0.17170460244408353),
 ('unique-words', 0.08268586454625931)]

In [None]:
scores = []
for i in range (0, n):
    forest = RandomForestClassifier(n_estimators=100, max_depth= 5)
    forest.fit(X_train, y_train)
    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
    score = forest.score(X_test, y_test)
    importance = forest.estimators_[0].feature_importances_
    important_features = list(zip(names, forest.estimators_[0].feature_importances_))
    scores.append([score, important_features])

In [None]:
score = 0
danceability = 0
loudness = 0 
speechiness =0
liveness =0
tempo =0
valence =0
duration =0
track_no =0
unique_words =0
    
for i in scores: 
    
    score += i[0]
    danceability += i[1][0][1]
    loudness += i[1][1][1]
    speechiness += i[1][2][1]
    liveness += i[1][3][1]
    tempo += i[1][4][1]
    valence += i[1][5][1]
    duration += i[1][6][1]
    track_no += i[1][7][1]
    unique_words += i[1][8][1]

In [None]:
score/10

In [None]:
print({'score': score/n, 'danceability': danceability/n, 'speechiness':speechiness/n, 'liveness': liveness/n, 'tempo': tempo/n, 'valence' : valence/n, 
       'duration': duration/n, 'track': track_no/n, 'unique_words': unique_words/n})




In [55]:
n = 1000