In [9]:
import seaborn as sns
import numpy as np
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.utils import resample
from clean_data import *

In [10]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

In [11]:
#Creating a correlation matrix
corr_matrix = feature_nums.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool))

#Creating a list of columns to drop with correlation > .95
to_drop = [column for column in upper.columns if any(upper[column]>0.95)]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(feature_nums, target, random_state=42)

In [13]:
#SMOTE Oversampling
X_resampled, y_resampled = SMOTE().fit_sample(feature_nums, target) 
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_resampled, y_resampled, random_state=42)

In [14]:
#MINORITY OVERSAMPLING
# Separate majority and minority classes
majority = final[final['is_hit']==0]
minority = final[final['is_hit']==1]
 
# Upsample minority class
minority_upsampled = resample(minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=788,    # to match majority class
                                 random_state=42) # reproducible results
 
# Combine majority class with upsampled minority class
upsampled = pd.concat([majority, minority_upsampled])
upsampled_target = upsampled['is_hit']
upsampled_features = upsampled.drop(columns=['Unnamed: 0', 'artist', 'album', 'song','features', 
                                   'lyrics', 'isrc', 'release_date', 'age',
                                   'single_release', 'is_hit', 'today', 'listeners', 'playcount', 'list_day', 'playcount_percentage'])


X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(upsampled_features, upsampled_target, random_state=42)


In [23]:
forest = RandomForestClassifier(n_estimators=100, max_depth= 5)
forest.fit(X_train_upsampled, y_train_upsampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
forest.score(X_test_upsampled, y_test_upsampled)

0.8197969543147208

In [38]:
param_grid = { 
    'n_estimators': [1,5,10,15,20,25,30,35,40,45,50],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [1,2,3,4,5,6,7,8,9,10,11],
    'criterion' :['gini', 'entropy']
}

In [39]:
CV_forest = GridSearchCV(estimator=forest, param_grid=param_grid, cv= 5)
CV_forest.fit(X_train_upsampled, y_train_upsampled)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [40]:
CV_forest.best_params_

{'criterion': 'gini',
 'max_depth': 11,
 'max_features': 'auto',
 'n_estimators': 45}

In [44]:
forest = RandomForestClassifier(criterion='gini',
 max_depth=11,
 max_features='auto',
 n_estimators=45)
forest.fit(X_train_upsampled, y_train_upsampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=45, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [45]:
forest.score(X_test_upsampled, y_test_upsampled)

0.9568527918781726

In [46]:
forest.estimators_[0].feature_importances_

array([0.06191815, 0.18489429, 0.09522104, 0.10702034, 0.07060748,
       0.07978163, 0.09146724, 0.09804329, 0.17922273, 0.03182381])

In [48]:
names = list(X_train.columns)

In [49]:
list(zip(names, forest.estimators_[0].feature_importances_))

[('danceability', 0.061918150671408434),
 ('energy', 0.18489429489718584),
 ('loudness', 0.09522103545351658),
 ('speechiness', 0.10702033786629651),
 ('liveness', 0.07060748222428287),
 ('tempo', 0.07978163384048768),
 ('valence', 0.09146723938886844),
 ('duration', 0.09804328609205043),
 ('track_no', 0.17922273163744581),
 ('unique-words', 0.03182380792845728)]

In [53]:
n = 1000
scores = []
for i in range (0, n):
    forest = RandomForestClassifier(criterion='gini',
 max_depth=11,
 max_features='auto',
 n_estimators=45)
    forest.fit(X_train_upsampled, y_train_upsampled)
    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
    score = forest.score(X_test, y_test)
    importance = forest.estimators_[0].feature_importances_
    important_features = list(zip(names, forest.estimators_[0].feature_importances_))
    scores.append([score, important_features])

In [54]:
score = 0
danceability = 0
loudness = 0 
speechiness =0
liveness =0
tempo =0
valence =0
duration =0
track_no =0
unique_words =0
    
for i in scores: 
    
    score += i[0]
    danceability += i[1][0][1]
    loudness += i[1][1][1]
    speechiness += i[1][2][1]
    liveness += i[1][3][1]
    tempo += i[1][4][1]
    valence += i[1][5][1]
    duration += i[1][6][1]
    track_no += i[1][7][1]
    unique_words += i[1][8][1]

In [57]:
rf_data = {'score': score/n, 'danceability': danceability/n, 'speechiness':speechiness/n, 'liveness': liveness/n, 'tempo': tempo/n, 'valence' : valence/n, 
       'duration': duration/n, 'track': track_no/n, 'unique_words': unique_words/n}



In [136]:
svm_clf = svm.SVC(kernel='linear', class_weight='balanced')
svm_clf.fit(X_train_upsampled, y_train_upsampled)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [137]:
svm_clf.score(X_test_upsampled, y_test_upsampled)

0.6802030456852792

In [58]:
rf_data

{'score': 0.9811158798283226,
 'danceability': 0.09984516252440036,
 'speechiness': 0.11823723225365909,
 'liveness': 0.085412416800855,
 'tempo': 0.09055071256662733,
 'valence': 0.086432859399601,
 'duration': 0.08645431699773588,
 'track': 0.09076274512318237,
 'unique_words': 0.16466914903389165}