In [91]:
import seaborn as sns
import numpy as np
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.utils import resample
from clean_data import *

In [92]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

In [93]:
#Creating a correlation matrix
corr_matrix = feature_nums.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool))

#Creating a list of columns to drop with correlation > .95
to_drop = [column for column in upper.columns if any(upper[column]>0.95)]

In [94]:
X_train, X_test, y_train, y_test = train_test_split(feature_nums, target, random_state=42)

In [95]:
#SMOTE Oversampling
X_resampled, y_resampled = SMOTE().fit_sample(feature_nums, target) 
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_resampled, y_resampled, random_state=42)

In [96]:
#MINORITY OVERSAMPLING
# Separate majority and minority classes
majority = final[final['is_hit']==0]
minority = final[final['is_hit']==1]
 
# Upsample minority class
minority_upsampled = resample(minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=788,    # to match majority class
                                 random_state=42) # reproducible results
 
# Combine majority class with upsampled minority class
upsampled = pd.concat([majority, minority_upsampled])
upsampled_target = upsampled['is_hit']
upsampled_features = upsampled.drop(columns=['Unnamed: 0', 'artist', 'album', 'song','features', 
                                   'lyrics', 'isrc', 'release_date', 'age',
                                   'single_release', 'is_hit', 'today', 'listeners', 'playcount', 'list_day', 'playcount_percentage'])


X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(upsampled_features, upsampled_target, random_state=42)


In [97]:
forest = RandomForestClassifier(n_estimators=100, max_depth= 5)
forest.fit(X_train_upsampled, y_train_upsampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [98]:
forest.score(X_test_upsampled, y_test_upsampled)

0.8274111675126904

In [99]:
param_grid = { 
    'n_estimators': [1,5,10,15,20,25,30,35,40,45,50],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [1,2,3,4,5,6,7,8,9,10,11],
    'criterion' :['gini', 'entropy']
}

In [100]:
CV_forest = GridSearchCV(estimator=forest, param_grid=param_grid, cv= 5)
CV_forest.fit(X_train_upsampled, y_train_upsampled)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [101]:
CV_forest.best_params_

{'criterion': 'gini',
 'max_depth': 11,
 'max_features': 'sqrt',
 'n_estimators': 40}

In [102]:
forest = RandomForestClassifier(criterion='gini',
 max_depth=11,
 max_features='auto',
 n_estimators=45)
forest.fit(X_train_upsampled, y_train_upsampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=45, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [103]:
forest.score(X_test_upsampled, y_test_upsampled)

0.9390862944162437

In [104]:
forest.estimators_[0].feature_importances_

array([0.09452596, 0.07512366, 0.10553525, 0.08999445, 0.07857197,
       0.05435281, 0.07041415, 0.09881794, 0.21892033, 0.11374347])

In [105]:
names = list(X_train.columns)

In [106]:
list(zip(names, forest.estimators_[0].feature_importances_))

[('danceability', 0.09452596183728482),
 ('energy', 0.07512366412221666),
 ('loudness', 0.1055352485232603),
 ('speechiness', 0.0899944496357798),
 ('liveness', 0.07857196719345824),
 ('tempo', 0.05435281060219241),
 ('valence', 0.07041415049480504),
 ('duration', 0.0988179447277689),
 ('track_no', 0.2189203289607422),
 ('unique-words', 0.11374347390249172)]

In [107]:
# n = 1000
# scores = []
# for i in range (0, n):
#     forest = RandomForestClassifier(criterion='gini',
#  max_depth=11,
#  max_features='auto',
#  n_estimators=45)
#     forest.fit(X_train_upsampled, y_train_upsampled)
#     score = forest.score(X_test_upsampled, y_test_upsampled)
#     importance = forest.estimators_[0].feature_importances_
#     important_features = list(zip(names, forest.estimators_[0].feature_importances_))
#     scores.append([score, important_features])

In [108]:
score = 0
danceability = 0
loudness = 0 
speechiness =0
liveness =0
tempo =0
valence =0
duration =0
track_no =0
unique_words =0
    
for i in scores: 
    
    score += i[0]
    danceability += i[1][0][1]
    loudness += i[1][1][1]
    speechiness += i[1][2][1]
    liveness += i[1][3][1]
    tempo += i[1][4][1]
    valence += i[1][5][1]
    duration += i[1][6][1]
    track_no += i[1][7][1]
    unique_words += i[1][8][1]

In [109]:
rf_data = {'score': score/n, 'danceability': danceability/n, 'speechiness':speechiness/n, 'liveness': liveness/n, 'tempo': tempo/n, 'valence' : valence/n, 
       'duration': duration/n, 'track': track_no/n, 'unique_words': unique_words/n}



In [110]:
svm_clf = svm.SVC(kernel='linear', class_weight='balanced')
svm_clf.fit(X_train_upsampled, y_train_upsampled)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [111]:
svm_clf.score(X_test_upsampled, y_test_upsampled)

0.6802030456852792

In [112]:
rf_data

{'score': 0.9433375634517758,
 'danceability': 0.10108781239426166,
 'speechiness': 0.11435277162021616,
 'liveness': 0.08415741297377649,
 'tempo': 0.09086308238507086,
 'valence': 0.08712643405095515,
 'duration': 0.08483018578162915,
 'track': 0.09349168954734303,
 'unique_words': 0.1644195250731228}

In [113]:
from sklearn.tree import export_graphviz

In [114]:
estimator = forest.estimators_[5]