In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN
from clean_data import *
from collections import Counter
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [20]:
#initial test train split
X_train, X_test, y_train, y_test = train_test_split(feature_nums, target, random_state=42)

In [21]:
#SMOTE Oversampling
X_resampled, y_resampled = SMOTE().fit_sample(feature_nums, target) 
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_resampled, y_resampled, random_state=42)

In [22]:
#MINORITY OVERSAMPLING
# Separate majority and minority classes
majority = final[final['is_hit']==0]
minority = final[final['is_hit']==1]
 
# Upsample minority class
minority_upsampled = resample(minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=788,    # to match majority class
                                 random_state=42) # reproducible results
 
# Combine majority class with upsampled minority class
upsampled = pd.concat([majority, minority_upsampled])
upsampled_target = upsampled['is_hit']
upsampled_features = upsampled.drop(columns=['Unnamed: 0', 'artist', 'album', 'song','features', 
                                   'lyrics', 'isrc', 'release_date',
                                   'single_release', 'is_hit',  'listeners', 'playcount', 'playcount_percentage', 'track_no', 'unique-words'])


X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(upsampled_features, upsampled_target, random_state=42)


In [23]:
#arrange all data splits in an array for Grid Search
all_data_splits = [(X_train, X_test, y_train, y_test), (X_train_smote, X_test_smote, y_train_smote, y_test_smote),(X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled)]

In [24]:
#setup param_grid for GridSearch
param_grid = { 
    'n_estimators': [20,25,30,35,40,45,50,100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [5,10,15,20, 25],
    'criterion' :['gini', 'entropy'], 
    'min_samples_leaf':[1,10,20,30,40,50,100]
}

In [132]:
#run grid search for all data splits - print best criteria for each group 
for data in all_data_splits: 
    try:
        CV_forest = GridSearchCV(estimator=forest, param_grid=param_grid, cv= 5, n_jobs=-1)
        CV_forest.fit(data[0], data[2])
        print(CV_forest.best_params_)
    except: 
        pass


{'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 30}
{'criterion': 'gini', 'max_depth': 11, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 30}
{'criterion': 'gini', 'max_depth': 11, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 20}


In [43]:
#no sampling
forest = RandomForestClassifier(criterion='gini',
 max_depth=8,
 max_features='auto',
 n_estimators=30,
 min_samples_leaf=1)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=11, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [44]:
#no sampling
forest.score(X_test, y_test)

0.8577405857740585

In [45]:
#smote
forest = RandomForestClassifier(criterion='gini',
 max_depth=11,
 max_features='sqrt',
 n_estimators=30)
forest.fit(X_train_smote, y_train_smote)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
forest.score(X_test_smote, y_test_smote)

0.8044554455445545

In [54]:
#min upsampling
forest = RandomForestClassifier(criterion='gini',
 max_depth=11,
 max_features='auto',
 n_estimators=100)
forest.fit(X_train_upsampled, y_train_upsampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [104]:
forest.score(X_test_upsampled, y_test_upsampled)

0.9523809523809523

In [107]:
predictions = forest.predict(X_test_upsampled)

In [112]:
cm =confusion_matrix(y_test_upsampled, predictions)
f1 = f1_score(y_test_upsampled, predictions)
acc = accuracy_score(y_test_upsampled, predictions)

In [113]:
print(cm , f1 , acc)

[[189  16]
 [  3 191]] 0.9526184538653366 0.9523809523809523


In [103]:
dict(zip(list(X_train.columns), forest.estimators_[0].feature_importances_))

{'danceability': 0.08049650285224573,
 'energy': 0.11185365527670522,
 'loudness': 0.14373310466331105,
 'speechiness': 0.04965632338213511,
 'liveness': 0.0823668270086988,
 'tempo': 0.20844345570606623,
 'valence': 0.1262673706790733,
 'duration': 0.12136662628860195,
 'repetetivness': 0.0758161341431626}

In [118]:
n = 100
scores = []
for i in range (0, n):
    forest = RandomForestClassifier(criterion='gini',
             max_depth=11,
             max_features='auto',
             n_estimators=100)
    forest.fit(X_train_upsampled, y_train_upsampled)
    f1 = f1_score(y_test_upsampled, predictions)
    acc = accuracy_score(y_test_upsampled, predictions)
    importance = forest.estimators_[0].feature_importances_
    important_features = dict(zip(names, forest.estimators_[0].feature_importances_))
    scores.append([acc, f1, important_features])

In [119]:
total = Counter({})
accuracy_avg = 0 
f1_score_avg = 0
for i in scores:
    count = Counter(i[2])
    total = total + count
    accuracy_avg += i[0]
    f1_score_avg += i[1]
    
total = dict(total)
importances = {k:total[k]/n for k in total}
print(f'Accuracy Average: {accuracy_avg/n}')
print(f'F1 Score: {f1_score_avg/n}')
importances

Accuracy Average: 0.9523809523809501
F1 Score: 0.9526184538653374


{'danceability': 0.11470150136997795,
 'energy': 0.11094961006270677,
 'loudness': 0.12995929248869084,
 'speechiness': 0.10841303592862891,
 'liveness': 0.0939444106857131,
 'tempo': 0.10640764573624706,
 'valence': 0.11110573790944196,
 'duration': 0.11024980332821191,
 'repetetivness': 0.11426896249038145}

In [28]:
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
svm_clf.score(X_test, y_test)

0.8535564853556485