In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [None]:
#clean and import data
from feature_engineering import *

#vars available:
# feature_nums - numerical features of each song
# target - whether song is a hit or not
# final - complete dataframe

In [17]:
#initial test train split
X_train, X_test, y_train, y_test = train_test_split(feature_nums, target, random_state=42)

### Adjusting For Class Imbalance

In [18]:
#SMOTE Oversampling
X_resampled, y_resampled = SMOTE().fit_sample(feature_nums, target) 
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_resampled, y_resampled, random_state=42)

In [19]:
#MINORITY OVERSAMPLING
# Separate majority and minority classes
majority = final[final['is_hit']==0]
minority = final[final['is_hit']==1]
 
# Upsample minority class
minority_upsampled = resample(minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=788,    # to match majority class
                                 random_state=42) # reproducible results
 
# Combine majority class with upsampled minority class
upsampled = pd.concat([majority, minority_upsampled])
upsampled_target = upsampled['is_hit']
upsampled_features = upsampled.drop(columns=['Unnamed: 0', 'artist', 'album', 'song','features', 
                                   'lyrics', 'isrc', 'release_date',
                                   'single_release', 'is_hit',  'listeners', 'playcount', 'playcount_percentage', 'track_no', 'unique-words'])


X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(upsampled_features, upsampled_target, random_state=42)


### Random Forest

In [20]:
#arrange all data splits in an array for Grid Search
all_data_splits = [(X_train, X_test, y_train, y_test), (X_train_smote, X_test_smote, y_train_smote, y_test_smote),(X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled)]




In [21]:
#setup param_grid for GridSearch
param_grid = { 
    'n_estimators': [20,25,30,35,40,45,50,100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [5,10,15,20, 25],
    'criterion' :['gini', 'entropy'], 
    'min_samples_leaf':[1,10,20,30,40,50,100]
}

In [36]:
#run grid search for all data splits - print best criteria for each group 
for data in all_data_splits:
    try:
        CV_forest = GridSearchCV(estimator=forest, param_grid=param_grid, cv= 5, n_jobs=-1)
        CV_forest.fit(data[0], data[2])
        print(CV_forest.best_params_)
    except: 
        print('fail')


{'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 100}
{'criterion': 'entropy', 'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
{'criterion': 'gini', 'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 40}


In [46]:
#no sampling
forest = RandomForestClassifier(criterion='gini',
 max_depth=10,
 max_features='log2',
 n_estimators=100,
 min_samples_leaf=1)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [47]:
#no sampling
forest.score(X_test, y_test)

0.8535564853556485

In [48]:
#smote
forest_smote = RandomForestClassifier(criterion='gini',
 max_depth=25,
 max_features='sqrt',
 n_estimators=100)
forest_smote.fit(X_train_smote, y_train_smote)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [90]:
forest_smote.score(X_test_smote, y_test_smote)

0.8786610878661087

In [66]:
#min upsampling
forest_min = RandomForestClassifier(criterion='gini',
 max_depth=25,
 max_features='auto',
 n_estimators=100)
forest_min.fit(X_train_upsampled, y_train_upsampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [91]:
forest_min.score(X_test_upsampled, y_test_upsampled)

0.9699248120300752

In [155]:
predictions = forest_min.predict(X_test_upsampled)

In [156]:
cm = confusion_matrix(y_test_upsampled, predictions)
f1 = f1_score(y_test_upsampled, predictions)
acc = accuracy_score(y_test_upsampled, predictions)

In [157]:
print('Confusion Matrix', cm)
print('F1 Score', f1)
print('Accuracy', acc)

Confusion Matrix [[196   9]
 [  3 191]]
F1 Score 0.9695431472081217
Accuracy 0.9699248120300752


In [158]:
#
dict(zip(list(X_train.columns), forest_min.estimators_[0].feature_importances_))

{'danceability': 0.0924973092927971,
 'energy': 0.07755833507069701,
 'loudness': 0.1583201118443566,
 'speechiness': 0.0417418074669148,
 'liveness': 0.14733431967232702,
 'tempo': 0.16466620387021555,
 'valence': 0.08440851529198654,
 'duration': 0.1263401693197076,
 'repetetivness': 0.10713322817099775}

In [159]:
names = dict(zip(list(X_train.columns), forest_min.estimators_[0].feature_importances_)).keys()

In [160]:

n = 100
scores = []
for i in range (0, n):
    forest = RandomForestClassifier(criterion='gini',
             max_depth=25,
             max_features='auto',
             n_estimators=100)
    forest.fit(X_train_upsampled, y_train_upsampled)
    f1 = f1_score(y_test_upsampled, predictions)
    acc = accuracy_score(y_test_upsampled, predictions)
    importance = forest.estimators_[0].feature_importances_
    important_features = dict(zip(names, forest.estimators_[0].feature_importances_))
    scores.append([acc, f1, important_features])

In [161]:
total = Counter({})
accuracy_avg = 0 
f1_score_avg = 0
for i in scores:
    count = Counter(i[2])
    total = total + count
    accuracy_avg += i[0]
    f1_score_avg += i[1]
    
total = dict(total)
importances = {k:total[k]/n for k in total}
print(f'Accuracy Average: {accuracy_avg/n}')
print(f'F1 Score: {f1_score_avg/n}')

Accuracy Average: 0.9699248120300734
F1 Score: 0.9695431472081235


### Support Vector Machine

In [164]:
svm_clf = svm.SVC(kernel='sigmoid', C=1000)
svm_clf.fit(X_train_upsampled, y_train_upsampled)



SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='sigmoid', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [165]:
svm_clf.score(X_test_upsampled, y_test_upsampled)

0.5137844611528822

In [151]:
predictions = svm_clf.predict(X_test_upsampled)

In [152]:
cm = confusion_matrix(y_test_upsampled, predictions)
f1 = f1_score(y_test_upsampled, predictions)
acc = accuracy_score(y_test_upsampled, predictions)

In [153]:
print('Confusion Matrix', cm)
print('F1 Score', f1)
print('Accuracy', acc)

Confusion Matrix [[192  13]
 [  3 191]]
F1 Score 0.9597989949748744
Accuracy 0.9598997493734336
