In [86]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

### Loading the modified dataset

In [87]:
df = pd.read_csv('/Users/haochenyang/Desktop/EECS545/Project/data_merged_quartile.csv')
#list(df.columns)
#del df['mode']
#del df['key_6_yr']
#del df['popularity_yr']
#del df['popularity_ar']

In [81]:
df.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,Minor,Major
0,31.0,0.0127,0.622,218293.0,0.89,0.95,0.124,-7.043,0.03,115.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,28.0,0.00306,0.62,215613.0,0.755,0.0118,0.534,-4.617,0.0345,127.994,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,34.0,0.0254,0.774,166875.0,0.7,0.00253,0.157,-4.498,0.239,128.014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,32.0,0.00465,0.638,222369.0,0.587,0.909,0.157,-6.266,0.0413,145.036,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,46.0,0.0289,0.572,214408.0,0.803,8e-06,0.106,-4.294,0.351,149.995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [88]:
# Separating columns containing features and output. Also, changing values in the output
# to labels 1 (popular) and 0 (not popular) using threshold arrived at from Billboard charts. 
X = df[df.columns.difference(['popularity'])]
# y = pd.cut(x=df["popularity"], bins=[-1,45,100], labels=[0,1])
y = df['popularity']
# print(y)
# Splitting data in to test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

### Trial Random Forest

In [89]:
# Initializing and training the Random Forest Classifier with default hyperparameters. 
rnd_clf=RandomForestClassifier(n_jobs=-1) # n_jobs=-1 uses all cores of the CPU
rnd_clf.fit(X_train, y_train)

# Prediction on test data.
y_pred_rf=rnd_clf.predict(X_test)

print('\033[1mClassification Report')
print('\033[0m')
print(classification_report(y_test, y_pred_rf))

print('\033[1mConfusion Matrix')
print('\033[0m')
print(confusion_matrix(y_test, y_pred_rf))

[1mClassification Report
[0m
              precision    recall  f1-score   support

         0.0       0.66      0.81      0.72      2132
         1.0       0.50      0.53      0.51      2009
         2.0       0.50      0.32      0.39      2130
         3.0       0.62      0.66      0.64      1841

    accuracy                           0.58      8112
   macro avg       0.57      0.58      0.57      8112
weighted avg       0.57      0.58      0.57      8112

[1mConfusion Matrix
[0m
[[1723  389   15    5]
 [ 662 1070  269    8]
 [ 194  528  682  726]
 [  48  170  400 1223]]


### Extra Trees Classifier
We can also try to use the ExtraTreesClassifier which randomizes the number of features used to split each node. We see that the performance is slightly worse than using just a Random Forest. 

In [90]:
ext_clf=ExtraTreesClassifier(n_jobs=-1)
ext_clf.fit(X_train, y_train)
y_pred_ext=ext_clf.predict(X_test)

print('\033[1mClassification Report')
print('\033[0m')
print(classification_report(y_test, y_pred_ext))

print('\033[1mConfusion Matrix')
print('\033[0m')
print(confusion_matrix(y_test, y_pred_ext))

[1mClassification Report
[0m
              precision    recall  f1-score   support

         0.0       0.65      0.80      0.72      2132
         1.0       0.49      0.51      0.50      2009
         2.0       0.47      0.32      0.38      2130
         3.0       0.61      0.65      0.63      1841

    accuracy                           0.57      8112
   macro avg       0.56      0.57      0.56      8112
weighted avg       0.55      0.57      0.55      8112

[1mConfusion Matrix
[0m
[[1704  398   25    5]
 [ 676 1017  300   16]
 [ 202  505  683  740]
 [  46  165  430 1200]]


### Randomized Grid Search
Next step is to perform randomized grid search to find optimal hyperparameters. 

In [91]:
# Initializing a set of hyperparameters. Randomized Grid Search will then iterate through random combinations of these
# to find optimal parameters.

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)] # Number of Decisions Trees in the forest
max_features = ['auto', 'sqrt']                                                 # Max features used to split at each node
max_depth = np.linspace(20,100,5).tolist()                                      # Max depth of each tree
max_depth.append(None) 
bootstrap = [True, False]
min_samples_split = [2, 5, 10]                                                  # Minimum samples required to split at a node
min_samples_leaf = [1, 2, 4]                                                    # Minimum samples required for a leaf node

In [92]:
# Parameter Grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [93]:
# Running the Randomized Grid Search. 
rfc=RandomForestClassifier(random_state=42)
random_rfc = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, 
                                random_state=42, n_jobs = -1)
random_rfc.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=20.0, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   0.0s[CV] END bootstrap=False, max_depth=20.0, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, max_depth=20.0, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=   0.0s[CV] END bootstrap=False, max_depth=60.0, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.0s[CV] END bootstrap=True, max_depth=100.0, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   0.0s

[CV] END bootstrap=False, max_depth=20.0, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=   0.0s[CV] END bootstrap=True, max_depth=100.0, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_e

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  22.5s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=  24.4s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=  24.6s




[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=  25.2s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  26.0s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  26.6s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  26.9s
[CV] END bootstrap=False, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False

  warn(


[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   7.2s


  warn(


[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.5s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  22.8s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   0.0s


  warn(


[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  22.3s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=  53.4s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   9.0s
[CV] END bootstrap=False, max_depth=100.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   0.0s
[CV] END bootstrap=False, max_depth=100.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   0.0s
[CV] END bootstrap=False, max_depth=100.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   0.0s
[CV] END bootstrap=False, max_depth=80.0, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.0s
[CV] END bootstrap=Fa

  warn(
  warn(
  warn(


[CV] END bootstrap=False, max_depth=60.0, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=60.0, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=60.0, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=40.0, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=40.0, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=40.0, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=40.0, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=F

  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=800; total time=  32.1s
[CV] END bootstrap=True, max_depth=40.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=True, max_depth=40.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=True, max_depth=40.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=True, max_depth=20.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   0.0s
[CV] END bootstrap=True, max_depth=20.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   0.0s
[CV] END bootstrap=True, m

  warn(


[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=800; total time=  31.8s


  warn(


[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  21.5s


  warn(


[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  22.5s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=60.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=True, max_depth=60.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=True, max_depth=60.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END bootstrap=False, 

  warn(


[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=  50.2s


  warn(


[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=  58.4s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  30.2s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=  58.7s


  warn(


[CV] END bootstrap=False, max_depth=80.0, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.0s
[CV] END bootstrap=False, max_depth=80.0, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=80.0, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  30.0s
[CV] END bootstrap=False

  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  30.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=  34.6s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=  34.6s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  15.7s
[CV] END bootstrap=False, max_depth=80.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=80.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=80.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_

  warn(


[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=  23.5s


  warn(


[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=  23.8s


  warn(


[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=  24.2s
[CV] END bootstrap=False, max_depth=20.0, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=20.0, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=20.0, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time=   0.0s
[CV] END bootstrap=True, max_depth=100.0, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time=   0.0s
[CV] END bootstrap=True,

  warn(


[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  19.5s


  warn(


[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=  59.0s
[CV] END bootstrap=True, max_depth=40.0, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=40.0, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=40.0, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=40.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   0.0s
[CV] END bootstrap=True, max_depth=40.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   0.0s
[CV] END bootstrap=True, max_depth=40.0, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   0.0s
[CV] END bootstrap=False, 

  warn(


[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  53.6s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  54.7s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=  32.5s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  50.6s


234 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
57 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/haochenyang/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/haochenyang/Library/Python/3.9/lib/python/site-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/Users/haochenyang/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "/Users/haochenyang/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_param_v

In [94]:
print('\033[1mBest Parameters')
print('\033[0m')
random_rfc.best_params_

[1mBest Parameters
[0m


{'n_estimators': 800,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

### Final Run with Best Parameters

In [95]:
rnd_clf=RandomForestClassifier(n_estimators=800, max_depth=None, max_features='auto', criterion='gini', min_samples_leaf=4,
                               min_samples_split=10, bootstrap=True, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred_rf=rnd_clf.predict(X_test)

  warn(


In [96]:
from joblib import dump
dump(rnd_clf, "Saved models/RFC")

FileNotFoundError: [Errno 2] No such file or directory: 'Saved models/RFC'

In [97]:
print('\033[1mClassification Report')
print('\033[0m')
print(classification_report(y_test, y_pred_rf))
print(y_pred_rf[0:20])

[1mClassification Report
[0m
              precision    recall  f1-score   support

         0.0       0.66      0.83      0.73      2132
         1.0       0.51      0.55      0.53      2009
         2.0       0.53      0.29      0.37      2130
         3.0       0.62      0.71      0.66      1841

    accuracy                           0.59      8112
   macro avg       0.58      0.59      0.57      8112
weighted avg       0.58      0.59      0.57      8112

[0. 1. 3. 3. 1. 0. 1. 3. 1. 3. 2. 2. 3. 0. 1. 1. 1. 0. 3. 3.]


In [98]:
from sklearn.metrics import roc_curve, roc_auc_score
# predict probabilities
lr_probs = rnd_clf.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
lr_auc = roc_auc_score(y_test, lr_probs)


ValueError: multi_class must be in ('ovo', 'ovr')

In [99]:
from matplotlib import pyplot
from numpy import sqrt, argmax, save
import numpy as np
# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_test, lr_probs)
np.save("fpr_RFC", fpr)
np.save("tpr_RFC", tpr)
# plot the roc curve for the model
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
# show the plot
pyplot.show()

ValueError: multiclass format is not supported

In [100]:
from sklearn.metrics import auc, precision_recall_curve, f1_score
from matplotlib import pyplot
lr_precision, lr_recall, thresholds = precision_recall_curve(y_test, lr_probs)
np.save("lrp_RFC", lr_precision)
np.save("lrr_RFC", lr_recall)
lr_f1, lr_auc = f1_score(y_test, y_pred_rf), auc(lr_recall, lr_precision)
# summarize scores
print('SVM: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
fscore = (2 * lr_precision * lr_recall) / (lr_precision + lr_recall)
# plot the roc curve for the model
no_skill = len(y_test[y_test==1]) / len(y_test)
print(no_skill)
pyplot.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
# show the plot
pyplot.show()

ValueError: multiclass format is not supported