# Random Forest

## Import packages

In [1]:
#Import libraries
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score 

## Read the data

In [2]:
# Fetch the song-attribute table straight from the project's GitHub repository
# (this cell needs network access) and preview the first five rows.
SONG_DATA_URL = 'https://raw.githubusercontent.com/yashapatel131/KDD_GroupProject/main/Data/Song_Attributes.csv'
song_data = pd.read_csv(SONG_DATA_URL)
song_data.head()

Unnamed: 0,ID,Acousticness,Album,Artist,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Name,Popularity,popularityLabel,Speechiness,Tempo,TimeSignature,Valence
0,0,0.000728,Collective Soul (Deluxe Version),Collective Soul,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,Welcome All Again,35,Popular,0.0309,106.022,4,0.365
1,1,0.0182,Collective Soul (Deluxe Version),Collective Soul,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,Fuzzy,31,Popular,0.0282,120.027,4,0.408
2,2,0.000473,Collective Soul (Deluxe Version),Collective Soul,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,Dig,30,Popular,0.0559,144.061,4,0.37
3,3,0.00097,Collective Soul (Deluxe Version),Collective Soul,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,You,35,Popular,0.0254,111.975,4,0.183
4,4,3.6e-05,Collective Soul (Deluxe Version),Collective Soul,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,My Days,21,Popular,0.0318,92.721,4,0.666


In [3]:
# Remove the columns that are not used as model inputs: the row identifier,
# the free-text fields (album, song name, artist), Mode, and the raw
# Popularity score.
# NOTE(review): popularityLabel is presumably derived from Popularity, which
# would make keeping it a target leak -- confirm against the data preparation.
columns_to_drop = ['ID', 'Album', 'Mode', 'Name', 'Artist', 'Popularity']
songAttributes = song_data.drop(columns=columns_to_drop)
songAttributes.head()

Unnamed: 0,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,popularityLabel,Speechiness,Tempo,TimeSignature,Valence
0,0.000728,0.52,234947,0.904,False,0.0103,0.0634,-5.03,Popular,0.0309,106.022,4,0.365
1,0.0182,0.581,239573,0.709,False,0.000664,0.174,-4.909,Popular,0.0282,120.027,4,0.408
2,0.000473,0.572,198400,0.918,False,0.000431,0.0977,-3.324,Popular,0.0559,144.061,4,0.37
3,0.00097,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,Popular,0.0254,111.975,4,0.183
4,3.6e-05,0.52,222520,0.808,False,1e-05,0.08,-4.553,Popular,0.0318,92.721,4,0.666


In [4]:
# One-hot (dummy) encode the two categorical columns, then list the resulting
# column names. Each category becomes its own 0/1 indicator column named
# '<column>_<value>'; the original TimeSignature and Explicit columns are
# replaced by their indicators.
#
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version: pandas >= 2.0 would otherwise emit bool columns, while older
# releases already defaulted to uint8 (so this is a no-op there).
songAttributes_binary_encoded = pd.get_dummies(
    songAttributes,
    columns=['TimeSignature', 'Explicit'],
    dtype='uint8',
)
songAttributes_binary_encoded.columns

Index(['Acousticness', 'Danceability', 'Duration', 'Energy',
       'Instrumentalness', 'Liveness', 'Loudness', 'popularityLabel',
       'Speechiness', 'Tempo', 'Valence', 'TimeSignature_0', 'TimeSignature_1',
       'TimeSignature_3', 'TimeSignature_4', 'TimeSignature_5',
       'Explicit_False', 'Explicit_True'],
      dtype='object')

## Splitting the data

In [5]:
# Separate the predictors from the target label.
# One indicator per encoded category (TimeSignature_1, Explicit_False) is
# left out of the predictors as the reference level, alongside the target.
reference_level_columns = ['TimeSignature_1', 'Explicit_False']
X = songAttributes_binary_encoded.drop(
    columns=['popularityLabel', *reference_level_columns]
)
y = songAttributes_binary_encoded['popularityLabel']

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every score below) repeatable.
# Note: no `stratify` argument is passed, so the class proportions in the two
# partitions are whatever the random shuffle produces.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=6162,
)
X_train.head()

Unnamed: 0,Acousticness,Danceability,Duration,Energy,Instrumentalness,Liveness,Loudness,Speechiness,Tempo,Valence,TimeSignature_0,TimeSignature_3,TimeSignature_4,TimeSignature_5,Explicit_True
48090,0.118,0.608,289173,0.312,2.9e-05,0.103,-8.711,0.0263,93.034,0.223,0,1,0,0,0
30590,0.0121,0.381,490960,0.965,0.0218,0.966,-3.676,0.125,136.505,0.328,0,0,1,0,0
36643,0.105,0.532,228587,0.466,7.7e-05,0.0737,-6.412,0.0279,159.956,0.43,0,1,0,0,0
137260,0.00304,0.582,567346,0.961,0.807,0.0859,-10.149,0.0381,135.995,0.828,0,0,1,0,0
147494,0.371,0.855,324189,0.694,0.0,0.109,-4.393,0.169,122.078,0.49,0,0,1,0,1


## Create and fit the model

In [7]:
# Build a random forest of 50 trees, each limited to depth 10, and fit it on
# the training partition. The fitted estimator is the cell's last expression
# so its configuration is displayed.
forest = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    oob_score=True,     # keep the out-of-bag accuracy estimate (printed in the evaluation cell)
    random_state=6162,  # fixed seed so the fitted forest is repeatable
    n_jobs=-1,          # train the trees in parallel on all available cores
)
forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, n_estimators=50, n_jobs=-1, oob_score=True,
                       random_state=6162)

## Evaluate the model

In [8]:
# Predict a popularity label for every row of the held-out test partition.
prediction_labels = forest.predict(X=X_test)

In [9]:
# Compare the held-out predictions with the true labels.
# Confusion-matrix rows are the true classes and columns the predicted
# classes, both in sorted label order (Popular, then Unpopular).
results = confusion_matrix(y_test, prediction_labels)
test_accuracy = accuracy_score(y_test, prediction_labels)
# NOTE(review): the OOB score is scaled to a percentage here, whereas the
# accuracy score above is printed as a fraction in [0, 1] -- mind the
# different scales when comparing the two numbers.
oob_score_percent = (forest.oob_score_) * 100

print('Confusion Matrix:')
print(results)
print('Accuracy Score:', test_accuracy)
print('OOB Score:', oob_score_percent)
print('Report:')
print(classification_report(y_test, prediction_labels))

Confusion Matrix:
[[ 7958 12488]
 [ 5862 20172]]
Accuracy Score: 0.6052065404475043
OOB Score: 60.56560105485427
Report:
              precision    recall  f1-score   support

     Popular       0.58      0.39      0.46     20446
   Unpopular       0.62      0.77      0.69     26034

    accuracy                           0.61     46480
   macro avg       0.60      0.58      0.58     46480
weighted avg       0.60      0.61      0.59     46480

