# Random Forest - Classification

# Import Libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df = pd.read_csv('Social_Network_Ads.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
df['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [5]:
df['Gender'].value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [6]:
df['Gender'].replace({'Male':0,'Female':1}, inplace = True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   User ID          400 non-null    int64
 1   Gender           400 non-null    int64
 2   Age              400 non-null    int64
 3   EstimatedSalary  400 non-null    int64
 4   Purchased        400 non-null    int64
dtypes: int64(5)
memory usage: 15.8 KB


In [8]:
df.drop('User ID', axis = 1, inplace = True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Gender           400 non-null    int64
 1   Age              400 non-null    int64
 2   EstimatedSalary  400 non-null    int64
 3   Purchased        400 non-null    int64
dtypes: int64(4)
memory usage: 12.6 KB


In [10]:
x = df.drop('Purchased',axis = 1)
y = df['Purchased']

In [11]:
x

Unnamed: 0,Gender,Age,EstimatedSalary
0,0,19,19000
1,0,35,20000
2,1,26,43000
3,1,27,57000
4,0,19,76000
...,...,...,...
395,1,46,41000
396,0,51,23000
397,1,50,20000
398,0,36,33000


In [12]:
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

# Train Test Split

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1,stratify=y)

In [14]:
y_test.value_counts()

0    51
1    29
Name: Purchased, dtype: int64

# Train Model

In [15]:
rf_clf = RandomForestClassifier(random_state=10)
rf_clf.fit(x_train,y_train)

# Test Data accuracy

In [16]:
y_pred = rf_clf.predict(x_test)
y_pred[10:15]

array([0, 0, 1, 0, 0], dtype=int64)

In [17]:
y_test[10:15]

293    0
135    0
171    0
66     0
310    0
Name: Purchased, dtype: int64

In [18]:
test_accuracy = accuracy_score(y_test,y_pred)
print('Test data accuracy is:', test_accuracy)

Test data accuracy is: 0.875


# Train Data Accuracy

In [19]:
y_pred_train = rf_clf.predict(x_train)

In [20]:
train_Accuracy = accuracy_score(y_train,y_pred_train)
print('Training data accuracy is :', train_Accuracy)

Training data accuracy is : 0.996875


# Hyperparameter Tuning

## GridSearchCV

In [21]:
hyp = {'n_estimators' : np.arange(10,100),
       'max_depth' : np.arange(1,10),
       'min_samples_leaf' : np.arange(1,10),
       'min_samples_split' : np.arange(2,10),
       'criterion' : ['gini','entropy']}

rf_clf = RandomForestClassifier(random_state=10)
gscv_rf_clf = GridSearchCV(rf_clf, hyp, cv = 5)
gscv_rf_clf.fit(x_train,y_train)   # Taking time >> Please do run the Notebook

KeyboardInterrupt: 

# RandomizedSearchCV

In [22]:
hyp = {'n_estimators' : np.arange(10,100),
       'max_depth' : np.arange(1,10),
       'min_samples_leaf' : np.arange(1,10),
       'min_samples_split' : np.arange(2,10),
       'criterion' : ['gini','entropy']}

rf_clf = RandomForestClassifier(random_state=10)
rscv_rf_clf = RandomizedSearchCV(rf_clf, hyp, cv = 5)
rscv_rf_clf.fit(x_train,y_train)

In [23]:
rscv_rf_clf.best_params_

{'n_estimators': 83,
 'min_samples_split': 8,
 'min_samples_leaf': 5,
 'max_depth': 8,
 'criterion': 'entropy'}

# Random Forest - Regression

In [24]:
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,0,19,19000,0
1,0,35,20000,0
2,1,26,43000,0
3,1,27,57000,0
4,0,19,76000,0
...,...,...,...,...
395,1,46,41000,1
396,0,51,23000,1
397,1,50,20000,1
398,0,36,33000,0


In [25]:
x = df.drop('EstimatedSalary', axis = 1)
y = df['EstimatedSalary']

In [26]:
x

Unnamed: 0,Gender,Age,Purchased
0,0,19,0
1,0,35,0
2,1,26,0
3,1,27,0
4,0,19,0
...,...,...,...
395,1,46,1
396,0,51,1
397,1,50,1
398,0,36,0


In [27]:
y

0      19000
1      20000
2      43000
3      57000
4      76000
       ...  
395    41000
396    23000
397    20000
398    33000
399    36000
Name: EstimatedSalary, Length: 400, dtype: int64

In [28]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=1)

In [29]:
rf_reg = RandomForestRegressor(random_state=1)
rf_reg.fit(x_train,y_train)

# Test Data Accuracy

In [30]:
y_pred = rf_reg.predict(x_test)


In [31]:
r2_score = r2_score(y_test,y_pred)
r2_score

-0.18523702373579476

In [32]:
y_pred_train = rf_reg.predict(x_train)
y_pred_train[1:5]

array([61286.26190476, 73462.91269841, 30473.33333333, 76304.04761905])

In [33]:
y_train[1:5]

23      22000
299    117000
13      18000
90      81000
Name: EstimatedSalary, dtype: int64