### Random forest classification with RandomizedSearchCV

RandomizedSearch is better option when magnitude of influence are imbalanced, which is more likely to happen as your number of parameters is growing. Now, the problem with GridSearchCV is it's really slow when compared to RandomizedSearchCV

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
#importing dataset
dataset = pd.read_csv("C:\\Users\\veena\\Desktop\\dataset's\\Advertising_data.csv")
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [3]:
dataset.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [4]:
X = dataset.iloc[:,[2,3]].values #independent features
y = dataset.iloc[:, 4].values #dependent features

In [5]:
#splitting the dataset in to training & test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=0)

In [6]:
#feature scaling - as age and estimated salary are in different units, we are scaling it to same unit
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)


In [7]:
# fitting Random Forest classification in to training set
# n_estimators - how many decision trees we are using

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state= 50) 
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=50)

In [8]:
from sklearn.model_selection import RandomizedSearchCV

In [9]:
from scipy.stats import randint

In [10]:
est = RandomForestClassifier(n_jobs=-1)
rf_p_dist = {'max_depth' : [3,5,10,None],
              'n_estimators' : [100,200,300,400,500],
               'max_features' : randint(1,3),
              'criterion' : ['gini', 'entropy'],
              'bootstrap' : [True, False],
            'min_samples_leaf' : randint(1,4)}


In [11]:
def hypertuning_rscv(est, p_distr, nbr_iter, X,y):
    rdmsearch = RandomizedSearchCV(est, param_distributions = p_distr, n_jobs = -1, n_iter = nbr_iter, cv=9)
    
    rdmsearch.fit(X,y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score
rf_parameters , rf_ht_score = hypertuning_rscv(est, rf_p_dist, 40, X, y)


In [12]:
#predicting the test set results
y_pred = classifier.predict(X_test)

In [13]:
#making the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)


In [14]:
accuracy_score = accuracy_score(y_test, y_pred)

In [15]:
accuracy_score

0.92

In [16]:
rf_parameters

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 2,
 'min_samples_leaf': 1,
 'n_estimators': 200}

In [17]:
# from sklearn.model_selection import cross_val_score
# cross_val = cross_val_score(classifier,X,y,cv = 10, scoring='accuracy',).mean()
# cross_val

0.8800000000000001

In [18]:
classifier = RandomForestClassifier(bootstrap=True, criterion = 'entropy', max_depth= 3, max_features= 2, min_samples_leaf= 1,
 n_estimators= 400) 
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=3, max_features=2,
                       n_estimators=400)

In [19]:
#predicting the test set results
y_pred = classifier.predict(X_test)

In [20]:
#making the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)

In [21]:
accuracy_score = accuracy_score(y_test, y_pred)

In [22]:
accuracy_score

0.94