Implement a Random Forest model for classification on a complex dataset.

In [29]:
#importing the dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [30]:
#loading the dataset
dataset = pd.read_csv("/Users/rakshit/Downloads/titanic.csv")
dataset.shape

(891, 12)

In [31]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [32]:
#removing the unnecessary columns 
data = dataset.drop(["PassengerId","Name","SibSp","Parch","Ticket","Cabin","Embarked"],axis=1)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [33]:
#label encoding the feature
label_encode = LabelEncoder()
data["Sex_n"] = label_encode.fit_transform(data["Sex"])

In [34]:
#dropping the old column
data_n = data.drop(columns="Sex",axis=1)

In [35]:
#labelling the data
x = data_n.drop(columns="Survived",axis=1)
y = data_n["Survived"]

In [36]:
#splitting the data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=2)
print(x.shape,x_train.shape,x_test.shape)

(891, 4) (623, 4) (268, 4)


In [37]:
parameter = {
  'n_estimators': [50, 100, 150, 200], 
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [38]:
#training the model
from sklearn.model_selection import RandomizedSearchCV
model = RandomForestClassifier()
rcv = RandomizedSearchCV(model,cv=5,param_distributions=parameter,scoring="accuracy",random_state=42)
rcv.fit(x_train,y_train)

0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'max_depth': [None, 5, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], 'n_estimators': [50, 100, ...]}"
,n_iter,10
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_estimators,150
,criterion,'gini'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [39]:
# Create a variable for the best model
best_rf = rcv.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rcv.best_params_)

Best hyperparameters: {'n_estimators': 150, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10}


In [40]:
#evaluating using cross valid score
cross_score = cross_val_score(rcv,x_train,y_train,cv=5)
print(np.average(cross_score)*100)

81.37677419354839


In [41]:
#predciton of testing data
y_pred = rcv.predict(x_test)

#accuracy score
x_test_accuracy = accuracy_score(y_pred,y_test)
print("Accuracy of test data: ",x_test_accuracy*100,"%")

Accuracy of test data:  80.22388059701493 %


In [42]:
#prediction of training data
y_pred_train = rcv.predict(x_train)

#accuracy score
x_train_accuracy = accuracy_score(y_pred_train,y_train)
print("Accuracy of train data: ",x_train_accuracy*100,"%")

Accuracy of train data:  90.69020866773675 %


In [43]:
#classification report 
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.91      0.79      0.85       185
           1       0.64      0.83      0.72        83

    accuracy                           0.80       268
   macro avg       0.78      0.81      0.78       268
weighted avg       0.83      0.80      0.81       268

