# random forest

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the penguin measurements CSV into a DataFrame and preview its first rows.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# not run elsewhere without editing it.
csv_path = "C:\\Users\\saisu\\OneDrive\\Documents\\penguins_size.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
# Print the frame's summary: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


# STEP-3: DATA PREPROCESSING

In [4]:
# Count the missing values in each column.
df.isna().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [5]:
# Recode "." entries in `sex` as "MALE" (the "." looks like a placeholder for
# an unrecorded value — TODO confirm that "MALE" is the right substitute).
# The result is assigned back rather than using replace(..., inplace=True) on
# the column selection: under pandas Copy-on-Write that in-place call acts on
# a temporary object and leaves `df` unchanged.
df["sex"] = df["sex"].replace(".", "MALE")

In [9]:
# Impute missing values: numeric measurements get their column mean, `sex`
# gets its most frequent value.
# Each filled column is assigned back to `df` instead of calling
# fillna(..., inplace=True) on a column selection, which under pandas
# Copy-on-Write modifies a temporary object and leaves `df` untouched.
# NOTE(review): the means/mode are computed on the full dataset, i.e. before
# the train/test split performed later in the notebook.
numeric_cols = [
    "culmen_length_mm",
    "culmen_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# mode() returns a Series of the most frequent value(s); take the first.
df["sex"] = df["sex"].fillna(df["sex"].mode()[0])

**Features (X) and target (y)**

In [10]:
# Target: the species label.
y = df["species"]

# Features: every other column, with the categorical ones one-hot encoded
# (the first level of each categorical column is dropped).
X = pd.get_dummies(df.drop(columns=["species"]), drop_first=True)

**train test split**

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)

# STEP-4 : MODELLING & EVALUATION

In [1]:
# Random forest classifier with default parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier(random_state=6)
model.fit(X_train, y_train)

# Predictions on both splits
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# Evaluation. accuracy_score's signature is (y_true, y_pred); passing the
# arguments in that order keeps the calls correct if the metric is later
# swapped for an asymmetric one (precision, recall, ...).
print("Train accuracy:", accuracy_score(y_train, ypred_train))
print("Test accuracy:", accuracy_score(y_test, ypred_test))

# Mean score over 5 cross-validation folds of the full X, y
print("cross_val_score:", cross_val_score(model, X, y, cv=5).mean())

NameError: name 'X_train' is not defined

**hyperparameter tuning**

In [14]:
from sklearn.model_selection import GridSearchCV

# Base forest whose hyperparameters are searched
estimator = RandomForestClassifier(random_state=0)

# Candidate values: every tree count from 1 through 99
param_grid = {"n_estimators": list(range(1, 100))}

# 5-fold cross-validated search scored on accuracy, fitted on the training split
grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)
grid.fit(X_train, y_train)

# Show the winning parameter combination
grid.best_params_

{'n_estimators': 12}

**importance of each feature given by this model**

In [15]:
# Feature importances of the best forest found by the grid search.
grid.best_estimator_.feature_importances_

array([0.35706512, 0.16032298, 0.22653459, 0.0667992 , 0.15122477,
       0.02600539, 0.01204796])

In [22]:
# Label each importance with its feature name in a one-column table.
feats = pd.DataFrame(
    {"Feature Importance": grid.best_estimator_.feature_importances_},
    index=X.columns,
)

# Keep the features whose importance is strictly positive and list their names.
feats_imp = feats.loc[feats["Feature Importance"] > 0]
important_features_list = feats_imp.index.tolist()

# FINAL RANDOM FOREST MODEL

**with best hyperparameters & with important features**

In [24]:
# Inputs restricted to the important features
X_imp = X[important_features_list]

# Train/test split on the selected features. The split must use X_imp (not
# the full X) so the model is trained and tested on the same columns that
# the cross-validation below uses.
X_train, X_test, y_train, y_test = train_test_split(
    X_imp, y, test_size=0.2, random_state=6
)

# Model with the best hyperparameters, read from the grid search rather than
# hard-coded so this cell stays in sync if the search is re-run.
final_rf_model = RandomForestClassifier(**grid.best_params_, random_state=0)
final_rf_model.fit(X_train, y_train)

ypred_train = final_rf_model.predict(X_train)
ypred_test = final_rf_model.predict(X_test)

# Evaluation (accuracy_score expects y_true first, then y_pred)
print("Train accuracy:", accuracy_score(y_train, ypred_train))
print("Test accuracy:", accuracy_score(y_test, ypred_test))
print("cross validation score:", cross_val_score(final_rf_model, X_imp, y, cv=5).mean())

Train accuracy: 0.9963636363636363
Test accuracy: 1.0
cross validation score: 0.9854646206308612
