# Business Problem
 1. Develop a predictive model to identify whether mushroom is good or poisonous based on various featutes of mushrooms
 2. A general guideline for featutes, people should look out for when picking mushrooms

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
df = pd.read_csv('penguins_size.csv')
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [14]:
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


# **Data Preprocessing**

In [16]:
df['sex'].replace(',', 'MALE', inplace=True)

In [17]:
df['culmen_length_mm'].fillna(df['culmen_length_mm'].mean(), inplace=True)
df['culmen_depth_mm'].fillna(df['culmen_depth_mm'].mean(), inplace=True)
df['flipper_length_mm'].fillna(df['flipper_length_mm'].mean(), inplace=True)
df['body_mass_g'].fillna(df['body_mass_g'].mean(), inplace=True)
df['sex'].fillna(df['sex'].mode()[0], inplace=True)

# X & y

In [18]:
X = pd.get_dummies(df.drop('species', axis=1), drop_first=True)
y = df['species']

**Train Test Split**

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6) 

# **Modelling & Evalution**

In [26]:
# Random forst classifier with defalult parameters
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=6)
model.fit(X_train, y_train)

# Preparation
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# Evlution
from sklearn.metrics import accuracy_score
print('Train accuracy:', accuracy_score(ypred_train, y_train))
print('Test accuracy:', accuracy_score(ypred_test, y_test))

from sklearn.model_selection import cross_val_score
print('Cross Val Score:', cross_val_score(model,X,y,cv=5).mean())

Train accuracy: 1.0
Test accuracy: 1.0
Cross Val Score: 0.9912617220801364


### **Hyperparameter Tunning**

In [29]:
from sklearn.model_selection import GridSearchCV

# Model
estimator = RandomForestClassifier(random_state=0)

#Parameter (which you want to tune and identify the best)
param_grid = {'n_estimators':list(range(1,101))}

grid = GridSearchCV(estimator, param_grid, scoring='accuracy', cv=5)

grid.fit(X_train, y_train)

grid.best_params_

{'n_estimators': 33}

**Importance of each feature given by this model**

In [30]:
grid.best_estimator_.feature_importances_

array([0.32670713, 0.11430202, 0.29247093, 0.12487703, 0.08628496,
       0.04355706, 0.00482621, 0.00697467])

In [34]:
feats_imp = pd.DataFrame(data=grid.best_estimator_.feature_importances_,
                         index=X.columns,
                         columns=['Feature Importance'])

important_feats = feats_imp[feats_imp['Feature Importance'] > 0]

important_features_list = important_feats.index.to_list()


# **Final Random Forest Model**
**with best hyper parameters & with important features**

In [37]:
# Input with important features
X_imp = X[important_features_list]

# Train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_imp,y,test_size=0.2, random_state=6)

# Model with best hyperparameter
final_rf_model = RandomForestClassifier(n_estimators=12, random_state=0)
final_rf_model.fit(X_train, y_train)

ypred_train = final_rf_model.predict(X_train)
ypred_test = final_rf_model.predict(X_test)

# Evalution 
print('Train accuracy:', accuracy_score(ypred_train, y_train))
print('Test accuracy:', accuracy_score(ypred_test, y_test))
print('Cross Validation Score:', cross_val_score(final_rf_model, X_imp, y, cv=5).mean())

Train accuracy: 0.9963636363636363
Test accuracy: 1.0
Cross Validation Score: 0.9796248934356351
