In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
df = sns.load_dataset('tips')

We will be using tips dataset from seaborn as our data to compare MLPClassifier, RandomForestClassifier and GradientBoostingClassifier to know which one works best for us.

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


Before training our model we should make sure to do some feature engineering on our data. First we will convert the smoker's column to binary data instead of YES and NO.

In [4]:
df['smoker'] = df['smoker'].replace('No', 0)
df['smoker'] = df['smoker'].replace('Yes', 1)

Now, convert other text data into binary columns to create new features for our models.

In [5]:
new_df = pd.concat([df, pd.get_dummies(df['sex']), pd.get_dummies(df['day']), pd.get_dummies(df['time'])], 1)

In [6]:
new_df = new_df.drop(['sex', 'day', 'time'], 1)

In [7]:
X = new_df.drop('smoker', 1)
Y = new_df['smoker']

## MLPClassifier

In [13]:
mlp = MLPClassifier(hidden_layer_sizes=(500, 400, 300, 200, 100), max_iter=10000)
param_grid = {
    "hidden_layer_sizes": [(500, 400, 300, 200, 100), (50, 40, 30, 20, 10), (50, 40, 30), (5000, 4000, 1000)],
    "activation": ['identity', 'logistic', 'tanh', 'relu'],
    "learning_rate": ['constant', 'invscaling', 'adaptive']
}
CV_mlp = GridSearchCV(mlp, param_grid=param_grid, cv=3)
CV_mlp.fit(X, Y)
print("Best parameters: ", CV_mlp.best_params_)
print("Model score: ", CV_mlp.score(X, Y))



Best parameters:  {'activation': 'relu', 'hidden_layer_sizes': (5000, 4000, 1000), 'learning_rate': 'adaptive'}
Model score:  0.7131147540983607


## RandomForestClassifier

In [9]:
rfc = RandomForestClassifier(n_estimators=1000, max_depth=4)
param_grid={
    'n_estimators' : [100, 1000, 10000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' : ['gini', 'entropy']
}
CV_rfc = GridSearchCV(rfc, param_grid=param_grid, cv=3)
CV_rfc.fit(X, Y)
print("Best parameters: ", CV_rfc.best_params_)
print("Model score: ", CV_rfc.score(X, Y))

Best parameters:  {'criterion': 'entropy', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100}
Model score:  0.7991803278688525




## GradientBoostingClassifier

In [10]:
gbc = GradientBoostingClassifier()
parameters = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "n_estimators":[10, 20, 100]
}
CV_gbc = GridSearchCV(gbc, param_grid=parameters, cv=3)
CV_gbc.fit(X, Y)
print("Best parameters: ", CV_gbc.best_params_)
print("Model score: ", CV_gbc.score(X, Y))

Best parameters:  {'criterion': 'friedman_mse', 'learning_rate': 0.05, 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 10}
Model score:  0.7295081967213115




## Conclusion

From above three models we can conclude that RandomForestClassifier is most suitable for selected data. But MLPClassifier works better as the size of data increases. As it gets more data to distribute through big hidden layers.