# Choosing Best Model
## Grid Search CV

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Load the Titanic dataset that ships with seaborn and build features / target.
df = sns.load_dataset("titanic")
y = df['survived']

# One-hot encode `sex`. get_dummies returns a brand-new frame, so writing a
# column into X below does not touch (or warn about) the original `df`.
X = pd.get_dummies(df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']], columns=['sex'])

# Fill missing ages with the column mean and assign the result back.
# The previous `X.age.fillna(..., inplace=True)` was a chained in-place call on
# a temporary Series: it warns on recent pandas and, with Copy-on-Write
# (the default from pandas 3.0), never updates X, leaving NaNs for the models.
# NOTE(review): the mean is computed before the split, so test rows influence
# the imputed value; consider imputing from X_train only.
X['age'] = X['age'].fillna(X['age'].mean())

# Hold out 20% of the rows for the final evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate models mapped to (estimator, param_grid) pairs for GridSearchCV.
# A param_grid may be a dict or a list of dicts; a list lets us restrict the
# search to parameter combinations that the estimator actually accepts.
classifiers = {
    "kNN": (
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        },
    ),
    # random_state fixed: tree building permutes features randomly (and
    # splitter='random' draws random thresholds), so unseeded runs differ.
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': [3, 5, None],
            'min_samples_split': [2, 5, 10],
        },
    ),
    # random_state fixed so bootstrap samples / feature subsets are reproducible.
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [100, 200, 300],
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 5, None],
            'min_samples_split': [2, 5, 10],
        },
    ),
    "SVM": (
        SVC(),
        {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
        },
    ),
    # Only valid penalty/solver pairs are searched. The former full cross
    # product made most candidates fail to fit (e.g. 'l1' with 'lbfgs'),
    # 'elasticnet' was never evaluated because l1_ratio was missing, and the
    # string 'none' is no longer accepted by current scikit-learn (use None).
    "Logistic Regression": (
        LogisticRegression(),
        [
            {'penalty': ['l2'], 'C': [0.1, 1, 10],
             'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
            {'penalty': ['l1'], 'C': [0.1, 1, 10],
             'solver': ['liblinear', 'saga']},
            {'penalty': ['elasticnet'], 'C': [0.1, 1, 10],
             'solver': ['saga'], 'l1_ratio': [0.25, 0.5, 0.75]},
            # C has no effect without a penalty, so it is not searched here.
            {'penalty': [None],
             'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
        ],
    ),
    # GaussianNB is evaluated with its defaults (empty grid = one candidate).
    "Naive Bayes": (GaussianNB(), {}),
}

# Best mean cross-validated macro-F1 and the winning parameters, keyed by model name.
results, best_params = {}, {}

for name, (clf, params) in classifiers.items():
    # 5-fold grid search on the training split only; fit() returns the
    # fitted search object itself.
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=params,
        scoring='f1_macro',
        cv=5,
    ).fit(X_train, y_train)
    results[name] = grid_search.best_score_
    best_params[name] = grid_search.best_params_

# Pick the model whose best cross-validated score is highest
# (first one wins on ties, as with max over the dict keys).
best_model, _best_score = max(results.items(), key=lambda item: item[1])
print(f'Best Model: {best_model}')
print(f'Best Parameters: {best_params[best_model]}')


### Step by Step

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

# Load the Titanic dataset that ships with seaborn and build features / target.
df = sns.load_dataset("titanic")
y = df['survived']

# One-hot encode `sex`. get_dummies returns a brand-new frame, so writing a
# column into X below does not touch (or warn about) the original `df`.
X = pd.get_dummies(df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']], columns=['sex'])

# Fill missing ages with the column mean and assign the result back.
# The previous `X.age.fillna(..., inplace=True)` was a chained in-place call on
# a temporary Series: it warns on recent pandas and, with Copy-on-Write
# (the default from pandas 3.0), never updates X, leaving NaNs for the models.
# NOTE(review): the mean is computed before the split, so test rows influence
# the imputed value; consider imputing from X_train only.
X['age'] = X['age'].fillna(X['age'].mean())

# Hold out 20% of the rows for the final evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



### knn

In [3]:
# Tune k-nearest neighbours with a 5-fold cross-validated grid search.
model = KNeighborsClassifier()
grid_params = {
    'n_neighbors': np.arange(1, 30, 2),  # odd k from 1 to 29
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

grid_search = GridSearchCV(estimator=model, param_grid=grid_params, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# predict() delegates to the best estimator, refit on the full training split.
ypred = grid_search.predict(X_test)

print(':Best parameters', grid_search.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not the accuracy measured on the full training set.
print('Accuracy score on Training Data', grid_search.best_score_)
print('Accuracy score on Testing Data', accuracy_score(y_test, ypred))

:Best parameters {'algorithm': 'auto', 'n_neighbors': 23, 'weights': 'distance'}
Accuracy score on Training Data 0.7261400571259726
Accuracy score on Testing Data 0.7374301675977654


### Decision Tree

In [4]:
 
# Tune a decision tree with a 5-fold cross-validated grid search.
# random_state is fixed: features are randomly permuted at each split (and
# splitter='random' draws random thresholds), so an unseeded tree can select
# different "best" parameters and scores on every run.
model = DecisionTreeClassifier(random_state=42)
grid_params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 5, None],
    'min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(model, grid_params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# predict() delegates to the best estimator, refit on the full training split.
ypred = grid_search.predict(X_test)

print(':Best parameters', grid_search.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not the accuracy measured on the full training set.
print('Accuracy score on Training Data', grid_search.best_score_)
print('Accuracy score on Testing Data', accuracy_score(y_test, ypred))

:Best parameters {'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 2, 'splitter': 'best'}
Accuracy score on Training Data 0.8230079779375554
Accuracy score on Testing Data 0.7988826815642458


### Random Forest

In [5]:
 
# Tune a random forest with a 5-fold cross-validated grid search.
# random_state is fixed so the bootstrap samples and per-split feature
# subsets are reproducible; unseeded, the selected parameters and scores
# change from run to run.
model = RandomForestClassifier(random_state=42)
grid_params = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, None],
    'min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(model, grid_params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# predict() delegates to the best estimator, refit on the full training split.
ypred = grid_search.predict(X_test)

print(':Best parameters', grid_search.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not the accuracy measured on the full training set.
print('Accuracy score on Training Data', grid_search.best_score_)
print('Accuracy score on Testing Data', accuracy_score(y_test, ypred))

:Best parameters {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy score on Training Data 0.8314291342460358
Accuracy score on Testing Data 0.7988826815642458


### SVM

In [12]:
 
# Tune a support vector classifier with a 5-fold cross-validated grid search.
model = SVC()
grid_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

grid_search = GridSearchCV(estimator=model, param_grid=grid_params, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# predict() delegates to the best estimator, refit on the full training split.
ypred = grid_search.predict(X_test)

print(':Best parameters', grid_search.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not the accuracy measured on the full training set.
print('Accuracy score on Training Data', grid_search.best_score_)
print('Accuracy score on Testing Data', accuracy_score(y_test, ypred))

### Logistic Regression

In [None]:
 
# Tune logistic regression with a 5-fold cross-validated grid search.
model = LogisticRegression()

# Search only penalty/solver pairs the estimator supports. The former full
# cross product made most candidates fail to fit (e.g. 'l1' with 'lbfgs'),
# 'elasticnet' was never evaluated because l1_ratio was missing, and the
# string 'none' is no longer accepted by current scikit-learn (use None).
grid_params = [
    {'penalty': ['l2'], 'C': [0.1, 1, 10],
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    {'penalty': ['l1'], 'C': [0.1, 1, 10],
     'solver': ['liblinear', 'saga']},
    {'penalty': ['elasticnet'], 'C': [0.1, 1, 10],
     'solver': ['saga'], 'l1_ratio': [0.25, 0.5, 0.75]},
    # C has no effect without a penalty, so it is not searched here.
    {'penalty': [None],
     'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
]

grid_search = GridSearchCV(model, grid_params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# predict() delegates to the best estimator, refit on the full training split.
ypred = grid_search.predict(X_test)

print(':Best parameters', grid_search.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not the accuracy measured on the full training set.
print('Accuracy score on Training Data', grid_search.best_score_)
print('Accuracy score on Testing Data', accuracy_score(y_test, ypred))

### Naive Bayes

In [None]:
 
# Evaluate Gaussian naive Bayes with 5-fold cross-validation.
model = GaussianNB()

# Empty grid: GridSearchCV evaluates exactly one candidate, the defaults.
grid_params = {}

grid_search = GridSearchCV(estimator=model, param_grid=grid_params, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# predict() delegates to the best estimator, refit on the full training split.
ypred = grid_search.predict(X_test)

print(':Best parameters', grid_search.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not the accuracy measured on the full training set.
print('Accuracy score on Training Data', grid_search.best_score_)
print('Accuracy score on Testing Data', accuracy_score(y_test, ypred))