### Classification Project

In [None]:
# 1. Model Training
# 2. Best Model
# 3. Tuning the best Model

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier
from xgboost import XGBClassifier
#Grid Search
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
#pipeline
from sklearn.pipeline import Pipeline
#metrics 
from sklearn.metrics import accuracy_score

In [23]:
# Candidate classifiers: key = model name (the class name), value = a
# default-constructed model object. Insertion order follows the tuple below.
models={
    estimator_cls.__name__:estimator_cls()
    for estimator_cls in (
        LogisticRegression,
        DecisionTreeClassifier,
        KNeighborsClassifier,
        SVC,
        GaussianNB,
        MultinomialNB,
        AdaBoostClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
        XGBClassifier,
    )
}

### Model Training

In [28]:
def model_training(models,xtrain,xtest,ytrain,ytest,scale_flag):
    """Train every model in ``models`` and report its accuracy on the test split.

    Each model is wrapped in a ``Pipeline`` (optionally preceded by a scaler),
    fitted on the training split and scored with ``accuracy_score`` against
    the test split.

    Parameters
    ----------
    models : dict
        Maps a model name (str) to a model object.
    xtrain, xtest
        Training / testing attributes.
    ytrain, ytest
        Training / testing class labels.
    scale_flag : int
        1 -> StandardScaler, 2 -> MinMaxScaler, any other value -> no scaling.

    Returns
    -------
    dict
        Maps model name to accuracy for every model that trained and
        predicted successfully. A model that raises is reported (with its
        name and the error) and left out of the result.
    """
    acc_result={}
    for name,model in models.items():
        try:
            steps=[]
            if scale_flag==1:
                steps.append(('StandardScaler',StandardScaler()))
            elif scale_flag==2:
                steps.append(('MinMaxScaler',MinMaxScaler()))
            steps.append(('model',model))
            model_pipeline=Pipeline(steps)
            #Model Training and Prediction
            model_pipeline.fit(xtrain,ytrain)
            ypred=model_pipeline.predict(xtest)
            acc=accuracy_score(ytest,ypred)
        except Exception as exc:
            # Keep going with the remaining models, but say which one failed
            # and why. ``Exception`` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate so a long run can still be stopped.
            print("Something went wrong while training {}: {!r}".format(name,exc))
            continue
        print("Accuracy for {} is {}".format(name, acc))
        acc_result[name]=acc
    return acc_result
    

### Function for getting the best model

In [13]:
def bestModel(model_result):
    """Print the name and accuracy of the highest-scoring model.

    Parameters
    ----------
    model_result : dict
        Maps model name to accuracy, as returned by ``model_training``.

    Notes
    -----
    On ties the first model in iteration order wins. An empty dict prints a
    notice instead of raising, and a best accuracy of 0 is reported normally.
    """
    if not model_result:
        print("No model results to compare")
        return
    # max() returns the first maximal item, so ties resolve to the earliest entry.
    model_name,high=max(model_result.items(),key=lambda item:item[1])
    print("Best Model is ",model_name,"with Accuracy ",high)

### Function for tuning the hyperparameters

In [32]:
def bestParameters(model,params,xtrain,ytrain,random_state=0):
    """Grid-search ``params`` for ``model`` and print the best setting found.

    Uses 5-fold stratified cross-validation repeated 3 times, scored by
    accuracy.

    Parameters
    ----------
    model
        Estimator passed to ``GridSearchCV``.
    params : dict
        Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    xtrain, ytrain
        Training attributes and class labels the search is fitted on.
    random_state : int or None, default 0
        Seed for the cross-validation splitter so the folds, and therefore
        the reported score, are the same on every run. Pass ``None`` for
        unseeded splits. Randomness inside ``model`` itself is not
        controlled here.
    """
    cv=RepeatedStratifiedKFold(n_splits=5,n_repeats=3,random_state=random_state)
    grid_cv=GridSearchCV(estimator=model,param_grid=params,cv=cv,scoring='accuracy')
    res=grid_cv.fit(xtrain,ytrain)
    print("Best Parameters are :",res.best_params_)
    print("Best Accuracy is :",res.best_score_)

In [15]:
# Load the sonar dataset; header=None because the first row is read as data,
# so the columns get integer labels.
df=pd.read_csv("../Data/sonar.all-data.csv",header=None)

In [16]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [17]:
# Encode the class label in column 60: 'M' -> 0, 'R' -> 1.
# The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run: without
# them a second execution would map the already-encoded values to NaN.
df[60]=df[60].map({'M':0,'R':1,0:0,1:1})

In [18]:
# Column 60 is the class label; every other column is an attribute.
y=df[60]
x=df.drop(60,axis=1)

In [21]:
# Hold out 20% of the rows for testing; random_state fixes the split.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=0)

In [29]:
# Train every candidate model with scale_flag=1 (standard scaling).
# NOTE(review): the recorded output shows one model failing, in the position
# of MultinomialNB -- presumably because standardized features can be
# negative; confirm and consider scale_flag=2 (minmax) for that model.
model_result=model_training(models,xtrain,xtest,ytrain,ytest,1)

Accuracy for LogisticRegression is 0.8809523809523809
Accuracy for DecisionTreeClassifier is 0.6428571428571429
Accuracy for KNeighborsClassifier is 0.8095238095238095
Accuracy for SVC is 0.8571428571428571
Accuracy for GaussianNB is 0.6666666666666666
Something went wrong
Accuracy for AdaBoostClassifier is 0.8809523809523809
Accuracy for GradientBoostingClassifier is 0.9047619047619048
Accuracy for RandomForestClassifier is 0.8809523809523809
Accuracy for XGBClassifier is 0.8809523809523809




In [30]:
# Report which trained model scored the highest test accuracy.
bestModel(model_result)

Best Model is  GradientBoostingClassifier with Accuracy  0.9047619047619048


In [33]:
# Tune the number of boosting stages for the best-scoring model type.
grid={"n_estimators":[10,50,100,1000]}
model=GradientBoostingClassifier()
bestParameters(model,grid,xtrain,ytrain)

Best Parameters are : {'n_estimators': 1000}
Best Accuracy is : 0.8073677956030897
