In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the wine dataset from the CSV that sits next to the notebook.
wine = pd.read_csv(filepath_or_buffer='wine.csv')

In [None]:
# Peek at the first five rows to see which columns are available.
wine.head(n=5)

In [None]:
# Distinct values of the textual class column.
wine['class_name'].unique()

In [None]:
# Target is the numeric class label; features are every column except the
# two label columns.
y = wine['class_label']
X = wine.drop(columns=['class_label', 'class_name'])

# 70/30 stratified hold-out split with a fixed seed so results are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y
)

In [None]:
# First five rows of the training features.
X_train.head(n=5)

In [None]:
# Scatter the training samples on the 1st and 7th feature columns,
# coloured by their class label, to visualise the different wine classes.
fig, ax = plt.subplots()
ax.scatter(X_train.iloc[:, 0], X_train.iloc[:, 6], c=y_train)
ax.grid(True)
plt.show()

##  Naive Bayes
### It is the most straightforward and fast classification algorithm
* Statistical classification technique based on Bayes Theorem
* Naive Bayes classifiers have high accuracy and speed on large datasets
* Naive Bayes classifier assumes that the effect of a particular feature in a class is independent of other features. That is, the presence of a particular feature in a class is unrelated to the presence of any other feature. That's why it is called Naive.

* It performs better when the response variable is discrete than when it is continuous.
* If the training data contains no sample of a particular class (or of a feature value within a class), the corresponding probability estimate is zero, which forces the posterior probability to zero.
* It performs better with categorical input variables than with numerical ones.
* For numerical variables, a normal distribution is assumed (bell curve, which is a strong assumption).

__P(c|x)= P(x|c)*P(c) / P(x)__

In [None]:
# Worked example of the formula above, with the terms in the same order:
# P(x|c) = 2/9, P(c) = 9/14, P(x) = 5/14  ->  P(c|x)
(2/9)*(9/14)/(5/14)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Gaussian Naive Bayes on the wine split; `fit` returns the estimator itself,
# so construction and training can be chained.
nb = GaussianNB().fit(X_train, y_train)

# Mean accuracy on the held-out samples.
nb.score(X_test, y_test)

__Things you can try:__
* What would be accuracy if we do the PCA?
* How many PCA components give the best accuracy

In [None]:
# Logistic regression with default settings, for comparison with Naive Bayes.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)

In [None]:
# Decision tree for comparison. scikit-learn randomly permutes the candidate
# features at every split, so the tree is seeded here (same seed as the wine
# train/test split) to make the reported accuracy repeatable across runs.
dt = DecisionTreeClassifier(random_state=21)
dt.fit(X_train, y_train).score(X_test, y_test)

__Naive Bayes won't be great in all the cases__

In [None]:
# Load the Titanic training data and drop the identifier-like columns.
ti = pd.read_csv('titanic_train.csv')
ti = ti.drop(['Name', 'Ticket', 'Unnamed: 0', 'PassengerId'], axis=1)

# Keep only the first character of each cabin string. This must happen
# BEFORE X is derived from `ti`: `DataFrame.drop` returns a new frame, so
# changing `ti.Cabin` afterwards would never reach X and every distinct cabin
# string would become its own dummy column.
ti.Cabin = ti.Cabin.str[0]

# Features / target.
X = ti.drop('Survived', axis=1)
y = ti.Survived

# Missing-value handling.
# Embarked is treated as a categorical below, so impute it with the most
# frequent value; a mean is undefined for string codes and would create a
# non-existent category for numeric ones.
X.Embarked = X.Embarked.fillna(X.Embarked.mode()[0])
X.Child = X.Child.fillna(-1)   # -1 acts as an explicit "unknown" level
X.Cabin = X.Cabin.fillna('Z')  # 'Z' acts as an explicit "no cabin recorded" level

# Mark the discrete columns as categoricals so that get_dummies one-hot
# encodes them (numeric columns such as Pclass would otherwise be left as-is).
for col in ['Cabin', 'Pclass', 'Sex', 'Embarked', 'Child']:
    X[col] = pd.Categorical(X[col])
X = pd.get_dummies(X)

# 70/30 stratified hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=29, stratify=y
)

In [None]:
# Gaussian Naive Bayes on the Titanic split; `nb` is reused by the metric
# cells further down.
nb = GaussianNB()
nb.fit(X_train, y_train)
nb.score(X_test, y_test)

In [None]:
# Logistic regression on the same Titanic split, for comparison.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)

In [None]:
# Decision tree on the Titanic split. Seeded (same seed as the Titanic
# train/test split) because scikit-learn randomly permutes candidate features
# at each split, which otherwise makes the score vary from run to run.
dt = DecisionTreeClassifier(random_state=29)
dt.fit(X_train, y_train).score(X_test, y_test)

### Metrics to take care of Class Imbalance: 

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Confusion matrix of the Naive Bayes predictions on the held-out set
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_true=y_test, y_pred=nb.predict(X_test))
cm

In [None]:
# Number of held-out samples per class, to compare against the confusion matrix.
y_test.value_counts()

__Confusion Matrix__

<img src="Confusion Matrix.PNG" width="450" align='left'>

In [None]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
# The report is a multi-line string, so it is printed rather than displayed.
cr = classification_report(y_true=y_test, y_pred=nb.predict(X_test))
print(cr)

In [None]:
# Hand-computed ratio from hard-coded counts.
# NOTE(review): 162 and 90 look like entries copied from a previous run's
# confusion matrix (presumably a per-class precision) - verify against `cm`
# whenever the data preparation or the split changes.
162/(162+90)

### __Accuracy = ( tp + tn) /(tp + tn + fp + fn)__

### __Precision = tp / (tp + fp) for the positive class, or tn / (tn + fn) for the negative class__

### __Recall / hit rate = tp / (tp + fn) for the positive class, or tn / (tn + fp) for the negative class__

### __F1 score /Harmonic mean = 2*Precision * Recall / (Precision + Recall)__


## Support Vector Machine

#### Only support vectors affect fit

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, scored on the hold-out set.
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

#### Transformations through kernels

In [None]:
# Same classifier with an RBF kernel and a hand-picked gamma of 1.
svm = SVC(kernel='rbf', gamma=1)
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

__How do we know which combination of Hyperparameters is right for us ?__

## K Nearest Neighbours

<img src="KNN.png" width="450" align='left'>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier, scored on the hold-out set.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

## Hyper Parameter Tuning

### __Cross Validation: a more reliable performance estimate than a single train/test split__
* Takes more computation time

__Problems with Train Test Split__

__Testing Accuracy with different splits and random_states:__

In [None]:
split_list = [0.1, 0.2, 0.3, 0.4, 0.5]
Accuracy_list = []

# Hold-out accuracy of a 10-NN classifier as the test fraction varies,
# with the split seed held fixed at 21.
for test_fraction in split_list:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_fraction, random_state=21, stratify=y
    )
    knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
    Accuracy_list.append(knn.score(X_test, y_test))

# Accuracy versus test fraction.
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(split_list, Accuracy_list)
ax.grid(True)
plt.show()

In [None]:
random_list = [1, 2, 3, 4, 5]
Accuracy_list = []

# Hold-out accuracy of a 10-NN classifier as the split seed varies,
# with the test fraction held fixed at 0.4.
for seed in random_list:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=seed, stratify=y
    )
    knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
    Accuracy_list.append(knn.score(X_test, y_test))

# Accuracy versus split seed.
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(random_list, Accuracy_list)
ax.grid(True)
plt.show()

__5 Fold Cross Validation__

<img src="Cross_Validation.PNG" width="250" align='left'>

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the k-NN estimator on the full data set.
scores = cross_val_score(estimator=knn, X=X, y=y, cv=5)
scores

In [None]:
# Expected accuracy: the mean over the cross-validation folds.
exp_score = scores.mean()
exp_score

In [None]:
# Spread of the fold accuracies (population standard deviation).
exp_dev = scores.std()
exp_dev

__Using Cross Validation for Hyper Parameter Tuning__

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate gamma values to search over.
parameters = {'gamma': [0.001, 0.01, 0.1, 10, 100]}

# Base estimator: SVM with a Radial Basis Function (RBF) kernel.
svm = SVC(kernel='rbf')

In [None]:
%%time
searcher = GridSearchCV(svm, parameters, cv=3)
searcher.fit(X,y)
# Report the best parameters
print('Best Parameter: ', searcher.best_params_)
print('Best Score: ', searcher.best_score_)

In [None]:
%%time
svm = SVC()
parameters = {'gamma':[0.001, 0.01, 0.1, 10, 100], 'C':[0.001, 0.01, 0.1, 10, 100] }
searcher = RandomizedSearchCV(svm, parameters, cv=3)
searcher.fit(X,y)
# Report the best parameters
print('Best Parameter: ', searcher.best_params_)
print('Best Score: ', searcher.best_score_)

In [None]:
%%time
svm = SVC()
parameters = {'gamma':[0.001, 0.01, 0.1, 10, 100], 'C':[0.001, 0.01, 0.1, 10, 100] }
searcher = GridSearchCV(svm, parameters, cv=3)
searcher.fit(X,y)
# Report the best parameters
print('Best Parameter: ', searcher.best_params_)
print('Best Score: ', searcher.best_score_)

__Try this at home__

In [None]:
%%time
svm = SVC()
parameters = {'gamma':[0.001, 0.01, 0.1, 10, 100], 'kernel':['rbf', 'poly', 'sigmoid'] }
searcher = RandomizedSearchCV(svm, parameters, cv=3)
searcher.fit(X,y)
# Report the best parameters
print('Best Parameter: ', searcher.best_params_)
print('Best Score: ', searcher.best_score_)

---

# Classification Models: which one to choose?

### Depends on the Situation:
   * Accuracy
   * Computational time - time availability
   * Gravity of the problem being solved
   * Understanding of the 'black box'
    
## My general observations:

### Linearly separable data:
   * Logistic: Binary 
   * SVM: Multiclass
   * Naive Bayes: Speed is Critical
    
### Non-linearly separable data:
   * Decision trees / Random Forest: Suitable for Large Datasets
   * SVM with kernels : 
        * Faster than kernelized logistic regression. 
        * Suitable for small simpler datasets. 
        * Slower than Decision trees for large datasets
   * Naive Bayes: Speed is Critical