# WIND DATA CLASSIFICATION

# 1 - IMPORTS FOR NOTEBOOK

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# 2 - LOAD WINE DATA

In [34]:
from sklearn.datasets import load_wine
wine_data = load_wine()

# 3 - CONVERT TO PANDAS DATAFRAME

### 3.2 - DATA OVERVIEW
<p>Attribute Information:</p>
<ol>
    <li><b>Alcohol</b></li>
    <li><b>Malic acid</b></li>
    <li><b>Ash</b></li>
    <li><b>Alcalinity of ash</b></li>
    <li><b>Magnesium</b></li>
    <li><b>Total phenols</b></li>
    <li><b>Flavanoids</b></li>
    <li><b>Nonflavanoid phenols</b></li>
    <li><b>Proanthocyanins</b></li>
    <li><b>Color intensity</b></li>
    <li><b>Hue</b></li>
    <li><b>OD280/OD315 of diluted wines</b></li>
    <li><b>Proline</b></li>
</ol>

In [35]:
pd_wine_data = pd.DataFrame(wine_data.data,columns=['ALCOHOL','MALIC-ACID','ASH','ACLALINITY-ASH',
                                                    'MAGNESIUM','TOT-PHENLOS','FLAVANOIDS','NON-FLAV-PHENOLS',
                                                    'PRO-ANTHO','COLOR-INTENSITY','HUE','DILUTE','PROLINE'])

In [36]:
wine_data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [37]:
pd_wine_target = pd.DataFrame(wine_data.target,columns=['target'])

In [38]:
pd_wine_target

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
173,2
174,2
175,2
176,2


# 4 TRAIN TEST SPLIT

In [39]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(pd_wine_data, pd_wine_target, test_size=0.33, random_state=2020)

In [40]:
unique, count = np.unique(test_y, return_counts=True)

In [41]:
class_count = len(unique)

# CLASSIFICATION METRICS

In [42]:
metrics_list={}

In [43]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def mertics_function(test_y, y_pred,class_count,model_name,metrics_list,comments):
    metrics_list['model_name'] = model_name
    metrics_list['accuracy_score'] = accuracy_score(test_y, y_pred)
    metrics_list['balanced_accuracy_score'] = balanced_accuracy_score(test_y, y_pred)
    #metrics_list['average_precision_score'] = average_precision_score(test_y, y_pred)
    #metrics_list['brier_score_loss'] = brier_score_loss(test_y, y_pred)
    #metrics_list['f1_score'] = f1_score(test_y, y_pred)
    #metrics_list['precision_score'] = precision_score(test_y, y_pred)
    #metrics_list['recall_score']=recall_score(test_y, y_pred)
    metrics_list['comments']=comments

# LOGISTIC REGRESSION

In [44]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(train_x, train_y)
y_pred = clf.predict(test_x)
mertics_function(test_y,y_pred,class_count,"Logistic Regression",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = pd.DataFrame([metrics_list])

  y = column_or_1d(y, warn=True)


# SVM

In [45]:
from sklearn import svm
svm_clf = svm.SVC()
svm_clf.fit(train_x, train_y)
y_pred = svm_clf.predict(test_x)
mertics_function(test_y,y_pred,class_count,"SVM",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# SGD

In [46]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
sgd_clf.fit(train_x, train_y)
y_pred = sgd_clf.predict(test_x)
mertics_function(test_y,y_pred,class_count,"SGD",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# DECISION TREE

In [47]:
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()
tree_clf = clf.fit(train_x, train_y)
y_pred = tree_clf.predict(test_x)
mertics_function(test_y,y_pred,class_count,"Decision Tree",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# RANDOM FOREST

In [48]:
from sklearn.ensemble import RandomForestClassifier
rand_clf = RandomForestClassifier(n_estimators=10)
rand_clf = clf.fit(train_x, train_y)
y_pred = rand_clf.predict(test_x)
mertics_function(test_y,y_pred,class_count,"Random Forest",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# GRADIENT DESCENT

In [49]:
from sklearn.ensemble import GradientBoostingClassifier
grad_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(train_x, train_y)
y_pred = grad_clf.predict(test_x)
mertics_function(test_y,y_pred,class_count,"Gradient Boost",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# ADA BOOST

In [50]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(n_estimators=100).fit(train_x, train_y)
y_pred = ada_clf.predict(test_x)
mertics_function(test_y,y_pred,class_count,"Ada Boost",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


In [51]:
metrics_pd

Unnamed: 0,model_name,accuracy_score,balanced_accuracy_score,comments
0,Logistic Regression,0.983051,0.982456,Straight data without any change
0,SVM,0.491525,0.392231,Straight data without any change
0,SGD,0.644068,0.576923,Straight data without any change
0,Decision Tree,0.983051,0.982456,Straight data without any change
0,Random Forest,0.983051,0.982456,Straight data without any change
0,Gradient Boost,0.966102,0.964912,Straight data without any change
0,Ada Boost,0.932203,0.928282,Straight data without any change
