# 7d) Classification
### Data preparation

In [1]:
# conda install -c conda-forge pandas_ml

In [2]:
import sklearn
import pandas as pandas
import numpy as np
import pandas_ml
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

import warnings
#ignore the future warning and calc error if no sample from a label 
warnings.simplefilter(action='ignore')

#load data and merge both tables to one, separate quality label
redwinedata = pandas.read_csv('data/winequality-red.csv', sep =';')
whitewinedata = pandas.read_csv('data/winequality-white.csv', sep =';')
concat_data = redwinedata.append(whitewinedata, ignore_index=True)
y = (concat_data[['quality']])
y = y.values.ravel()
X = concat_data.drop('quality', axis=1)
# convert to nparray
X = X.values
# scaler = preprocessing.StandardScaler().fit(X_train)
# using scaler to be able to use it on testset with same settings with scaler.transform(testset)

### Splitting  and normalizing data

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Normalizing before or after splitting: https://goo.gl/c1GQpE
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
# before testing do scaler.transform(X_test)


### Remarks:     
Macro vs Micro Average in Scores:    
macro takes the mean of each labels individual score    
micro takes overall recall independant of labels. Take this because we are interested in the overall performance not label-wise    
Precision vs Recall in non-binary case:
https://datascience.stackexchange.com/questions/32032/precision-and-recall-if-not-binary/32034#32034


### Support Vector Machines

In [4]:
# set model, fit and predict
from sklearn.svm import SVC
svm = SVC(kernel='rbf')
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Training-Set Results

In [5]:
y_pred_train = svm.predict(X_train)
print("Confusion Matrix:")
print(pandas_ml.ConfusionMatrix(y_train, y_pred_train))
print("\n"+"Classification Scores:")
print(classification_report(y_train, y_pred_train))

Confusion Matrix:
Predicted  3  4     5     6    7  8  9  __all__
Actual                                         
3          1  0     9    10    0  0  0       20
4          0  2    83    60    0  0  0      145
5          0  0   956   477    4  0  0     1437
6          0  0   362  1475   61  0  0     1898
7          0  0    25   505  194  0  0      724
8          0  0     0    92   33  0  0      125
9          0  0     0     2    1  0  0        3
__all__    1  2  1435  2621  293  0  0     4352

Classification Scores:
             precision    recall  f1-score   support

          3       1.00      0.05      0.10        20
          4       1.00      0.01      0.03       145
          5       0.67      0.67      0.67      1437
          6       0.56      0.78      0.65      1898
          7       0.66      0.27      0.38       724
          8       0.00      0.00      0.00       125
          9       0.00      0.00      0.00         3

avg / total       0.61      0.60      0.57      4352

#### Test-Set Results 

In [6]:
y_pred = svm.predict(scaler.transform(X_test))
print("Confusion Matrix:")
print(pandas_ml.ConfusionMatrix(y_test, y_pred))
print("\n"+"Classification Scores:")
print(classification_report(y_test, y_pred))
 

Confusion Matrix:
Predicted  3  4    5     6    7  8  9  __all__
Actual                                        
3          0  0    4     6    0  0  0       10
4          0  0   48    22    1  0  0       71
5          0  0  456   243    2  0  0      701
6          0  0  185   704   49  0  0      938
7          0  0   10   271   74  0  0      355
8          0  0    1    45   22  0  0       68
9          0  0    0     0    2  0  0        2
__all__    0  0  704  1291  150  0  0     2145

Classification Scores:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        10
          4       0.00      0.00      0.00        71
          5       0.65      0.65      0.65       701
          6       0.55      0.75      0.63       938
          7       0.49      0.21      0.29       355
          8       0.00      0.00      0.00        68
          9       0.00      0.00      0.00         2

avg / total       0.53      0.58      0.54      2145



### Multi-layer Perceptron

In [7]:
# set model, fit and predict
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='adam',hidden_layer_sizes=(100, 100), random_state=1, max_iter=1000, learning_rate='adaptive')
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

#### Training set results

In [8]:
y_pred_train = mlp.predict(X_train)
print("Confusion Matrix:")
print(pandas_ml.ConfusionMatrix(y_train, y_pred_train))
print("\n"+"Classification Scores:")
print(classification_report(y_train, y_pred_train))

Confusion Matrix:
Predicted   3   4     5     6    7   8  9  __all__
Actual                                            
3          11   1     4     4    0   0  0       20
4           0  66    51    25    3   0  0      145
5           0   7  1226   184   19   1  0     1437
6           0   6   243  1534  113   2  0     1898
7           0   1    35   173  506   9  0      724
8           0   0     5    27   37  56  0      125
9           0   0     0     1    2   0  0        3
__all__    11  81  1564  1948  680  68  0     4352

Classification Scores:
             precision    recall  f1-score   support

          3       1.00      0.55      0.71        20
          4       0.81      0.46      0.58       145
          5       0.78      0.85      0.82      1437
          6       0.79      0.81      0.80      1898
          7       0.74      0.70      0.72       724
          8       0.82      0.45      0.58       125
          9       0.00      0.00      0.00         3

avg / total       0.78

#### Test-Set Results

In [9]:
y_pred =  mlp.predict(scaler.transform(X_test))
print("Confusion Matrix:")
print(pandas_ml.ConfusionMatrix(y_test, y_pred))
print("\n"+"Classification Scores:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
Predicted  3   4    5    6    7   8  9  __all__
Actual                                         
3          0   3    5    2    0   0  0       10
4          2   6   47   15    1   0  0       71
5          0  18  472  199   12   0  0      701
6          0   5  217  594  117   5  0      938
7          0   2   19  161  166   7  0      355
8          0   3    2   26   30   7  0       68
9          0   0    0    1    1   0  0        2
__all__    2  37  762  998  327  19  0     2145

Classification Scores:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        10
          4       0.16      0.08      0.11        71
          5       0.62      0.67      0.65       701
          6       0.60      0.63      0.61       938
          7       0.51      0.47      0.49       355
          8       0.37      0.10      0.16        68
          9       0.00      0.00      0.00         2

avg / total       0.56      0.58      0.57      2145

### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
# set model, fit and predict
rfc = RandomForestClassifier(max_depth=10, random_state=0)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

#### Training-Set Results

In [11]:
y_pred_train = rfc.predict(X_train)
print("Confusion Matrix:")
print(pandas_ml.ConfusionMatrix(y_train, y_pred_train))
print("\n"+"Classification Scores:")
print(classification_report(y_train, y_pred_train))

Confusion Matrix:
Predicted   3   4     5     6    7   8  9  __all__
Actual                                            
3          11   0     4     5    0   0  0       20
4           0  51    54    37    3   0  0      145
5           0   0  1215   215    7   0  0     1437
6           0   0   156  1718   24   0  0     1898
7           0   0    14   201  508   1  0      724
8           0   0     0    47   28  50  0      125
9           0   0     0     2    1   0  0        3
__all__    11  51  1443  2225  571  51  0     4352

Classification Scores:
             precision    recall  f1-score   support

          3       1.00      0.55      0.71        20
          4       1.00      0.35      0.52       145
          5       0.84      0.85      0.84      1437
          6       0.77      0.91      0.83      1898
          7       0.89      0.70      0.78       724
          8       0.98      0.40      0.57       125
          9       0.00      0.00      0.00         3

avg / total       0.83

#### Test-Set Results

In [12]:
y_pred = rfc.predict(scaler.transform(X_test))
print("Confusion Matrix:")
print(pandas_ml.ConfusionMatrix(y_test, y_pred))
print("\n"+"Classification Scores:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
Predicted  3  4    5     6    7  8  9  __all__
Actual                                        
3          0  0    4     5    1  0  0       10
4          0  3   49    18    1  0  0       71
5          0  1  486   201   13  0  0      701
6          0  1  172   689   74  2  0      938
7          0  0   10   183  160  2  0      355
8          0  0    2    40   21  5  0       68
9          0  0    0     0    2  0  0        2
__all__    0  5  723  1136  272  9  0     2145

Classification Scores:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        10
          4       0.60      0.04      0.08        71
          5       0.67      0.69      0.68       701
          6       0.61      0.73      0.66       938
          7       0.59      0.45      0.51       355
          8       0.56      0.07      0.13        68
          9       0.00      0.00      0.00         2

avg / total       0.62      0.63      0.60      2145



### Comparison

As expected, the multilayer perceptron (mlp) and random forest have a very high fit on the training data. This fit can be increased by increasing the layer size (mlp) or maximum depth (random forrest).     

This nicely shows the effect of overfitting, because  there is only a positive effect on the training- but not on the test-set-fit.

Overall, the results on the test-sets were quite similar. 
Support Vector Machines (SVM) have the worst performance. I am assuming this is because the (normalized) data is very dense and it is therefor hard to make a "distance"-based classification.
Furthermore random forest outperforms the multi layer peceptron. It is possible that this is caused by the relatively small sample in regard to a neural net approach like MLP.

# 7e) Performance >8

Only the random forrest was able to correctly predict scores of 8 (or above) in the test sets.     
On one hand this is due to the fact that there are only very few samples with extreme values (see histogram below).    
Splitting this data in test and training sets aggravates this issue even more.    
Therefore one has to be very careful with conclusions about the quality of the algorithms. It seems like random forrest is most able to predict extreme values one should not draw a conclusion about the quality of the algorithm. Especially the training set predictions of 8 or above are a bad indicator because they reflect overfitting.

In [13]:
import plotly.plotly as py
import plotly.graph_objs as go
hist = y
data = [go.Histogram(x=hist)]
py.iplot(data, filename='basic histogram')

# 7f) Boosting

In [14]:
from sklearn.ensemble import VotingClassifier
clf1 = SVC()
clf2 = MLPClassifier(solver='adam',hidden_layer_sizes=(100, 100), random_state=1, max_iter=1000, learning_rate='adaptive')
clf3 = RandomForestClassifier(max_depth=10, random_state=0)
combinedClassifier = VotingClassifier(estimators=[('svm', clf1), ('mlp', clf2), ('rfc', clf3)], weights=[1,1,0], voting='hard', flatten_transform=True)
combinedClassifier.fit(X_train, y_train)

VotingClassifier(estimators=[('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)), ('mlp', MLPClassifier(activation='relu', alpha=0.0001...estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))],
         flatten_transform=True, n_jobs=1, voting='hard',
         weights=[1, 1, 0])

#### Training-Set Result

In [15]:
y_pred_train = combinedClassifier.predict(X_train)
print("Confusion Matrix:")
print(pandas_ml.ConfusionMatrix(y_train, y_pred_train))
print("\n"+"Classification Scores:")
print(classification_report(y_train, y_pred_train))


Confusion Matrix:
Predicted   3   4     5     6    7  8  9  __all__
Actual                                           
3          11   1     4     4    0  0  0       20
4           0  66    58    21    0  0  0      145
5           0   7  1259   168    3  0  0     1437
6           0   6   439  1417   36  0  0     1898
7           0   1    52   490  181  0  0      724
8           0   0     5    88   32  0  0      125
9           0   0     0     2    1  0  0        3
__all__    11  81  1817  2190  253  0  0     4352

Classification Scores:
             precision    recall  f1-score   support

          3       1.00      0.55      0.71        20
          4       0.81      0.46      0.58       145
          5       0.69      0.88      0.77      1437
          6       0.65      0.75      0.69      1898
          7       0.72      0.25      0.37       724
          8       0.00      0.00      0.00       125
          9       0.00      0.00      0.00         3

avg / total       0.66      0.67

#### Test-Set Result

In [16]:
y_pred = rfc.predict(scaler.transform(X_test))
print("Confusion Matrix:")
print(pandas_ml.ConfusionMatrix(y_test, y_pred))
print("\n"+"Classification Scores:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
Predicted  3  4    5     6    7  8  9  __all__
Actual                                        
3          0  0    4     5    1  0  0       10
4          0  3   49    18    1  0  0       71
5          0  1  486   201   13  0  0      701
6          0  1  172   689   74  2  0      938
7          0  0   10   183  160  2  0      355
8          0  0    2    40   21  5  0       68
9          0  0    0     0    2  0  0        2
__all__    0  5  723  1136  272  9  0     2145

Classification Scores:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        10
          4       0.60      0.04      0.08        71
          5       0.67      0.69      0.68       701
          6       0.61      0.73      0.66       938
          7       0.59      0.45      0.51       355
          8       0.56      0.07      0.13        68
          9       0.00      0.00      0.00         2

avg / total       0.62      0.63      0.60      2145



## Conclusion
Given that the classifications are quite similar, boosting doesn't improve the result above the output from the Random Forrest. Because of the algorithms chosen, only a 'hard' and not a confidence based boosting was possible. This could also have influenced the result negatively.    
Interestingly, if the weights are put only on SVM and MLP, the results reach the same level as the Random Forrest. It therefore can be agued that boosting works when combining those two but they are not able to outperform the approach with Random Forrest.