In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

## 0. Data processing and visualization
load winequality-white.csv dataset and make exploratory data analysis

In [4]:
wine_quality = pd.read_csv("winequality-white.csv")

In [25]:
# TODO
data_values = []
for row in wine_quality.values:
    data_values.append(np.array(row[0].split(";"), dtype=np.float))
    
data_values = np.array(data_values)

### Create a normal pd.DataFrame from the wine_quality

In [26]:
# TODO
df_wine_quality = pd.DataFrame(data=data_values, columns=[col.replace("\"", "") for col in wine_quality.columns[0].split(";")])
df_wine_quality

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6.0
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5.0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6.0
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7.0


In [27]:
df_wine_quality.isnull().values.any()

False

### Choose the column "quality" as your target variabe.  Make its type integer.

In [28]:
# TODO
y = df_wine_quality.pop("quality").values.astype(np.int)
y

array([6, 6, 6, ..., 6, 7, 6])

### train_test_split your new DataFrame.

In [31]:
# TODO
X_train, X_test, y_train, y_test = train_test_split(df_wine_quality.values, y, test_size=0.25, shuffle=True, random_state=2021, stratify=y)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3673, 11), (3673,), (1225, 11), (1225,))

### 1. Decision trees classification
In this task we will try to predict wine quality based on its features by fitting a decision tree model. Fit a decision tree classifier by making a grid search over loss functions: 'giny', 'entropy' and over max_leaf_nodes parameter. Choose this parameters via 5-Fold cross-validation.

In [49]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# creating a KFold object with 5 splits 
# TODO
cv = KFold(n_splits=5)

# specify range of hyperparameters
# TODO
param_grid = {
    "criterion" : ["entropy", "gini"],
    "max_leaf_nodes" : np.arange(10, 200, 10)
}

# specify model
# TODO
tree = DecisionTreeClassifier()

# set up GridSearchCV()
# TODO
grid_search = GridSearchCV(tree, param_grid, scoring="accuracy", cv=cv, refit=True, verbose=3, n_jobs=-1)

# fit the cv model with training data
# TODO
search_result = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 38 candidates, totalling 190 fits


In [51]:
search_result.best_estimator_, search_result.best_score_, search_result.best_params_

(DecisionTreeClassifier(max_leaf_nodes=190),
 0.5417868727872619,
 {'criterion': 'gini', 'max_leaf_nodes': 190})

In [60]:
accuracy_score(y_test, search_result.best_estimator_.predict(X_test))

0.5730612244897959

### Choose the best parameters based on 'mean_test_score'.

In [62]:
# TODO
best_score_id = search_result.cv_results_["mean_test_score"].argmax()
best_params = search_result.cv_results_["params"][best_score_id]
best_params

{'criterion': 'gini', 'max_leaf_nodes': 190}

### Fit a DecisionTreeClassifier with the best parameters. Print f1_score and accuracy of the prediction.

In [63]:
# TODO
best_tree = DecisionTreeClassifier(**best_params)
best_tree.fit(X_train, y_train)

DecisionTreeClassifier(max_leaf_nodes=190)

In [73]:
"F1 Score", f1_score(y_test, best_tree.predict(X_test), average="micro", labels=np.unique(y))

('F1 Score', 0.5730612244897959)

In [74]:
"Accuracy", accuracy_score(y_test, best_tree.predict(X_test))

('Accuracy', 0.5730612244897959)

### Visualize the best model's tree diagram

In [85]:
# Importing required packages for visualization
from IPython.display import Image  
# from sklearn.externals.six import StringIO
from six import StringIO
from sklearn.tree import export_graphviz
import pydotplus
import graphviz

# Putting features
features = df_wine_quality.columns.difference(["quality"])
features

Index(['alcohol', 'chlorides', 'citric acid', 'density', 'fixed acidity',
       'free sulfur dioxide', 'pH', 'residual sugar', 'sulphates',
       'total sulfur dioxide', 'volatile acidity'],
      dtype='object')

In [88]:
# plotting the tree
dot_data = StringIO()  
export_graphviz(best_tree, out_file=dot_data, feature_names=features, filled=True, rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.set_size("25,15!") 
# graph.write_png('tree.png')
Image(graph.create_png())

InvocationException: Program terminated with status: -9. stderr follows: []

### 2. Comparison classification
Try to predict wine quality with other classification tools that we studied(Logistic Regression, SVM). Perform RandomizedSearchCV for each of them with desired hyperparameters and their ranges, choose the best model for each of them. Compare the accuracies and f-scores of all models on the test set and choose the best performing algorithm. 

In [None]:
# TODO

## 3. Decision trees regression
In this task we will use all the columns to predict alcohol concentration of a wine. Use the directives in task 1 as a guide to fit a Decision tree regressor.

In [26]:
X = df[df.columns.difference(['alcohol'])].copy()
y = df['alcohol'].copy()

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# creating a KFold object with 5 splits 
# TODO

# specify range of hyperparameters
# TODO

# specify model
# TODO

# set up GridSearchCV()
# TODO

# fit the cv model with training data
# TODO

In [15]:
best_score, best_score_id = model_cv.cv_results_['mean_test_score'].max(),model_cv.cv_results_['mean_test_score'].argmax()
best_hyperparams = model_cv.cv_results_['params'][best_score_id]

print("The best test score is {0} corresponding to hyperparameters {1}".format(best_score, best_hyperparams))

In [30]:
model = DecisionTreeRegressor(# best_hyperparams)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [31]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print('MAE: ', mean_absolute_error(y_test, y_pred))
print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R^2: ', r2_score(y_test, y_pred))

MAE:  0.3635662666326438
MSE:  0.2406038233162766
RMSE:  0.4905138360090127
R^2:  0.847212724398044


## 4. Comparison regression
Predict wine alcohol concentration with Linear Regression. Compare mean absolute errors and rooted mean squared errors. What is the best model?

In [18]:
# TODO


In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print('MAE: ', mean_absolute_error(y_test, lReg2_pred))
print('MSE: ', mean_squared_error(y_test, lReg2_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, lReg2_pred)))
print('R^2: ', r2_score(y_test, lReg2_pred))