# Classification example
UCI Wine recognition dataset. Three-class classification.

More information about the data here: https://scikit-learn.org/stable/datasets/index.html#wine-dataset

In [46]:
# Import packages
import numpy as np
import sklearn.datasets
import sklearn.linear_model

# Reproducible results
np.random.seed(42)

data = sklearn.datasets.load_wine()

n_train = int(data.data.shape[0] * 0.75 + 0.5)
n_val = int(data.data.shape[0] * 0.15 + 0.5)
n_test = int(data.data.shape[0] * 0.10 + 0.5) - 1

X = data.data[:n_train, :]
y = data.target[:n_train]
X_val = data.data[n_train:n_train + n_val, :]
y_val = data.target[n_train:n_train + n_val]
# Note! Do not touch the test data until the very end!
X_test = data.data[n_train + n_val:, :]
y_test = data.target[n_train + n_val:]

print(f"Training set size X  : {X.shape}")
print(f"Training set size y  : {y.shape}")
print(f"Validation set size X: {X_val.shape}")
print(f"Validation set size y: {y_val.shape}")
print(f"Test set size X      : {X_test.shape}")
print(f"Test set size y      : {y_test.shape}")
print(f"Output classes       : {set(y)}")
print(f"Feature names        : {data.feature_names}")

Training set size X  : (134, 13)
Training set size y  : (134,)
Validation set size X: (27, 13)
Validation set size y: (27,)
Test set size X      : (17, 13)
Test set size y      : (17,)
Output classes       : {0, 1, 2}
Feature names        : ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [0]:
# Task1: Preprocess the data
#  - Try without preprovessing, try with different kinds.
#  - Evaluate on the validation data
standard_scaler = sklearn.preprocessing.StandardScaler()
standard_scaler.fit(X)
X_ = standard_scaler.transform(X)
X_val_ = standard_scaler.transform(X_val)
X_test_ = standard_scaler.transform(X_test)

In [0]:
# Fit baseline model
model_baseline = sklearn.linear_model.LogisticRegression(
    penalty="none",
    tol=0.0001,
    fit_intercept=True,
    solver="lbfgs",
    max_iter=100,
    multi_class="multinomial")
_ = model_baseline.fit(X_, y)

In [49]:
# Evaluate baseline model
yhat = model_baseline.predict(X_)
yhat_val = model_baseline.predict(X_val_)
acc = sklearn.metrics.accuracy_score(y, yhat)
acc_val = sklearn.metrics.accuracy_score(y_val, yhat_val)
print(f"Training data accuracy  : {acc}")
print(f"Validation data accuracy: {acc_val:.2f}")

Training data accuracy  : 1.0
Validation data accuracy: 0.59


The model does not make any errors on the training data, and a larger error on the validation data. What does this mean? Can we do anything about it?

In [0]:
# Task 2: Find a better model
#  - Try different classification methods
#  - Evaluate them on the validation data
#  - Beat the baseline model and select the best one you can find

model = "... add your code here!"

_ = model.fit(X_, y)

In [43]:
# Evaluate better model
yhat = model.predict(X_)
yhat_val = model.predict(X_val_)
acc = sklearn.metrics.accuracy_score(y, yhat)
acc_val = sklearn.metrics.accuracy_score(y_val, yhat_val)
print(f"Training data accuracy  : {acc}")
print(f"Validation data accuracy: {acc_val:.2f}")

Training data accuracy  : 1.0
Validation data accuracy: 0.26


In [0]:
# Task 3: Determine the importance of the input variables
# ... your code here

In [45]:
# Evaluate the final model on the test data.
# This is only ever done once, and as the last thing we do.
# Training another model after this, based on the performance on the test data
# leads to biased results.
yhat = model.predict(X_)
yhat_val = model.predict(X_val_)
yhat_test = model.predict(X_test_)
acc = sklearn.metrics.accuracy_score(y, yhat)
acc_val = sklearn.metrics.accuracy_score(y_val, yhat_val)
acc_test = sklearn.metrics.accuracy_score(y_test, yhat_test)
print(f"Training data accuracy  : {acc}")
print(f"Validation data accuracy: {acc_val}")
print(f"Test data accuracy      : {acc_test}")

Training data accuracy  : 1.0
Validation data accuracy: 0.25925925925925924
Test data accuracy      : 0.35294117647058826
