In [1]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.utils import resample
import pandas as pd
import numpy as np
import re

In [2]:
TRAINING_FILE = "TrainingDataset.csv"
VALIDATION_FILE = "ValidationDataset.csv"

In [3]:
training = pd.read_csv(TRAINING_FILE)
validation = pd.read_csv(VALIDATION_FILE)

In [4]:
print(training.head())
print(validation.head())

  ""fixed acidity"";""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality""
0  8.9;0.22;0.48;1.8;0.077;29;60;0.9968;3.39;0.53...                                                                                                                                               
1  7.6;0.39;0.31;2.3;0.082;23;71;0.9982;3.52;0.65...                                                                                                                                               
2  7.9;0.43;0.21;1.6;0.106;10;37;0.9966;3.17;0.91...                                                                                                                                               
3  8.5;0.49;0.11;2.3;0.084;9;67;0.9968;3.17;0.53;...                                                                                                                                               
4  6.9;0.4;0.14;2.4;

In [5]:
print(F"Training Columns Before: {training.columns[0]}")
training_columns = training.columns[0].split(";")
training_columns = [re.sub("[^a-z\s]", "", column) for column in training_columns]
print(F"Training Columns After: {training_columns}")

Training Columns Before: ""fixed acidity"";""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality""
Training Columns After: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'p', 'sulphates', 'alcohol', 'quality']


In [6]:
print(F"Validation Columns Before: {validation.columns[0]}")
validation_columns = validation.columns[0].split(";")
validation_columns = [re.sub("[^a-z]", "", column) for column in training_columns]
print(F"Validation Columns After: {validation_columns}")

Validation Columns Before: ""fixed acidity"";""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality""
Validation Columns After: ['fixedacidity', 'volatileacidity', 'citricacid', 'residualsugar', 'chlorides', 'freesulfurdioxide', 'totalsulfurdioxide', 'density', 'p', 'sulphates', 'alcohol', 'quality']


In [7]:
corrected_training = pd.DataFrame(columns=training_columns, data=training[training.columns[0]].str.split(";", expand=True).values)
corrected_validation = pd.DataFrame(columns=validation_columns, data=validation[validation.columns[0]].str.split(";", expand=True).values)

for column in training_columns:
    corrected_training[column] = corrected_training[column].astype(float)

for column in validation_columns:
    corrected_validation[column] = corrected_validation[column].astype(float)

print(corrected_training.head())
print(corrected_validation.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            8.9              0.22         0.48             1.8      0.077   
1            7.6              0.39         0.31             2.3      0.082   
2            7.9              0.43         0.21             1.6      0.106   
3            8.5              0.49         0.11             2.3      0.084   
4            6.9              0.40         0.14             2.4      0.085   

   free sulfur dioxide  total sulfur dioxide  density     p  sulphates  \
0                 29.0                  60.0   0.9968  3.39       0.53   
1                 23.0                  71.0   0.9982  3.52       0.65   
2                 10.0                  37.0   0.9966  3.17       0.91   
3                  9.0                  67.0   0.9968  3.17       0.53   
4                 21.0                  40.0   0.9968  3.43       0.63   

   alcohol  quality  
0      9.4      6.0  
1      9.7      5.0  
2      9.5      5.0 

In [8]:
print("Training")
for column in corrected_training.columns:
    print(corrected_training[column].dtype)
print()
print("Validation")
for column in corrected_validation.columns:
    print(corrected_validation[column].dtype)

Training
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64

Validation
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64


In [9]:
print(corrected_training["quality"].value_counts())
print(len(corrected_training.index))

5.0    529
6.0    522
7.0    161
4.0     45
8.0     13
3.0      9
Name: quality, dtype: int64
1279


In [10]:
print(corrected_validation["quality"].value_counts())
print(len(corrected_validation.index))

5.0    66
6.0    65
7.0    22
8.0     4
4.0     2
3.0     1
Name: quality, dtype: int64
160


In [11]:
training_data_columns = training_columns.copy()
training_data_columns.remove("quality")


pt = pd.pivot_table(corrected_training, index=["quality"], values=training_data_columns, aggfunc={column: [np.mean, np.std] for column in training_data_columns})
print(pt)
pt.to_excel("Wine Stats.xlsx")

           alcohol           chlorides           citric acid            \
              mean       std      mean       std        mean       std   
quality                                                                  
3.0      10.127778  0.645712  0.113889  0.064048    0.135556  0.237808   
4.0      10.230000  0.953129  0.092178  0.082362    0.178000  0.208311   
5.0       9.931569  0.739033  0.092060  0.054869    0.231002  0.174456   
6.0      10.681865  1.034266  0.083153  0.036769    0.257375  0.196234   
7.0      11.493271  0.944743  0.077578  0.030562    0.355280  0.189509   
8.0      12.084615  1.246225  0.069769  0.011315    0.394615  0.197974   

          density           fixed acidity            ...         p            \
             mean       std          mean       std  ...      mean       std   
quality                                              ...                       
3.0      0.997249  0.001997      8.133333  1.717556  ...  3.424444  0.124410   
4.0      0.99

In [12]:
training_data = corrected_training.values[:, 0:len(training_columns) - 1]
print(training_data[0:3])
training_labels = corrected_training.values[:, len(training_columns) - 1]
print(training_labels[0:3])

[[ 8.9     0.22    0.48    1.8     0.077  29.     60.      0.9968  3.39
   0.53    9.4   ]
 [ 7.6     0.39    0.31    2.3     0.082  23.     71.      0.9982  3.52
   0.65    9.7   ]
 [ 7.9     0.43    0.21    1.6     0.106  10.     37.      0.9966  3.17
   0.91    9.5   ]]
[6. 5. 5.]


In [13]:
validation_data = corrected_validation.values[:, 0:len(validation_columns) - 1]
print(validation_data[0:3])
validation_labels = corrected_validation.values[:, len(validation_columns) - 1]
print(validation_labels[0:3])

[[7.400e+00 7.000e-01 0.000e+00 1.900e+00 7.600e-02 1.100e+01 3.400e+01
  9.978e-01 3.510e+00 5.600e-01 9.400e+00]
 [7.800e+00 8.800e-01 0.000e+00 2.600e+00 9.800e-02 2.500e+01 6.700e+01
  9.968e-01 3.200e+00 6.800e-01 9.800e+00]
 [7.800e+00 7.600e-01 4.000e-02 2.300e+00 9.200e-02 1.500e+01 5.400e+01
  9.970e-01 3.260e+00 6.500e-01 9.800e+00]]
[5. 5. 5.]


Normalization

In [14]:
scaler = StandardScaler()
normalized_training = scaler.fit_transform(training_data)

In [15]:
rfc = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets 
rfc.fit(normalized_training, training_labels)

#Perform prediction on the test set
y_pred = rfc.predict(scaler.transform(validation_data))

print("Classification Report:", classification_report(validation_labels, y_pred))
print("Confusion Metrics :", confusion_matrix(validation_labels, y_pred))
print('F1 Score: ', f1_score(validation_labels, y_pred, average='micro'))
print('Precision Score:', precision_score(validation_labels, y_pred, average="micro"))
print("Accuracy: ", accuracy_score(validation_labels, y_pred))

Classification Report:               precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         2
         5.0       0.63      0.71      0.67        66
         6.0       0.51      0.48      0.49        65
         7.0       0.26      0.27      0.27        22
         8.0       0.00      0.00      0.00         4

    accuracy                           0.53       160
   macro avg       0.23      0.24      0.24       160
weighted avg       0.50      0.53      0.51       160

Confusion Metrics : [[ 0  0  0  1  0  0]
 [ 0  0  2  0  0  0]
 [ 0  0 47 15  4  0]
 [ 0  0 22 31 11  1]
 [ 0  0  4 12  6  0]
 [ 0  0  0  2  2  0]]
F1 Score:  0.525
Precision Score: 0.525
Accuracy:  0.525


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Resampling

In [16]:
quality_3_resample = resample(corrected_training.loc[corrected_training["quality"] == 3], replace=True, n_samples=50)
quality_4_resample = resample(corrected_training.loc[corrected_training["quality"] == 4], replace=True, n_samples=150)
quality_5_resample = resample(corrected_training.loc[corrected_training["quality"] == 5], replace=False, n_samples=350)
quality_6_resample = resample(corrected_training.loc[corrected_training["quality"] == 6], replace=False, n_samples=350)
quality_7_resample = resample(corrected_training.loc[corrected_training["quality"] == 7], replace=True, n_samples=150)
quality_8_resample = resample(corrected_training.loc[corrected_training["quality"] == 8], replace=True, n_samples=50)

resampled_training = pd.concat([quality_3_resample, quality_4_resample, quality_5_resample, quality_6_resample, quality_7_resample, quality_8_resample])

resampled_training.to_excel("Uneven Resampling.xlsx")

In [None]:
quality_3_resample = resample(corrected_training.loc[corrected_training["quality"] == 3], replace=True, n_samples=200)
quality_4_resample = resample(corrected_training.loc[corrected_training["quality"] == 4], replace=True, n_samples=200)
quality_5_resample = resample(corrected_training.loc[corrected_training["quality"] == 5], replace=False, n_samples=200)
quality_6_resample = resample(corrected_training.loc[corrected_training["quality"] == 6], replace=False, n_samples=200)
quality_7_resample = resample(corrected_training.loc[corrected_training["quality"] == 7], replace=True, n_samples=200)
quality_8_resample = resample(corrected_training.loc[corrected_training["quality"] == 8], replace=True, n_samples=200)

resampled_training = pd.concat([quality_3_resample, quality_4_resample, quality_5_resample, quality_6_resample, quality_7_resample, quality_8_resample])

resampled_training.to_excel("Even Resampling.xlsx")

In [17]:
resampled_data = resampled_training.values[:, 0:len(training_columns) - 1]
print(resampled_data[0:3])
resampled_labels = resampled_training.values[:, len(training_columns) - 1]
print(resampled_labels[0:3])

[[7.1000e+00 8.7500e-01 5.0000e-02 5.7000e+00 8.2000e-02 3.0000e+00
  1.4000e+01 9.9808e-01 3.4000e+00 5.2000e-01 1.0200e+01]
 [6.7000e+00 7.6000e-01 2.0000e-02 1.8000e+00 7.8000e-02 6.0000e+00
  1.2000e+01 9.9600e-01 3.5500e+00 6.3000e-01 9.9500e+00]
 [6.8000e+00 8.1500e-01 0.0000e+00 1.2000e+00 2.6700e-01 1.6000e+01
  2.9000e+01 9.9471e-01 3.3200e+00 5.1000e-01 9.8000e+00]]
[3. 3. 3.]


In [18]:
scaler = StandardScaler()
normalized_training = scaler.fit_transform(resampled_data)

In [19]:
rfc = RandomForestClassifier(n_estimators=200)

#Train the model using the training sets 
rfc.fit(normalized_training, resampled_labels)

#Perform prediction on the test set
y_pred = rfc.predict(scaler.transform(validation_data))

print("Classification Report:", classification_report(validation_labels, y_pred))
print("Confusion Metrics :", confusion_matrix(validation_labels, y_pred))
print('F1 Score: ', f1_score(validation_labels, y_pred, average='micro'))
print('Precision Score:', precision_score(validation_labels, y_pred, average="micro"))
print("Accuracy: ", accuracy_score(validation_labels, y_pred))

Classification Report:               precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         2
         5.0       0.63      0.74      0.68        66
         6.0       0.53      0.48      0.50        65
         7.0       0.27      0.27      0.27        22
         8.0       0.00      0.00      0.00         4

    accuracy                           0.54       160
   macro avg       0.24      0.25      0.24       160
weighted avg       0.51      0.54      0.52       160

Confusion Metrics : [[ 0  0  1  0  0  0]
 [ 0  0  2  0  0  0]
 [ 0  0 49 13  4  0]
 [ 0  0 22 31 10  2]
 [ 0  0  4 12  6  0]
 [ 0  0  0  2  2  0]]
F1 Score:  0.5375
Precision Score: 0.5375
Accuracy:  0.5375


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=1, n_jobs=-1)

param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2, 4, 10, 12, 16], "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

gs = gs.fit(normalized_training, resampled_labels)

In [21]:
print(gs.best_score_)
print(gs.best_params_)

0.7409111438682171
{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 400}


In [22]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(normalized_training, resampled_labels)
print("%.4f" % rf.oob_score_)

y_pred = rfc.predict(scaler.transform(validation_data))

print("Classification Report:", classification_report(validation_labels, y_pred))
print("Confusion Metrics :", confusion_matrix(validation_labels, y_pred))
print('F1 Score: ', f1_score(validation_labels, y_pred, average='micro'))
print('Precision Score:', precision_score(validation_labels, y_pred, average="micro"))
print("Accuracy: ", accuracy_score(validation_labels, y_pred))

0.7855
Classification Report:               precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         2
         5.0       0.63      0.74      0.68        66
         6.0       0.53      0.48      0.50        65
         7.0       0.27      0.27      0.27        22
         8.0       0.00      0.00      0.00         4

    accuracy                           0.54       160
   macro avg       0.24      0.25      0.24       160
weighted avg       0.51      0.54      0.52       160

Confusion Metrics : [[ 0  0  1  0  0  0]
 [ 0  0  2  0  0  0]
 [ 0  0 49 13  4  0]
 [ 0  0 22 31 10  2]
 [ 0  0  4 12  6  0]
 [ 0  0  0  2  2  0]]
F1 Score:  0.5375
Precision Score: 0.5375
Accuracy:  0.5375


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

cv_score = cross_val_score(rf, normalized_training, resampled_labels, cv=4, scoring=scorer)

print(f'10 Fold Cross Validation F1 Score = {round(cv_score.mean(), 4)} with std = {round(cv_score.std(), 4)}')

10 Fold Cross Validation F1 Score = 0.8188 with std = 0.0151


In [24]:
y_pred = rf.predict(scaler.transform(validation_data))

print(confusion_matrix(validation_labels, y_pred))
print(classification_report(validation_labels, y_pred))
print(accuracy_score(validation_labels, y_pred))

[[ 0  0  0  1  0  0]
 [ 0  0  2  0  0  0]
 [ 0  0 48 14  4  0]
 [ 0  1 23 29  9  3]
 [ 0  0  4  9  9  0]
 [ 0  0  0  2  2  0]]
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         2
         5.0       0.62      0.73      0.67        66
         6.0       0.53      0.45      0.48        65
         7.0       0.38      0.41      0.39        22
         8.0       0.00      0.00      0.00         4

    accuracy                           0.54       160
   macro avg       0.25      0.26      0.26       160
weighted avg       0.52      0.54      0.53       160

0.5375


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
