# Models

In [32]:
# Predictor columns fed to every model in this section.
X = wine[[
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "sulphates",
    "alcohol",
]]
# Target column. NOTE(review): the F1 scoring and 0/1 confusion-matrix labels used
# later assume this has already been binarised — confirm in the preprocessing cells.
y = wine["quality"]

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


## Grid search

In [33]:
def hyperp_search(classifier, parameters, cv):
    """Grid-search ``classifier`` on the training split and report its performance.

    Runs ``GridSearchCV`` (F1 scoring, all cores) over ``parameters``, prints the
    best cross-validated F1 together with the winning parameters and the search
    time, prints train/test F1, accuracy, precision and recall for the best
    estimator, and shows a ROC curve and a confusion matrix for the test split.

    The function reads the notebook-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test``; they must be defined before it is called.
    NOTE(review): the binary metrics and the 0/1 axis labels assume a binary
    target encoded as 0/1 — confirm against the preprocessing cells.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn estimator handed to ``GridSearchCV``.
    parameters : dict
        Hyper-parameter grid to search.
    cv : int or cross-validation splitter
        Forwarded unchanged to ``GridSearchCV(cv=...)``.

    Returns
    -------
    estimator
        ``gs.best_estimator_`` — the estimator with the best mean CV F1.
    """
    gs = GridSearchCV(classifier,
                      parameters,
                      cv=cv,
                      scoring = 'f1',
                      verbose=0,
                      n_jobs=-1
                      )
    tic = time.perf_counter()
    gs = gs.fit(X_train, y_train)
    toc = time.perf_counter()

    print("f1_train: %f using %s in %.3f seconds" % (gs.best_score_,
                                                     gs.best_params_,
                                                     toc - tic))

    best_model = gs.best_estimator_
    y_pred = best_model.predict(X_test)
    y_pred_train = best_model.predict(X_train)

    # evaluate predictions
    print("           train    test ")
    print("-------------------------")
    print("f1         %.3f    %.3f" % (f1_score(y_train, y_pred_train),
                                       f1_score(y_test, y_pred)))
    print("accuracy   %.3f    %.3f" % (accuracy_score(y_train, y_pred_train),
                                       accuracy_score(y_test, y_pred)))
    print("precision  %.3f    %.3f" % (precision_score(y_train, y_pred_train),
                                       precision_score(y_test, y_pred)))
    print("recall     %.3f    %.3f" % (recall_score(y_train, y_pred_train),
                                       recall_score(y_test, y_pred)))
    print("")

    # Calculate the ROC curve.
    # roc_curve sweeps a threshold over continuous scores; feeding it hard 0/1
    # predictions collapses the curve to a single operating point and misreports
    # the AUC. Prefer the positive-class probability, then the decision
    # function, and only fall back to the hard labels if neither exists.
    if hasattr(best_model, "predict_proba"):
        y_score = best_model.predict_proba(X_test)[:, 1]
    elif hasattr(best_model, "decision_function"):
        y_score = best_model.decision_function(X_test)
    else:
        y_score = y_pred

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)  # area under the ROC curve

    # ROC curve plot
    fig = px.area(
        x=fpr, y=tpr,
        title=f'ROC Curve (AUC={roc_auc:.4f})',
        labels=dict(x='False Positive Rate', y='True Positive Rate'),
        width=700, height=500
    )
    # Diagonal reference line: the ROC of a random classifier.
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )

    fig.show()

    # Confusion matrix plot (computed from the hard test predictions)
    cm = confusion_matrix(y_test, y_pred)

    fig = px.imshow(
        cm,
        labels=dict(x="Predicted Label", y="True Label", color="Count"),
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
        text_auto=True,
        color_continuous_scale="PuBu",
        title="Confusion Matrix",
        width=700, height=500
    )

    fig.show()

    return best_model

## Decision tree

In [34]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

In [35]:
# Seed the tree: scikit-learn permutes the candidate features at every split, so
# an unseeded tree can resolve ties differently from run to run and the grid
# search result / printed tree would not be reproducible. 42 matches the seed
# used for the train/test split.
classifier = DecisionTreeClassifier(random_state=42)
parameters = {'criterion': ['entropy','gini'],
              'max_depth': [4,5,6,8,10,12],
              'min_samples_split': [5,10,20],
              'min_samples_leaf': [5,10,20]}
cv = 5
best_tree = hyperp_search(classifier,parameters,cv)

# Finding the best feature
# Text dump of the top levels of the tuned tree; branches below max_depth=2 are
# reported as truncated. The features splitting closest to the root are the ones
# the tree relies on first.
importance_tree = tree.export_text(best_tree,
                                   feature_names=X_test.columns.tolist(),
                                   max_depth=2)

print("-------------------------------")
print("The most valuable features are: ")
print("-------------------------------")
print(importance_tree)


f1_train: 0.753888 using {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 20} in 1.927 seconds
           train    test 
-------------------------
f1         0.784    0.757
accuracy   0.786    0.732
precision  0.830    0.798
recall     0.742    0.720



-------------------------------
The most valuable features are: 
-------------------------------
|--- alcohol <= 0.31
|   |--- fixed acidity <= 0.50
|   |   |--- sulphates <= 0.26
|   |   |   |--- truncated branch of depth 3
|   |   |--- sulphates >  0.26
|   |   |   |--- truncated branch of depth 3
|   |--- fixed acidity >  0.50
|   |   |--- sulphates <= 0.28
|   |   |   |--- class: 0
|   |   |--- sulphates >  0.28
|   |   |   |--- truncated branch of depth 3
|--- alcohol >  0.31
|   |--- volatile acidity <= 0.51
|   |   |--- alcohol <= 0.54
|   |   |   |--- truncated branch of depth 3
|   |   |--- alcohol >  0.54
|   |   |   |--- truncated branch of depth 3
|   |--- volatile acidity >  0.51
|   |   |--- volatile acidity <= 0.61
|   |   |   |--- class: 0
|   |   |--- volatile acidity >  0.61
|   |   |   |--- class: 0



The decision tree identified sulphates, chlorides, alcohol, volatile acidity and total sulfur dioxide as the most valuable features.

## Random forest

In [36]:
# Seed the forest: bootstrap sampling and per-split feature sub-sampling are
# random, so without a seed the selected hyper-parameters, the scores and the
# feature importances change on every run. 42 matches the train/test split seed.
classifier = RandomForestClassifier(random_state=42)
parameters = {'criterion': ['entropy'],
              'n_estimators' : [50,100,500,1000],
              'max_depth': range(2,20,2),
              'min_samples_leaf':[100,250,500]}
cv = 5
best_rf = hyperp_search(classifier,parameters,cv)

# Finding the best feature
# One row per input column with the forest's impurity-based importance,
# rounded to 3 decimals and ordered from most to least important.
feature_importances = best_rf.feature_importances_
importance_rf = pd.DataFrame({'Feature': X_train.columns,
                              'Weight': feature_importances}).round(3)
importance_rf = importance_rf.sort_values(by='Weight', ascending=False)

# Plotting the features importance

fig = px.bar(importance_rf, x='Feature', y='Weight', color='Feature',
             title='Feature Importances for RandomForest Classifier',
             color_discrete_sequence=px.colors.sequential.Blues_r,
             text='Weight')

# update_layout returns the figure, so leaving it as the last expression renders the plot.
fig.update_layout(xaxis=dict(tickangle=90),
                  legend=dict(title='Feature'))

f1_train: 0.772273 using {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 100, 'n_estimators': 100} in 24.008 seconds
           train    test 
-------------------------
f1         0.774    0.776
accuracy   0.767    0.740
precision  0.785    0.776
recall     0.763    0.776



Based on MDI (mean decrease in impurity), the most valuable features are alcohol, sulphates, total sulfur dioxide and volatile acidity.

## Logistic regression

In [37]:
# Tune the inverse regularisation strength (C) and the solver iteration cap.
classifier = LogisticRegression()
parameters = {
    "C": [1e-4, 1e-3, 1e-2, 1, 10, 1000],
    "max_iter": [500, 1000, 5000],
}
cv = 5
best_log = hyperp_search(classifier, parameters, cv)

# Finding the best features
# One row per input column with its fitted coefficient (rounded to 2 decimals),
# ordered from the most negative to the most positive weight.
importance_log = pd.DataFrame(
    {
        "feature": X_train.columns,
        "weight": best_log.coef_[0].round(2),
    }
).sort_values(by=["weight"])

# Plotting the features
axis_titles = {'feature': 'Feature', 'weight': 'Weight'}
fig = px.bar(
    importance_log,
    x='feature',
    y='weight',
    color='feature',
    text='weight',
    labels=axis_titles,
    title='Coefficient Analysis for Logistic Regression',
    color_discrete_sequence=px.colors.sequential.Blues_r,
)

# The x axis is re-ordered by bar height (descending) regardless of row order above.
# update_layout returns the figure, so leaving it last renders the plot.
fig.update_layout(
    xaxis=dict(tickangle=90, categoryorder='total descending'),
    legend=dict(title='Feature'),
)

f1_train: 0.771506 using {'C': 1000, 'max_iter': 500} in 0.226 seconds
           train    test 
-------------------------
f1         0.770    0.764
accuracy   0.761    0.729
precision  0.775    0.771
recall     0.765    0.757



The logistic regression model determined that the most valuable features are alcohol, sulphates, volatile acidity, total sulfur dioxide and fixed acidity.

## Compare the different models

In [38]:
# Pair each display name with its tuned estimator, in the order they are reported.
named_estimators = [
    ("Logistic Regression", best_log),
    ('Decision Tree', best_tree),
    ('Random Forest', best_rf),
]
models = [label for label, _ in named_estimators]

# Predict the test split once per model and derive both metrics from it.
test_predictions = [estimator.predict(X_test) for _, estimator in named_estimators]
f1_scores = [f1_score(y_test, predicted) for predicted in test_predictions]
accuracy_scores = [accuracy_score(y_test, predicted) for predicted in test_predictions]

# Summary frame, best F1 first.
table = pd.DataFrame(
    {
        "Model": models,
        "F1 Score": f1_scores,
        "Accuracy": accuracy_scores,
    }
).sort_values(by="F1 Score", ascending=False)

# Render the summary as a plotly table with scores rounded to 3 decimals.
header_cells = dict(values=['Model', 'F1 Score', 'Accuracy'])
body_cells = dict(
    values=[
        table["Model"],
        table["F1 Score"].round(3),
        table["Accuracy"].round(3),
    ]
)
fig = go.Figure(data=[go.Table(header=header_cells, cells=body_cells)])

# Centred title; update_layout returns the figure, so leaving it last renders the table.
fig.update_layout(
    title=dict(
        text='Comparing the models',
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
        font=dict(size=15),
    ),
    width=1000,
    height=400,
)