In [1]:
import pandas as pd

  # Load the datasets
red_wine_data_set = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";")
white_wine_data_set = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep=";")




# 2. Adding the "quality_label" and "wine_type" Columns:

   # Add the "quality_label" column
red_wine_data_set['quality_label'] = red_wine_data_set['quality'].apply(lambda value: 'low' if value <= 5 else ('medium' if value <= 7 else 'high'))
white_wine_data_set['quality_label'] = white_wine_data_set['quality'].apply(lambda value: 'low' if value <= 5 else ('medium' if value <= 7 else 'high'))

   # Add the "wine_type" column
red_wine_data_set['wine_type'] = 'red'
white_wine_data_set['wine_type'] = 'white'




# 3. Combining the Datasets:
combined_wine = pd.concat([white_wine_data_set, red_wine_data_set], axis=0) #  axis=0 indicates that the dataframes should be concatenated vertically




# 4. Encoding Categorical Features:
   # Since machine learning models like Logistic Regression expect numerical input, 
   # we encode the categorical features 'quality_label' and 'wine_type' using one-hot encoding.
   # This converts them into binary columns.


 # Encode "quality_label" using .cat.codes
 # low: 1 / medium: 2 / high: 0
combined_wine['quality_label_encoded'] = combined_wine['quality_label'].astype('category').cat.codes

 # Encode "wine_type" using .cat.codes
 # White: 1 / Red: 0
combined_wine['wine_type_encoded'] = combined_wine['wine_type'].astype('category').cat.codes


 # Identify categorical columns
categorical_columns_to_drop = ['wine_type', 'quality_label']


 # Drop categorical columns
combined_wine = combined_wine.drop(categorical_columns_to_drop, axis=1)





# 5. Splitting the Data for Machine Learning:
  # We split the data into features (X) and the target variable (y), which is 'wine_type_white' indicating whether the wine is white (1) or not (0).
  # The dataset is divided into training (80%) and testing (20%) sets.
from sklearn.model_selection import train_test_split

Xq = combined_wine.drop('quality_label_encoded', axis=1)  # Features
yq = combined_wine['quality_label_encoded']  # Target variable

Xq_train, Xq_test, yq_train, yq_test = train_test_split(Xq, yq, test_size=0.2, random_state=42)

'''
 # To exclude many columns when setting your training features
  # List of columns to exclude
columns_to_exclude = ['volatile acidity', 'citric acid', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'fixed acidity', 'residual sugar']

  # Select columns for training by excluding the ones in the list
X = combined_wine.drop(columns=columns_to_exclude)

  # Your target variable
y = combined_wine['wine_type_encoded']

'''




# 6. Applying Logistic Regression:
  # We import the Logistic Regression model from scikit-learn and create an instance of the model.
  # Then, we fit the model to the training data.
  # Fitting the model means training the model on training data using the .fit method provided in sklearn.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(Xq_train, yq_train)


# 7. Evaluating the Model:
  # To assess the model's performance, we use it to make predictions on the test data.
  # We calculate accuracy, confusion matrix, and a classification report to provide more detailed metrics.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

yq_pred = model.predict(Xq_test)

accuracy = accuracy_score(yq_test, yq_pred)
confusion = confusion_matrix(yq_test, yq_pred)
report = classification_report(yq_test, yq_pred)


print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", report)


# classification_report and confusion_matrix are two commonly used tools in machine learning 
# for evaluating the performance of classification models, 
# especially in tasks where you are predicting categorical outcomes (i.e., classes or labels). 
# They provide valuable insights into how well your model is performing and where it might be making errors. 
# These tools are typically used in supervised learning tasks, such as binary or multiclass classification.

Accuracy: 0.9715384615384616
Confusion Matrix:
 [[  9   0  23]
 [  0 464   4]
 [  6   4 790]]
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.28      0.38        32
           1       0.99      0.99      0.99       468
           2       0.97      0.99      0.98       800

    accuracy                           0.97      1300
   macro avg       0.85      0.75      0.78      1300
weighted avg       0.97      0.97      0.97      1300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [2]:
# to evaluate machine learning models through cross-validation
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Assuming you've loaded and preprocessed your dataset, X and y should be defined here.

# Create a logistic regression model
model = LogisticRegression()

# Perform 5-fold cross-validation with accuracy as the scoring metric
scores = cross_val_score(model, Xq, yq, cv=5, scoring='accuracy')

# View the results
print("Accuracy Scores for Each Fold:", scores)
print("Mean Accuracy:", np.mean(scores))
print("Standard Deviation of Accuracy:", np.std(scores))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy Scores for Each Fold: [0.95923077 0.94307692 0.93148576 0.90839107 0.926097  ]
Mean Accuracy: 0.9336563036655414
Standard Deviation of Accuracy: 0.016988159185775238


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
