Data Exploration

In [3]:
pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
wine_quality = fetch_ucirepo(id=186) 
  
# data (as pandas dataframes) 
X = wine_quality.data.features 
y = wine_quality.data.targets 
  
# metadata 
print(wine_quality.metadata) 
  
# variable information 
print(wine_quality.variables) 


{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Wed Nov 15 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'ID': 252, 'type': 'NATIVE', 'title': 'Modeling wine preferences

Splitting Data

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(url, sep=';')


# Reclassify wine quality into three categories
def classify_quality(quality):
    if quality <= 4:
        return 'Not Good'
    elif quality <= 6:
        return 'Average'
    else:
        return 'Good'

# Apply the classification to the target variable
data['quality_category'] = data['quality'].apply(classify_quality)

# Define features and target variable
X = data.drop(columns=['quality', 'quality_category'])
y = data['quality_category']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Apply Min-Max Scaler and SMOTE

In [20]:
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# Apply Min-Max Scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE (only on the training set)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Logistic Regression Model
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_resampled, y_resampled)
y_pred = logistic_model.predict(X_test_scaled)

# Evaluation metrics
print("Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred))
print("Logistic Regression - Classification Report:\n", classification_report(y_test, y_pred))

Logistic Regression - Accuracy: 0.58125
Logistic Regression - Classification Report:
               precision    recall  f1-score   support

     Average       0.95      0.53      0.68       395
        Good       0.39      0.84      0.53        67
    Not Good       0.13      0.83      0.23        18

    accuracy                           0.58       480
   macro avg       0.49      0.73      0.48       480
weighted avg       0.84      0.58      0.64       480



KNN

In [23]:
# K-Nearest Neighbors Model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_resampled, y_resampled)
y_pred = knn_model.predict(X_test_scaled)

# Evaluation metrics
print("K-Nearest Neighbors - Accuracy:", accuracy_score(y_test, y_pred))
print("K-Nearest Neighbors - Classification Report:\n", classification_report(y_test, y_pred))

K-Nearest Neighbors - Accuracy: 0.6520833333333333
K-Nearest Neighbors - Classification Report:
               precision    recall  f1-score   support

     Average       0.93      0.64      0.76       395
        Good       0.40      0.78      0.53        67
    Not Good       0.11      0.50      0.19        18

    accuracy                           0.65       480
   macro avg       0.48      0.64      0.49       480
weighted avg       0.83      0.65      0.70       480



Random Forest

In [24]:
# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
rf_model.fit(X_resampled, y_resampled)
y_pred = rf_model.predict(X_test_scaled)

# Evaluation metrics
print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred))
print("Random Forest - Classification Report:\n", classification_report(y_test, y_pred))

Random Forest - Accuracy: 0.8145833333333333
Random Forest - Classification Report:
               precision    recall  f1-score   support

     Average       0.92      0.85      0.88       395
        Good       0.58      0.75      0.65        67
    Not Good       0.15      0.22      0.18        18

    accuracy                           0.81       480
   macro avg       0.55      0.61      0.57       480
weighted avg       0.84      0.81      0.82       480

