# Wine Dataset Machine Learning Project

In [ ]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import sqlalchemy
from sqlalchemy import create_engine
import dagshub
import mlflow
import mlflow.sklearn
from sklearn.base import BaseEstimator, TransformerMixin

## Data Preprocessing

In [ ]:
# Read the cleaned wine CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='cleaned_wine_data.csv')
df.head(n=5)

In [ ]:
# Handle missing values if any
# df.isnull().sum()

## Exploratory Data Analysis (EDA)

In [ ]:
# Generate a profile report using yprofile
# NOTE(review): `yprofile` is not imported in the import cell at the top of
# this notebook, so this cell raises NameError as written. The
# ProfileReport / to_notebook_iframe calls look like the ydata-profiling
# (formerly pandas-profiling) API -- confirm which package is intended and
# add the matching import.
profile = yprofile.ProfileReport(df, title='Wine Dataset Profile Report')
# Show the generated report in the notebook output
profile.to_notebook_iframe()

In [ ]:
# Plot the correlation matrix of the numeric columns.
# numeric_only=True restricts the computation to numeric columns; without it,
# pandas >= 2.0 raises when the frame holds a non-numeric column (for example
# a string-valued label) instead of silently dropping it.
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

### Observations

- **Features**: List the features and their types (numerical, categorical).
- **Distributions**: Note the distribution of each feature (normal, skewed, etc.).
- **Capped Values**: Identify any features with capped values.
- **Missing Values**: Check for any missing values and their proportion.

### Data Cleanup Tasks

- Handle missing values (if any).
- Normalize or standardize numerical features.
- Encode categorical features (if any).
- Remove or cap outliers (if necessary).

In [ ]:
# Separate the target column 'Class' from the remaining feature columns
y = df['Class']
X = df.drop(columns='Class')

# Standardize every feature column; X_scaled is the transformed array.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed further down. If X_scaled feeds a model,
# test-set statistics leak into training -- confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Database Creation

In [ ]:
# Persist the DataFrame as a single 'wine' table in the local SQLite file
# wine.db, replacing the table if it already exists.
# NOTE(review): the data is written as one flat table; no decomposition into
# separate normalized (3NF) tables happens in this cell.
engine = create_engine('sqlite:///wine.db')
df.to_sql(name='wine', con=engine, if_exists='replace', index=False)

In [ ]:
# Load every row of the 'wine' table back into a DataFrame and preview it
df_extracted = pd.read_sql(sql='SELECT * FROM wine', con=engine)
df_extracted.head(n=5)

## Machine Learning Experiments

In [ ]:
# Perform train/test split with stratification.
# Split the unscaled DataFrame X rather than the pre-scaled array X_scaled:
#   * the pipelines below select columns by name (X.columns, X['alcohol'], ...),
#     which only works when they receive a DataFrame, not a NumPy array;
#   * scaling is done inside each pipeline, so it is fitted on training folds
#     only and no test-set statistics leak into training;
#   * np.log1p in the preprocessing step would see standardized values, which
#     can fall below -1 and produce NaN.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Experiment #1: Preprocessing Pipeline and Logistic Regression

In [ ]:
# Preprocessing: three transformers, each applied to every feature column
# (selected by the column names of X, so the pipeline must be given a
# DataFrame carrying those columns).
# NOTE(review): np.log1p assumes all feature values are greater than -1 --
# confirm against the data.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), X.columns),
    ('minmax', MinMaxScaler(), X.columns),
    ('log', FunctionTransformer(np.log1p), X.columns),
])

# Full model: preprocessing followed by a default Logistic Regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [ ]:
# 10-fold cross-validation of the pipeline on the training split,
# scored with macro-averaged F1
cv_results = cross_val_score(
    estimator=pipeline, X=X_train, y=y_train, scoring='f1_macro', cv=10
)
print(f'Cross-validation mean F1-score: {cv_results.mean()}')
print(f'Cross-validation std F1-score: {cv_results.std()}')

In [ ]:
# Hyperparameter tuning: search the classifier's regularization strength C
# and solver with 10-fold cross-validation, scored by macro F1
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'saga'],
}

grid_search = GridSearchCV(
    estimator=pipeline, param_grid=param_grid, scoring='f1_macro', cv=10
)
grid_search.fit(X_train, y_train)
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation F1-score: {grid_search.best_score_}')

In [ ]:
# Score the best estimator found by the grid search on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f'Test set F1-score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

In [ ]:
# Record Experiment #1 in MLflow.
# NOTE(review): the tracking URI contains the literal 'username/repo_name';
# presumably a placeholder to be replaced with the real DagsHub repository.
mlflow.set_tracking_uri('https://dagshub.com/username/repo_name.mlflow')
mlflow.set_experiment('Wine Dataset Experiment')

with mlflow.start_run():
    mlflow.log_param('classifier', 'Logistic Regression')
    mlflow.log_param('best_params', grid_search.best_params_)
    mlflow.log_metric('f1_score', f1)
    # Confusion-matrix cells logged under TP/TN/FP/FN.
    # NOTE(review): reading only cells [0..1, 0..1] assumes a binary problem
    # with the positive class at index 1 -- confirm 'Class' has two labels.
    for metric_name, row, col in (('TP', 1, 1), ('TN', 0, 0), ('FP', 0, 1), ('FN', 1, 0)):
        mlflow.log_metric(metric_name, conf_matrix[row, col])
    mlflow.sklearn.log_model(best_model, 'model')

## Experiment #2: Preprocessing Pipeline and Multiple Classifiers

In [ ]:
# Define the classifiers to compare.
# The stochastic models get the same seed as the train/test split so that
# the metrics logged to MLflow can be reproduced on a re-run.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Ridge Classifier': RidgeClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42)
}

# For each classifier: cross-validate on the training split, refit on the
# whole training split, score on the test split, and log one MLflow run.
for name, classifier in classifiers.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)])

    # Perform cross-validation (10 folds, macro-averaged F1)
    cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1_macro')
    print(f'{name} Cross-validation mean F1-score: {cv_results.mean()}')
    print(f'{name} Cross-validation std F1-score: {cv_results.std()}')

    # Fit the model on the full training split and evaluate on the test split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f'{name} Test set F1-score: {f1}')
    print(f'{name} Confusion Matrix:\n{conf_matrix}')

    # Log results in MLFlow on DagsHub; naming the run after the classifier
    # keeps the four runs distinguishable in the tracking UI.
    with mlflow.start_run(run_name=name):
        mlflow.log_param('classifier', name)
        mlflow.log_metric('f1_score', f1)
        # NOTE(review): these four cells assume a 2x2 (binary) confusion
        # matrix with the positive class at index 1 -- confirm.
        mlflow.log_metric('TP', conf_matrix[1, 1])
        mlflow.log_metric('TN', conf_matrix[0, 0])
        mlflow.log_metric('FP', conf_matrix[0, 1])
        mlflow.log_metric('FN', conf_matrix[1, 0])
        mlflow.sklearn.log_model(pipeline, 'model')

## Experiment #3: Feature Engineering and Attribute Combination

In [ ]:
# Custom transformer that appends two derived columns to its input
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds combined-attribute features.

    The input must support ``.copy()`` and lookup by column name, and must
    contain the columns 'alcohol', 'density', 'fixed acidity' and
    'volatile acidity'.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with two extra columns.

        'alcohol_density' is alcohol divided by density, and 'total_acidity'
        is the sum of fixed and volatile acidity. ``X`` itself is not
        modified.
        """
        engineered = X.copy()
        alcohol_per_density = engineered['alcohol'] / engineered['density']
        acidity_sum = engineered['fixed acidity'] + engineered['volatile acidity']
        engineered['alcohol_density'] = alcohol_per_density
        engineered['total_acidity'] = acidity_sum
        return engineered

# Preprocessing for Experiment #3: add the engineered columns, then
# standardize all columns
preprocessor_with_fe = Pipeline([
    ('feature_engineering', FeatureEngineering()),
    ('scaler', StandardScaler()),
])

# Full model: the preprocessing above followed by a default Logistic Regression
pipeline_with_fe = Pipeline([
    ('preprocessor', preprocessor_with_fe),
    ('classifier', LogisticRegression()),
])

In [ ]:
# 10-fold cross-validation of the feature-engineering pipeline on the
# training split, scored with macro-averaged F1
cv_results_fe = cross_val_score(
    estimator=pipeline_with_fe, X=X_train, y=y_train, scoring='f1_macro', cv=10
)
print(f'Feature Engineering Cross-validation mean F1-score: {cv_results_fe.mean()}')
print(f'Feature Engineering Cross-validation std F1-score: {cv_results_fe.std()}')

In [ ]:
# Train on the full training split, then score predictions on the test split
y_pred_fe = pipeline_with_fe.fit(X_train, y_train).predict(X_test)
conf_matrix_fe = confusion_matrix(y_true=y_test, y_pred=y_pred_fe)
f1_fe = f1_score(y_true=y_test, y_pred=y_pred_fe, average='macro')
print(f'Feature Engineering Test set F1-score: {f1_fe}')
print(f'Feature Engineering Confusion Matrix:\n{conf_matrix_fe}')

In [ ]:
# Record the feature-engineering experiment as one MLflow run
with mlflow.start_run():
    mlflow.log_param('experiment', 'Feature Engineering')
    mlflow.log_metric('f1_score', f1_fe)
    # Confusion-matrix cells logged under TP/TN/FP/FN.
    # NOTE(review): assumes a 2x2 (binary) confusion matrix with the positive
    # class at index 1 -- confirm 'Class' has two labels.
    for metric_name, row, col in (('TP', 1, 1), ('TN', 0, 0), ('FP', 0, 1), ('FN', 1, 0)):
        mlflow.log_metric(metric_name, conf_matrix_fe[row, col])
    mlflow.sklearn.log_model(pipeline_with_fe, 'model')