# Wine Dataset Machine Learning Project

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import sqlalchemy
from sqlalchemy import create_engine
# Install necessary packages
%pip install dagshub mlflow

import mlflow
import mlflow.sklearn
from sklearn.base import BaseEstimator, TransformerMixin

# Install xgboost
%pip install xgboost

# Import XGBClassifier after installation
from xgboost import XGBClassifier

## Data Preprocessing

In [None]:
# Read the cleaned wine data from disk and preview its first rows
csv_path = 'cleaned_wine_data.csv'
df = pd.read_csv(csv_path)
df.head()

In [6]:
# Count missing values per column so that any gaps are visible before modelling
df.isnull().sum()

## Exploratory Data Analysis (EDA)

In [None]:
# Install ydata-profiling
%pip install ydata-profiling

# Import ProfileReport from ydata_profiling
from ydata_profiling import ProfileReport

# Generate a profile report using ydata_profiling
profile = ProfileReport(df, title='Wine Dataset Profile Report')
profile.to_notebook_iframe()

In [None]:
# Annotated heatmap of the pairwise correlations between all dataframe columns
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

### Observations

- **Features**: List the features and their types (numerical, categorical).
- **Distributions**: Note the distribution of each feature (normal, skewed, etc.).
- **Capped Values**: Identify any features with capped values.
- **Missing Values**: Check for any missing values and their proportion.

### Data Cleanup Tasks

- Handle missing values (if any).
- Normalize or standardize numerical features.
- Encode categorical features (if any).
- Remove or cap outliers (if necessary).

In [10]:
# Separate the 'Class' target from the remaining feature columns
y = df['Class']
X = df.drop(columns='Class')

# Standardize every feature column; the scaler is fitted on all rows of X
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

## Database Creation

In [None]:
# Write the dataframe to a local SQLite file as a single 'wine' table,
# replacing the table if it already exists
db_url = 'sqlite:///wine.db'
engine = create_engine(db_url)
df.to_sql(name='wine', con=engine, if_exists='replace', index=False)

In [None]:
# Read every row of the 'wine' table back into a dataframe and preview it
wine_query = 'SELECT * FROM wine'
df_extracted = pd.read_sql(wine_query, con=engine)
df_extracted.head()

## Machine Learning Experiments

In [55]:
# Stratified 80/20 split on the *unscaled* features, so that the scaler below is
# fitted on the training rows only. Fitting it on every row before splitting
# lets test-set statistics leak into the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the standardization from the training split and apply it to both splits
split_scaler = StandardScaler().fit(X_train_raw)

# Rebuild dataframes with the original column names; the row index is carried
# over so that features and targets stay aligned by label as well as by position
X_train = pd.DataFrame(split_scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(split_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

# y_train / y_test are left as the 1-D 'Class' Series returned by the split:
# scikit-learn estimators expect a 1-D target and warn on (n, 1) column vectors

## Experiment #1: Preprocessing Pipeline and Logistic Regression

In [ ]:
# SimpleImputer is not among the notebook's top-level imports, so bring it into
# scope here; without it this cell fails with a NameError
from sklearn.impute import SimpleImputer

# Numeric preprocessing applied to every feature column: mean imputation,
# standardization, min-max scaling, then log1p
numeric_steps = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                ('scaler', StandardScaler()),
                                ('minmax', MinMaxScaler()),
                                ('log', FunctionTransformer(np.log1p))])

# All columns of df except the 'Class' target
feature_columns = df.drop('Class', axis=1).columns

# Define the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, feature_columns)
    ]
)

# Define the pipeline with Logistic Regression
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression())])

In [ ]:
# 10-fold cross-validated macro-F1 of the untuned pipeline on the training split
cv_results = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=10,
)
print(f'Cross-validation mean F1-score: {cv_results.mean()}')
print(f'Cross-validation std F1-score: {cv_results.std()}')

In [ ]:
# Candidate regularization strengths and solvers for the 'classifier' step
param_grid = {
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__C': [0.1, 1, 10, 100],
}

# Exhaustive search over the grid, scored by 10-fold macro-F1 on the training split
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=10,
)
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation F1-score: {grid_search.best_score_}')

In [ ]:
# Score the best pipeline found by the grid search on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Confusion matrix and macro-averaged F1 of the test predictions
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')

print(f'Test set F1-score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

In [ ]:
# Point MLflow at the DagsHub tracking server and select the experiment
mlflow.set_tracking_uri('https://dagshub.com/username/repo_name.mlflow')
mlflow.set_experiment('Wine Dataset Experiment')

# Record the tuned logistic-regression result in a single run.
# NOTE(review): the TP/TN/FP/FN entries read the four cells of a 2x2 confusion
# matrix; confirm that 'Class' is binary.
with mlflow.start_run():
    mlflow.log_param('classifier', 'Logistic Regression')
    mlflow.log_param('best_params', grid_search.best_params_)
    mlflow.log_metric('f1_score', f1)
    for metric_name, cell in (('TP', (1, 1)), ('TN', (0, 0)), ('FP', (0, 1)), ('FN', (1, 0))):
        mlflow.log_metric(metric_name, conf_matrix[cell])
    mlflow.sklearn.log_model(best_model, 'model')

## Experiment #2: Preprocessing Pipeline and Multiple Classifiers

In [None]:
# Define the classifiers. The randomized models are seeded (same seed as the
# train/test split) so that their scores are repeatable across runs.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Ridge Classifier': RidgeClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42)
}

# Test-set macro-F1 per classifier; kept because `f1` is overwritten on every
# iteration and would otherwise only hold the last classifier's score
classifier_f1_scores = {}

# Iterate over classifiers and log results
for name, classifier in classifiers.items():
    # Same preprocessing for every model; only the final estimator changes
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)])

    # Perform cross-validation (10-fold, macro-F1) on the training split
    cv_results = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1_macro')
    print(f'{name} Cross-validation mean F1-score: {cv_results.mean()}')
    print(f'{name} Cross-validation std F1-score: {cv_results.std()}')

    # Fit on the full training split and evaluate on the held-out test split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    classifier_f1_scores[name] = f1
    print(f'{name} Test set F1-score: {f1}')
    print(f'{name} Confusion Matrix:\n{conf_matrix}')

    # Log results in MLFlow on DagsHub (one run per classifier).
    # NOTE(review): TP/TN/FP/FN read the cells of a 2x2 confusion matrix;
    # confirm that 'Class' is binary.
    with mlflow.start_run():
        mlflow.log_param('classifier', name)
        mlflow.log_metric('f1_score', f1)
        mlflow.log_metric('TP', conf_matrix[1, 1])
        mlflow.log_metric('TN', conf_matrix[0, 0])
        mlflow.log_metric('FP', conf_matrix[0, 1])
        mlflow.log_metric('FN', conf_matrix[1, 0])
        mlflow.sklearn.log_model(pipeline, 'model')

## Experiment #3: Feature Engineering and Attribute Combination

In [25]:
# Custom transformer that appends two derived columns to its input
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Add ``alcohol_density`` and ``total_acidity`` columns to the input.

    The transformer takes no parameters and learns nothing during ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are not inspected."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the two derived columns added.

        ``X`` must provide ``.copy()`` and column access for 'alcohol',
        'density', 'fixed acidity' and 'volatile acidity'.
        """
        # Derived values are computed from the incoming columns
        alcohol_per_density = X['alcohol'] / X['density']
        acidity_sum = X['fixed acidity'] + X['volatile acidity']

        # Work on a copy so the caller's object is left untouched
        engineered = X.copy()
        engineered['alcohol_density'] = alcohol_per_density
        engineered['total_acidity'] = acidity_sum
        return engineered

# Preprocessing: add the engineered columns, then standardize every column.
# NOTE(review): this pipeline is later fitted on X_train, which has already been
# standardized, so the engineered ratio is taken between standardized values —
# confirm this is intended.
fe_steps = [
    ('feature_engineering', FeatureEngineering()),
    ('scaler', StandardScaler()),
]
preprocessor_with_fe = Pipeline(steps=fe_steps)

# Full model: the preprocessing above followed by Logistic Regression
model_steps = [
    ('preprocessor', preprocessor_with_fe),
    ('classifier', LogisticRegression()),
]
pipeline_with_fe = Pipeline(steps=model_steps)

In [None]:
# 10-fold cross-validated macro-F1 of the feature-engineering pipeline
cv_results_fe = cross_val_score(
    estimator=pipeline_with_fe,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=10,
)
print(f'Feature Engineering Cross-validation mean F1-score: {cv_results_fe.mean()}')
print(f'Feature Engineering Cross-validation std F1-score: {cv_results_fe.std()}')

In [None]:
# Train on the whole training split and predict the held-out test split
y_pred_fe = pipeline_with_fe.fit(X_train, y_train).predict(X_test)

# Confusion matrix and macro-averaged F1 of the test predictions
conf_matrix_fe = confusion_matrix(y_true=y_test, y_pred=y_pred_fe)
f1_fe = f1_score(y_true=y_test, y_pred=y_pred_fe, average='macro')

print(f'Feature Engineering Test set F1-score: {f1_fe}')
print(f'Feature Engineering Confusion Matrix:\n{conf_matrix_fe}')

In [None]:
# Record the feature-engineering experiment in its own MLflow run.
# NOTE(review): TP/TN/FP/FN read the cells of a 2x2 confusion matrix; confirm
# that 'Class' is binary.
with mlflow.start_run():
    mlflow.log_param('experiment', 'Feature Engineering')
    mlflow.log_metric('f1_score', f1_fe)
    for metric_name, cell in (('TP', (1, 1)), ('TN', (0, 0)), ('FP', (0, 1)), ('FN', (1, 0))):
        mlflow.log_metric(metric_name, conf_matrix_fe[cell])
    mlflow.sklearn.log_model(pipeline_with_fe, 'model')

## Experiment #4: Perform Feature Selection

In [None]:
# Feature selection using Correlation Threshold.
# The correlation matrix is built from the feature columns only: including the
# 'Class' target would allow the target itself to be dropped, or a feature to be
# dropped merely because it correlates strongly with the target.
corr_matrix = df.drop(columns='Class').corr().abs()

# Keep the strict upper triangle so each feature pair is inspected once and a
# column is only compared against the columns that precede it
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it is > 0.95 correlated with any earlier column
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
df_reduced_corr = df.drop(columns=to_drop)
print(f'Features dropped due to high correlation: {to_drop}')

In [None]:
# Feature selection using Feature Importance.
# The forest is seeded (same seed as the train/test split) so the importance
# ranking is repeatable, and the target is flattened to 1-D because
# scikit-learn warns when given an (n, 1) column vector.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, np.ravel(y_train))

# Feature positions ordered from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Select top 10 features (fewer if X has fewer than 10 columns)
selected_features = X.columns[indices][:10]
df_reduced_importance = df[selected_features]
print(f'Selected features based on importance: {selected_features}')

In [None]:
# Variance-based feature selection on the unscaled feature matrix X
from sklearn.feature_selection import VarianceThreshold

# Features whose variance does not exceed this cutoff are removed
variance_cutoff = (.8 * (1 - .8))
selector = VarianceThreshold(threshold=variance_cutoff)
df_reduced_variance = selector.fit(X).transform(X)
print(f'Shape after variance threshold: {df_reduced_variance.shape}')

In [None]:
# Record the outcome of the three feature-selection methods in one MLflow run,
# together with the random-forest model used for the importance ranking
selection_params = (
    ('experiment', 'Feature Selection'),
    ('correlation_dropped_features', to_drop),
    ('importance_selected_features', selected_features.tolist()),
    ('variance_threshold_shape', df_reduced_variance.shape),
)
with mlflow.start_run():
    for param_name, param_value in selection_params:
        mlflow.log_param(param_name, param_value)
    mlflow.sklearn.log_model(model, 'model')

## Experiment #5: Use PCA for Dimensionality Reduction

In [None]:
# Apply PCA for dimensionality reduction
from sklearn.decomposition import PCA

# Standardize the data before applying PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA, keeping every component so the full variance profile is available
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Cumulative explained variance after 1, 2, ..., k components. The x values
# start at 1 so the axis really shows the number of components; plotting
# against the default 0-based positions shifts every point by one.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

# Create a scree plot
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot')
plt.grid(True)
plt.show()

In [None]:
# Smallest number of leading components whose cumulative explained variance
# reaches at least 95%: position of the first True in the mask, plus one
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
reaches_target = cumulative_ratio >= 0.95
n_components = np.argmax(reaches_target) + 1
print(f'Number of components selected: {n_components}')

# Refit PCA restricted to that many components and project the scaled data
pca = PCA(n_components=n_components)
X_reduced = pca.fit_transform(X_scaled)

In [None]:
# Stratified 80/20 split of the PCA-reduced features; y_train and y_test are
# reassigned by this cell
reduced_split = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train_reduced, X_test_reduced, y_train, y_test = reduced_split

In [None]:
# Single-step pipeline: Logistic Regression on the PCA-reduced features
pca_steps = [('classifier', LogisticRegression())]
pipeline_pca = Pipeline(steps=pca_steps)

# 10-fold cross-validated macro-F1 on the reduced training split
cv_results_pca = cross_val_score(
    estimator=pipeline_pca,
    X=X_train_reduced,
    y=y_train,
    scoring='f1_macro',
    cv=10,
)
print(f'PCA Cross-validation mean F1-score: {cv_results_pca.mean()}')
print(f'PCA Cross-validation std F1-score: {cv_results_pca.std()}')

In [None]:
# Train on the reduced training split and predict the reduced test split
y_pred_pca = pipeline_pca.fit(X_train_reduced, y_train).predict(X_test_reduced)

# Confusion matrix and macro-averaged F1 of the test predictions
conf_matrix_pca = confusion_matrix(y_true=y_test, y_pred=y_pred_pca)
f1_pca = f1_score(y_true=y_test, y_pred=y_pred_pca, average='macro')

print(f'PCA Test set F1-score: {f1_pca}')
print(f'PCA Confusion Matrix:\n{conf_matrix_pca}')

In [None]:
# Record the PCA experiment in its own MLflow run.
# NOTE(review): TP/TN/FP/FN read the cells of a 2x2 confusion matrix; confirm
# that 'Class' is binary.
with mlflow.start_run():
    mlflow.log_param('experiment', 'PCA')
    mlflow.log_param('n_components', n_components)
    mlflow.log_metric('f1_score', f1_pca)
    for metric_name, cell in (('TP', (1, 1)), ('TN', (0, 0)), ('FP', (0, 1)), ('FN', (1, 0))):
        mlflow.log_metric(metric_name, conf_matrix_pca[cell])
    mlflow.sklearn.log_model(pipeline_pca, 'model')

## Experiment #6: Custom Experiment with Polynomial Features

In [None]:
# PolynomialFeatures supplies the degree-2 feature expansion used below
from sklearn.preprocessing import PolynomialFeatures

# Preprocessing: degree-2 polynomial expansion, then standardization
poly_steps = [
    ('poly', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler()),
]
preprocessor_poly = Pipeline(steps=poly_steps)

# Full model: the preprocessing above followed by Logistic Regression
pipeline_poly = Pipeline(steps=[
    ('preprocessor', preprocessor_poly),
    ('classifier', LogisticRegression()),
])

In [None]:
# 10-fold cross-validated macro-F1 of the polynomial-features pipeline
cv_results_poly = cross_val_score(
    estimator=pipeline_poly,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=10,
)
print(f'Polynomial Features Cross-validation mean F1-score: {cv_results_poly.mean()}')
print(f'Polynomial Features Cross-validation std F1-score: {cv_results_poly.std()}')

In [None]:
# Train on the whole training split and predict the held-out test split
y_pred_poly = pipeline_poly.fit(X_train, y_train).predict(X_test)

# Confusion matrix and macro-averaged F1 of the test predictions
conf_matrix_poly = confusion_matrix(y_true=y_test, y_pred=y_pred_poly)
f1_poly = f1_score(y_true=y_test, y_pred=y_pred_poly, average='macro')

print(f'Polynomial Features Test set F1-score: {f1_poly}')
print(f'Polynomial Features Confusion Matrix:\n{conf_matrix_poly}')

In [None]:
# Record the polynomial-features experiment in its own MLflow run.
# NOTE(review): TP/TN/FP/FN read the cells of a 2x2 confusion matrix; confirm
# that 'Class' is binary.
with mlflow.start_run():
    mlflow.log_param('experiment', 'Polynomial Features')
    mlflow.log_metric('f1_score', f1_poly)
    for metric_name, cell in (('TP', (1, 1)), ('TN', (0, 0)), ('FP', (0, 1)), ('FN', (1, 0))):
        mlflow.log_metric(metric_name, conf_matrix_poly[cell])
    mlflow.sklearn.log_model(pipeline_poly, 'model')

## Experiment #7: Custom Experiment with SMOTE for Imbalanced Data

In [None]:
# SMOTE oversampler from imbalanced-learn
from imblearn.over_sampling import SMOTE

# Resample the training split only; the test split is left as it is
smote = SMOTE(random_state=42)
resampled_train = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled_train

# Single-step pipeline: Logistic Regression on the resampled data
smote_steps = [('classifier', LogisticRegression())]
pipeline_smote = Pipeline(steps=smote_steps)

In [None]:
# Perform cross-validation with SMOTE applied inside each training fold.
# Cross-validating on data that was oversampled beforehand places synthetic
# points, interpolated from rows that end up in the validation fold, into the
# training folds, which inflates the score. imbalanced-learn's Pipeline runs the
# sampler during fit only, so each validation fold contains original rows only.
from imblearn.pipeline import Pipeline as ImbPipeline

cv_pipeline_smote = ImbPipeline(steps=[('smote', SMOTE(random_state=42)),
                                       ('classifier', LogisticRegression())])
cv_results_smote = cross_val_score(cv_pipeline_smote, X_train, y_train, cv=10, scoring='f1_macro')
print(f'SMOTE Cross-validation mean F1-score: {cv_results_smote.mean()}')
print(f'SMOTE Cross-validation std F1-score: {cv_results_smote.std()}')

In [None]:
# Train on the SMOTE-resampled training data and predict the untouched test split
y_pred_smote = pipeline_smote.fit(X_train_smote, y_train_smote).predict(X_test)

# Confusion matrix and macro-averaged F1 of the test predictions
conf_matrix_smote = confusion_matrix(y_true=y_test, y_pred=y_pred_smote)
f1_smote = f1_score(y_true=y_test, y_pred=y_pred_smote, average='macro')

print(f'SMOTE Test set F1-score: {f1_smote}')
print(f'SMOTE Confusion Matrix:\n{conf_matrix_smote}')

In [None]:
# Record the SMOTE experiment in its own MLflow run.
# NOTE(review): TP/TN/FP/FN read the cells of a 2x2 confusion matrix; confirm
# that 'Class' is binary.
with mlflow.start_run():
    mlflow.log_param('experiment', 'SMOTE')
    mlflow.log_metric('f1_score', f1_smote)
    for metric_name, cell in (('TP', (1, 1)), ('TN', (0, 0)), ('FP', (0, 1)), ('FN', (1, 0))):
        mlflow.log_metric(metric_name, conf_matrix_smote[cell])
    mlflow.sklearn.log_model(pipeline_smote, 'model')

## Compare F1-scores of Different Experiments

In [None]:
# Collect F1-scores from different experiments.
# Each label is paired with its score in one mapping so the two plotted
# sequences always have the same length (eight labels against five scores makes
# the bar plot fail). `f1` holds the value from the most recent assignment to
# that name, i.e. the last classifier fitted in the Experiment #2 loop
# (XGBClassifier), so that is the only Experiment #2 entry available here.
f1_by_experiment = {
    'XGBClassifier': f1,
    'Feature Engineering': f1_fe,
    'PCA': f1_pca,
    'Polynomial Features': f1_poly,
    'SMOTE': f1_smote,
}
experiments = list(f1_by_experiment.keys())
f1_scores = list(f1_by_experiment.values())

# Plot the F1-scores
plt.figure(figsize=(12, 6))
sns.barplot(x=experiments, y=f1_scores)
plt.xlabel('Experiments')
plt.ylabel('F1-score')
plt.title('Comparison of F1-scores across Different Experiments')
plt.xticks(rotation=45)
plt.show()

## Save the Final Model

In [None]:
# joblib is used to serialize the fitted pipeline to disk
import joblib

# The SMOTE-experiment pipeline is assumed to be the best model and is the one persisted
model_path = 'final_model.joblib'
joblib.dump(pipeline_smote, model_path)
print('Final model saved as final_model.joblib')