# Wine Dataset Machine Learning Project

In [ ]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import sqlalchemy
from sqlalchemy import create_engine
import dagshub


## Data Preprocessing

In [ ]:
# Read the cleaned wine data from the CSV file in the working directory into
# a DataFrame, then show its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='cleaned_wine_data.csv')
df.head(n=5)

In [ ]:
# Check for missing values (uncomment the line below to show per-column null counts)
# df.isnull().sum()

In [ ]:
# Separate the target column from the feature columns, then standardize the
# features with StandardScaler.
# Note: the scaler is fitted on every row of X in this cell.
y = df['Class']
X = df.drop(columns='Class')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Exploratory Data Analysis (EDA)

In [ ]:
# Exploratory plot: pairwise plots of the DataFrame's columns, with points
# colored by the 'Class' column.
pair_grid = sns.pairplot(data=df, hue='Class')
plt.show()

## Database Creation

In [ ]:
# Persist the dataset to a local SQLite database file (wine.db).
# The whole DataFrame is written as one flat table named 'wine'; any existing
# table with that name is replaced and the DataFrame index is not stored.
# No normalization into separate tables is performed in this cell.
db_url = 'sqlite:///wine.db'
engine = create_engine(db_url)
df.to_sql(name='wine', con=engine, if_exists='replace', index=False)

In [ ]:
# Read every row of the 'wine' table back from the database into a new
# DataFrame and show its first five rows as the cell output.
# NOTE(review): df_extracted is not referenced by the later cells visible
# here; the modelling cells work from `df` instead.
select_all_wine = 'SELECT * FROM wine'
df_extracted = pd.read_sql(sql=select_all_wine, con=engine)
df_extracted.head(n=5)

## Machine Learning Experiments

In [ ]:
# Split the data into training and testing sets (80% / 20%, fixed seed).
#
# The split is done on the raw feature matrix X and the scaler is then fitted
# on the training rows only. Fitting the scaler on all rows before splitting
# lets the test set's mean/variance influence the training features (data
# leakage) and makes the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows, then apply that same
# transformation to the held-out rows.
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [ ]:
# Train and evaluate various models.
#
# The tree-based estimators use randomness internally, so they are seeded with
# the same value as the train/test split (42); otherwise their metrics change
# from run to run.
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Test-set accuracy per model name, kept so results remain available after the
# loop (y_pred is overwritten on every iteration).
accuracies = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracies[name] = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {accuracies[name]}')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

## Logging Results

In [ ]:
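# Log the results on DagsHub
# NOTE: after the training loop above, y_pred holds only the predictions of the
# last model in the `models` dict (Random Forest), so the line below would log
# that single model's accuracy.
# TODO: confirm these calls against the installed dagshub client before enabling.
# dagshub.init('username/repo_name')
# dagshub.log_metrics({'accuracy': accuracy_score(y_test, y_pred)})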