# In [None]:
# Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from warnings import filterwarnings
filterwarnings(action='ignore')

# Load Dataset
wine = pd.read_csv("winequality-red.csv")
print("Successfully Imported Data!")

# The summaries below used to be bare expressions, so their results were
# computed and then thrown away. Each one is now printed explicitly so the
# output is visible however the file is run.

# First Rows
print(wine.head())

# Dataset Shape
print(wine.shape)

# Statistical Summary
print(wine.describe(include='all'))

# Check for Missing Values
print(wine.isna().sum())

# Correlation Matrix
print(wine.corr())

# Group by Quality: per-column mean for each quality value
print(wine.groupby('quality').mean())

# Data Visualization

# Countplots
# One count chart per column, each shown on its own figure.
# The Series is passed as `x=` by keyword: seaborn >= 0.12 interprets the
# first positional argument as `data`, which collapses the chart into a
# single bar instead of one bar per distinct value.
for column in ('quality', 'pH', 'alcohol', 'fixed acidity',
               'volatile acidity', 'citric acid', 'density'):
    sns.countplot(x=wine[column])
    plt.show()

# KDE Plot
# Kernel density estimate of the quality values greater than 2.
sns.kdeplot(wine.query('quality > 2').quality)
# Show the KDE on its own figure; without this call the next plot is drawn
# onto the same axes when the file runs as a single script or cell.
plt.show()

# Distribution Plot
# sns.distplot is deprecated; histplot with kde=True and stat='density' is
# the documented replacement for the same histogram-plus-KDE view.
sns.histplot(wine['alcohol'], kde=True, stat='density')
plt.show()

# Boxplot: one panel per column on a 4x4 grid, x-axes not shared
wine.plot.box(subplots=True, layout=(4, 4), sharex=False)
plt.show()

# Density Plot: same grid layout, one density curve per column
wine.plot.density(subplots=True, layout=(4, 4), sharex=False)
plt.show()

# Histogram: 50 bins per column on a 10x10 figure
wine.hist(bins=50, figsize=(10, 10))
plt.show()

# Heatmap: pairwise column correlations, annotated with their values
corr = wine.corr()
sns.heatmap(data=corr, annot=True)
plt.show()

# Pairplot: pairwise relationships between the columns
sns.pairplot(data=wine)
plt.show()

# Violinplot: alcohol distribution for each quality value
sns.violinplot(data=wine, x='quality', y='alcohol')
plt.show()

# Feature Engineering: Create Target
# Binary label: 1 when quality is 7 or higher, otherwise 0.
wine['goodquality'] = (wine['quality'] >= 7).astype('int64')
Y = wine['goodquality']
# Features are every column except the raw score and the derived label.
X = wine.drop(columns=['quality', 'goodquality'])

# Splitting the dataset: 70% train / 30% test with a fixed seed
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# Feature Scaling
# The scaler is fitted on the training rows only and then applied to both
# partitions, so no test-set statistics enter the fit.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

# Model Building
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Candidate classifiers as (display name, estimator) pairs, all with default
# hyper-parameters.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('K Neighbors Classifier', KNeighborsClassifier()),
    ('Support Vector Classifier', SVC()),
    ('Gaussian NB', GaussianNB()),
    # The tree-based estimators use randomness while fitting. They are seeded
    # with the same value as the train/test split so that the reported scores
    # are reproducible from run to run.
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
]

# Parallel lists collecting each model's name and its test-set accuracy.
names = []
scores = []

# Fit every model on the training partition, then report its confusion
# matrix, classification report and accuracy on the held-out partition.
for name, model in models:
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    acc = accuracy_score(y_test, pred)
    print(f"Model: {name}")
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    print(f"Accuracy: {acc}")
    names.append(name)
    scores.append(acc)

# Compare Models: one row per model with its accuracy
result = pd.DataFrame({'Model': names, 'Score': scores})
print(result)