#### Wine Quality Prediction

In [None]:
#Install required packages
%pip install numpy pandas matplotlib seaborn xgboost scikit-learn

In [None]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Loading the dataset and checking the basic information about it.
df = pd.read_csv('winequalityN.csv')

# By default a notebook cell only renders its final expression, so the
# intermediate summaries are printed explicitly; as bare expressions their
# results would be computed and then silently discarded.
print(df.head())
print(df.tail())
df.info()  # prints its report itself
print(df.describe().T)
print(df.isnull().sum())
df['type'].unique()

In [None]:
# Handling missing values: fill the gaps of each numeric column with that
# column's mean.
# Only numeric columns are considered: a mean is not defined for a text column
# such as 'type', so attempting it there would fail rather than impute.
numeric_cols = df.select_dtypes(include='number').columns
for col in numeric_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mean())

In [None]:
# Data visualization: histogram grid of the dataframe's columns on a 20x10 figure
df.hist(
    figsize=(20, 10),
)

In [None]:
# Encode the 'type' column to numeric so it can take part in the correlation.
# The guard asks "is the column numeric yet?" instead of comparing the dtype to
# 'object': that keeps the cell safe to re-run and also covers pandas versions
# that load text columns with a dedicated string dtype, where the old comparison
# would skip the mapping and df.corr() would then fail on the raw strings.
if not pd.api.types.is_numeric_dtype(df['type']):
    df['type'] = df['type'].map({'white': 0, 'red': 1})
df.corr()

In [None]:
# Correlation heatmap: annotated pairwise correlations of all columns
plt.figure(figsize=(20, 10))
corr_matrix = df.corr()
sns.heatmap(
    corr_matrix,
    annot=True,       # write the coefficient inside every cell
    fmt=".2f",        # two decimals keep the annotations readable
    cmap='coolwarm',
    linewidths=0.5,
)

In [None]:
# Checking which distinct scores occur in the target column
df['quality'].unique()  # original run reported: array([6, 5, 7, 8, 4, 3, 9])

In [None]:
# The quality column holds several distinct scores, so it is collapsed into a
# binary target: 1 when the score is above 5, otherwise 0.
df['best quality'] = (df['quality'] > 5).astype('int64')

In [None]:
# Dropping the original multi-valued quality column
df = df.drop('quality', axis=1)

In [None]:
# Encoding the categorical variable 'type' as integer codes using LabelEncoder.
# NOTE(review): if the correlation cell already mapped 'type' to 0/1, this
# re-encodes values that are numeric already — confirm it is still needed.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['type'])
df['type'] = le.transform(df['type'])

In [None]:
# Splitting the dataset into train and test sets (20% held out for testing)
from sklearn.model_selection import train_test_split

X = df.drop(columns='best quality')
y = df['best quality']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=89,  # fixed seed so the split is reproducible
)
X_train.shape, X_test.shape  # original run reported ((5197, 12), (1300, 12))
                                                

In [None]:
# Feature scaling with MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

norm = MinMaxScaler()
# Learn the per-feature minimum/maximum from the training data only ...
X_train = norm.fit_transform(X_train)
# ... and apply those same bounds to the test data. Re-fitting the scaler on
# X_test would scale the two sets inconsistently and leak test-set statistics
# into preprocessing, so the test set is only transformed.
X_test = norm.transform(X_test)

In [None]:
# Model building using Logistic Regression (default hyperparameters)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Predict on the held-out set and report the fraction of correct labels
lr_prediction = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_prediction)
lr_accuracy

Finally, I used the Support Vector Classifier.


In [None]:
# Model building using a Support Vector Classifier (default hyperparameters)
from sklearn.svm import SVC

svc_model = SVC()
svc_model.fit(X_train, y_train)

# Predict on the held-out set and score it with accuracy_score, which is
# imported in the logistic-regression cell
svc_prediction = svc_model.predict(X_test)
svc_accuracy = accuracy_score(y_test, svc_prediction)
svc_accuracy