# Wine Quality Prediction

### Machine learning model to predict the quality of Wine using linear regression.
The dataset employed in this analysis originates from a Kaggle competition, and you can access it through the following [link](https://www.kaggle.com/datasets/yasserh/wine-quality-dataset?resource=download).

In [24]:
# Load the dataset
import pandas as pd
data = pd.read_csv("Dataset/WineQT.csv")
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


### Data Preprocessing

In [25]:
# missing value in the Dataset
display(data.isna().sum())

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64

In [26]:
wine_df  = data.drop(['fixed acidity', 'citric acid', 'density'], axis = 1)
wine_df

Unnamed: 0,volatile acidity,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality,Id
0,0.700,1.9,0.076,11.0,34.0,3.51,0.56,9.4,5,0
1,0.880,2.6,0.098,25.0,67.0,3.20,0.68,9.8,5,1
2,0.760,2.3,0.092,15.0,54.0,3.26,0.65,9.8,5,2
3,0.280,1.9,0.075,17.0,60.0,3.16,0.58,9.8,6,3
4,0.700,1.9,0.076,11.0,34.0,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...
1138,0.510,2.3,0.076,29.0,40.0,3.42,0.75,11.0,6,1592
1139,0.620,1.9,0.068,28.0,38.0,3.42,0.82,9.5,6,1593
1140,0.600,2.0,0.090,32.0,44.0,3.45,0.58,10.5,5,1594
1141,0.550,2.2,0.062,39.0,51.0,3.52,0.76,11.2,6,1595


##### Divide the data into X (features) and Y (target)

In [21]:
import numpy as np
features = np.array(wine_df.loc[:, wine_df.columns != 'quality'])
target = np.array(wine_df['quality'])

In [67]:
# split the data into training and testing set
from sklearn.model_selection import train_test_split as tts
feature_train, feature_test, target_train, target_test = tts(features, target, test_size=0.2, random_state=42)

##### Features data

In [68]:
feature_train,feature_test

(array([[2.800e-01, 1.800e+00, 9.200e-02, ..., 7.500e-01, 1.050e+01,
         1.600e+01],
        [3.200e-01, 2.000e+00, 7.300e-02, ..., 7.300e-01, 1.140e+01,
         1.076e+03],
        [3.100e-01, 2.600e+00, 5.600e-02, ..., 6.900e-01, 1.180e+01,
         9.000e+02],
        ...,
        [3.500e-01, 2.400e+00, 6.800e-02, ..., 6.000e-01, 1.190e+01,
         1.580e+03],
        [5.700e-01, 2.000e+00, 7.900e-02, ..., 6.900e-01, 9.500e+00,
         1.216e+03],
        [5.200e-01, 2.200e+00, 6.000e-02, ..., 6.400e-01, 1.180e+01,
         1.575e+03]]),
 array([[6.10000000e-01, 1.50000000e+00, 5.70000000e-02, ...,
         6.00000000e-01, 9.50000000e+00, 2.22000000e+02],
        [8.40000000e-01, 4.10000000e+00, 7.40000000e-02, ...,
         7.20000000e-01, 9.23333333e+00, 1.51400000e+03],
        [5.80000000e-01, 1.90000000e+00, 9.10000000e-02, ...,
         4.80000000e-01, 1.05000000e+01, 4.17000000e+02],
        ...,
        [7.75000000e-01, 3.00000000e+00, 1.02000000e-01, ...,
         5

###### Target data

In [69]:
target_train, target_test

(array([7, 6, 5, 5, 5, 6, 7, 5, 6, 5, 6, 8, 6, 6, 5, 5, 6, 5, 5, 5, 5, 6,
        6, 6, 6, 5, 7, 7, 6, 5, 5, 6, 6, 7, 5, 5, 5, 5, 6, 5, 6, 6, 7, 5,
        6, 5, 6, 6, 6, 5, 5, 5, 7, 5, 4, 6, 7, 5, 5, 7, 6, 6, 5, 7, 6, 6,
        6, 7, 5, 6, 6, 5, 5, 6, 5, 4, 6, 7, 6, 6, 6, 6, 7, 6, 5, 5, 5, 6,
        5, 5, 6, 6, 5, 5, 5, 5, 6, 7, 5, 6, 6, 6, 5, 5, 7, 5, 4, 8, 6, 5,
        6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 7, 5, 5, 6, 6, 7, 5, 6, 6, 7, 6, 6,
        5, 7, 6, 6, 6, 7, 5, 6, 5, 5, 6, 5, 5, 5, 5, 8, 7, 6, 4, 5, 6, 7,
        5, 6, 7, 6, 6, 5, 6, 5, 6, 5, 5, 6, 6, 5, 8, 6, 6, 5, 5, 4, 6, 5,
        5, 6, 5, 5, 6, 7, 5, 6, 7, 7, 5, 6, 4, 4, 6, 5, 6, 7, 6, 6, 5, 6,
        5, 6, 6, 5, 6, 5, 5, 7, 6, 5, 5, 5, 4, 6, 6, 6, 6, 6, 6, 7, 3, 5,
        5, 5, 5, 5, 5, 5, 3, 5, 6, 6, 7, 5, 5, 6, 5, 7, 7, 6, 6, 6, 6, 4,
        5, 5, 6, 5, 5, 6, 6, 5, 6, 5, 6, 7, 5, 7, 5, 5, 6, 6, 7, 6, 6, 6,
        5, 5, 6, 6, 6, 7, 6, 6, 6, 7, 5, 5, 5, 5, 5, 7, 7, 6, 6, 5, 6, 5,
        6, 8, 6, 5, 6, 5, 6, 6, 6, 6, 

### Model

In [70]:
from sklearn.linear_model import LinearRegression
reg_model = LinearRegression()

In [71]:
reg_model.fit(feature_train, target_train)

In [72]:
# Coefficients
reg_model.coef_

array([-1.20852123e+00, -1.21266757e-02, -2.15706919e+00,  3.69729516e-03,
       -3.01563077e-03, -3.91367095e-01,  8.89247896e-01,  2.92819706e-01,
       -7.02747557e-05])

In [73]:
# Intercept
reg_model.intercept_

4.31001023440868

### Prediction

In [74]:
quality_predict = reg_model.predict(feature_test)
quality_predict = np.round(quality_predict)
quality_predict

array([5., 5., 5., 5., 6., 7., 5., 5., 6., 5., 6., 6., 5., 6., 6., 5., 6.,
       6., 5., 6., 6., 6., 5., 7., 6., 5., 7., 6., 5., 6., 6., 6., 7., 6.,
       6., 5., 6., 6., 7., 7., 6., 5., 7., 5., 5., 5., 6., 6., 5., 6., 6.,
       5., 6., 6., 6., 6., 6., 6., 6., 5., 6., 6., 6., 6., 6., 5., 7., 5.,
       5., 6., 6., 5., 6., 6., 5., 5., 6., 6., 6., 5., 5., 6., 6., 5., 5.,
       6., 5., 6., 6., 5., 5., 6., 5., 5., 5., 5., 5., 7., 6., 6., 6., 6.,
       5., 6., 5., 5., 6., 6., 6., 7., 6., 5., 6., 6., 5., 5., 5., 7., 6.,
       5., 5., 5., 6., 7., 5., 6., 6., 6., 5., 6., 5., 5., 6., 6., 7., 5.,
       4., 6., 6., 6., 7., 5., 6., 5., 6., 5., 5., 5., 5., 5., 7., 5., 6.,
       5., 6., 6., 5., 6., 6., 6., 6., 5., 6., 6., 6., 5., 6., 6., 6., 5.,
       6., 5., 5., 6., 5., 5., 6., 6., 6., 5., 6., 6., 6., 5., 5., 6., 6.,
       5., 6., 6., 6., 6., 7., 6., 6., 5., 5., 7., 7., 6., 6., 6., 5., 6.,
       6., 6., 5., 5., 6., 6., 5., 6., 6., 6., 6., 5., 7., 5., 6., 6., 6.,
       5., 5., 6., 5., 6.

In [75]:
sum = 0
for i,n in enumerate(target_test):
    if n == quality_predict[i]:
        sum += 1
accuracy = (sum/len(target_test))*100
accuracy

64.62882096069869