RED WINE QUALITY PREDICTION

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the red wine quality dataset from a CSV file located in the working directory.
data = pd.read_csv("winequality-red.csv")
# Show the frame; pandas abbreviates the rendering to the leading and trailing rows.
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(1599, 12)

In [4]:
# Summarise column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
# Count missing values per column.
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [6]:
# Target: the quality score. Features: every remaining column.
y = data["quality"]
x = data.drop(columns=["quality"])

In [7]:
# Inspect the feature frame (every column except "quality").
x

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [8]:
# Inspect the target series (the "quality" column).
y

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [11]:
# Candidate models, one Pipeline each so that any preprocessing is fitted together
# with the estimator on the training data.
#
# NOTE: this set mixes regressors and classifiers. Pipeline.score() delegates to the
# final estimator, so regressors report R^2 while classifiers report mean accuracy;
# the two kinds of score are not directly comparable.

# Regressors: linear regression on the raw features and on polynomial expansions.
pip_lr = Pipeline([("scaler1", StandardScaler()), ("lin_reg", LinearRegression())])
pip_l2r = Pipeline([
    ("scaler2", StandardScaler()),
    ("poly_features1", PolynomialFeatures(degree=2)),
    ("quad_reg", LinearRegression()),
])
pip_l3r = Pipeline([
    ("scaler3", StandardScaler()),
    ("poly_features2", PolynomialFeatures(degree=3)),
    ("cub_reg", LinearRegression()),
])
pip_l4r = Pipeline([
    ("scaler4", StandardScaler()),
    ("poly_features3", PolynomialFeatures(degree=4)),
    ("poly_reg", LinearRegression()),
])

# Classifier: multinomial logistic regression on standardised features.
pip_logr = Pipeline([("scaler5", StandardScaler()), ("log_reg", LogisticRegression())])

# Regularised / Bayesian linear regressors.
pip_rid = Pipeline([("scaler6", StandardScaler()), ("ridge", Ridge(alpha=1.0, random_state=1))])
pip_lasso = Pipeline([("scaler7", StandardScaler()), ("lasso", Lasso(alpha=1.0, random_state=1))])
pip_bay = Pipeline([("scaler8", StandardScaler()), ("bayesian", BayesianRidge())])

# Tree ensembles: no scaler step. They are seeded (same seed as the train/test
# split) so that re-running the notebook reproduces the same scores.
pip_random = Pipeline([("rf_classifier", RandomForestClassifier(random_state=1))])
pip_grad = Pipeline([("grad_classifier", GradientBoostingClassifier(random_state=1))])

# Support-vector classifier; the scaler step is numbered 9 to continue the
# scaler1..scaler8 sequence instead of reusing "scaler3".
pip_svc = Pipeline([("scaler9", StandardScaler()), ("svc", SVC())])

In [12]:
# Gather every candidate in one list so they can all be fitted in a single loop.
pipelines = [
    pip_lr,
    pip_l2r,
    pip_l3r,
    pip_l4r,
    pip_logr,
    pip_rid,
    pip_lasso,
    pip_bay,
    pip_random,
    pip_grad,
    pip_svc,
]
pipelines

[Pipeline(steps=[('scaler1', StandardScaler()), ('lin_reg', LinearRegression())]),
 Pipeline(steps=[('scaler2', StandardScaler()),
                 ('poly_features1', PolynomialFeatures()),
                 ('quad_reg', LinearRegression())]),
 Pipeline(steps=[('scaler3', StandardScaler()),
                 ('poly_features2', PolynomialFeatures(degree=3)),
                 ('cub_reg', LinearRegression())]),
 Pipeline(steps=[('scaler4', StandardScaler()),
                 ('poly_features3', PolynomialFeatures(degree=4)),
                 ('poly_reg', LinearRegression())]),
 Pipeline(steps=[('scaler5', StandardScaler()),
                 ('log_reg', LogisticRegression())]),
 Pipeline(steps=[('scaler6', StandardScaler()),
                 ('ridge', Ridge(random_state=1))]),
 Pipeline(steps=[('scaler7', StandardScaler()),
                 ('lasso', Lasso(random_state=1))]),
 Pipeline(steps=[('scaler8', StandardScaler()), ('bayesian', BayesianRidge())]),
 Pipeline(steps=[('rf_classifier', Ra

In [13]:
# Train each candidate pipeline on the same training split.
for candidate in pipelines:
    candidate.fit(x_train, y_train)

In [14]:
# Linear regression on the test split. For a regressor, score() is R^2 (not accuracy),
# here multiplied by 100.
pip_lr.score(x_test,y_test)*100


32.6640272698683

In [15]:
# Degree-2 polynomial regression: R^2 on the test split, multiplied by 100.
pip_l2r.score(x_test,y_test)*100

29.82376797500882

In [16]:
# Degree-3 polynomial regression: R^2 on the test split, multiplied by 100.
# R^2 is negative when the model predicts worse than a constant mean predictor.
pip_l3r.score(x_test,y_test)*100

-154.68120675138496

In [17]:
# Degree-4 polynomial regression: R^2 on the test split, multiplied by 100.
# R^2 has no lower bound, so a badly overfitted model can score far below zero.
pip_l4r.score(x_test,y_test)*100

-2959344.3311860445

In [18]:
# Logistic regression: for a classifier, score() is mean accuracy; shown as a percentage.
pip_logr.score(x_test,y_test)*100

58.75

In [19]:
# Ridge regression: R^2 on the test split, multiplied by 100.
pip_rid.score(x_test,y_test)*100

32.672220168687026

In [20]:
# Lasso regression (alpha=1.0): R^2 on the test split, multiplied by 100.
# NOTE(review): a score near zero would suggest alpha=1.0 shrinks the coefficients
# to (almost) nothing on standardised features — confirm by inspecting coef_.
pip_lasso.score(x_test,y_test)*100

-0.920963508656536

In [21]:
# Bayesian ridge regression: R^2 on the test split, multiplied by 100.
pip_bay.score(x_test,y_test)*100

32.846614373479

In [22]:
# Random forest classifier: mean accuracy on the test split, as a percentage.
pip_random.score(x_test,y_test)*100

70.9375

In [23]:
# Gradient boosting classifier: mean accuracy on the test split, as a percentage.
pip_grad.score(x_test,y_test)*100

66.875

In [24]:
# Support-vector classifier: mean accuracy on the test split, as a percentage.
pip_svc.score(x_test,y_test)*100

61.25000000000001

In [25]:
# Final model: a random forest classifier. The seed (same value as the train/test
# split) makes training deterministic, so the accuracy and the pickled artefact
# are reproducible across notebook runs.
model = RandomForestClassifier(random_state=1)

In [26]:
# Train the standalone random forest on the training split.
model.fit(x_train,y_train)

In [27]:
# Predict quality labels for the held-out test rows.
y_pred = model.predict(x_test)

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
# Fraction of test rows whose predicted label equals the true label.
# accuracy_score expects (y_true, y_pred) in that order; accuracy happens to be
# symmetric, but keeping the documented order avoids bugs if the metric is swapped
# for an asymmetric one (precision, recall, ...).
acc = accuracy_score(y_test, y_pred)
acc

0.7125

In [30]:
# Score one hand-entered wine. The values must be listed in the same order as the
# training columns.
input_data = (7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0)

# The model was fitted on a DataFrame, so wrap the sample in a one-row DataFrame
# with the training column names. Passing a bare array drops the name/value
# association and makes recent scikit-learn versions warn about feature names.
input_frame = pd.DataFrame([input_data], columns=x_train.columns)

output = model.predict(input_frame)

print("The quality of red wine is {}".format(output[0]))

The quality of red wine is 7




In [31]:
import pickle

In [32]:
# Persist the trained model. The context manager closes (and therefore flushes)
# the file deterministically instead of leaving the handle to the garbage collector.
with open("winequality.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [33]:
# Reload the persisted model. The context manager guarantees the handle is closed.
# NOTE: unpickling can execute arbitrary code — only load files you created or trust.
with open("winequality.pkl", "rb") as model_file:
    pickled_model = pickle.load(model_file)

In [34]:
# Sanity-check the reloaded model on one hand-entered wine. The values must be
# listed in the same order as the training columns.
input_data = (7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0)

# The model was fitted on a DataFrame, so wrap the sample in a one-row DataFrame
# with the training column names. Passing a bare array drops the name/value
# association and makes recent scikit-learn versions warn about feature names.
input_frame = pd.DataFrame([input_data], columns=x_train.columns)

output = pickled_model.predict(input_frame)

print("The quality of red wine is {}".format(output[0]))

The quality of red wine is 7


