#Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('/content/datasets_4458_8204_winequality-red.csv')

In [3]:
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
dataset['quality'].unique()

array([5, 6, 7, 4, 8, 3])

#Separate Features and Target
* Here we are considering 'quality' as target
* and rest of the columns as features




In [5]:
x = dataset.iloc[: , 1:-1].values
y = dataset.iloc[:, -1].values

In [6]:
x

array([[ 0.7  ,  0.   ,  1.9  , ...,  3.51 ,  0.56 ,  9.4  ],
       [ 0.88 ,  0.   ,  2.6  , ...,  3.2  ,  0.68 ,  9.8  ],
       [ 0.76 ,  0.04 ,  2.3  , ...,  3.26 ,  0.65 ,  9.8  ],
       ...,
       [ 0.51 ,  0.13 ,  2.3  , ...,  3.42 ,  0.75 , 11.   ],
       [ 0.645,  0.12 ,  2.   , ...,  3.57 ,  0.71 , 10.2  ],
       [ 0.31 ,  0.47 ,  3.6  , ...,  3.39 ,  0.66 , 11.   ]])

In [7]:
y

array([5, 5, 5, ..., 6, 5, 6])

#Splitting Data into Train and Test sets

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.20)


#Standardization

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
sc = StandardScaler()

In [12]:
xn_train = sc.fit_transform(x_train)
xn_test = sc.transform(x_test)

#Train on different algorithms
We totally trained on 5 algorithms
* LogisticRegression class we have used linear_model library
* KNeighborsClassifier class we have used neighbors library
* DecisionTreeClassifier class we have used tree library
* RandomForestClassifier class we have used ensemble library
* SVC class we have used svm library

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [14]:
l_cla = LogisticRegression()
k_cla = KNeighborsClassifier()
d_cla = DecisionTreeClassifier()
r_cla = RandomForestClassifier()
s_cla = SVC(kernel='linear')
ks_cla = SVC(kernel= 'rbf')

In [15]:
l_cla.fit(x_train, y_train)
k_cla.fit(x_train, y_train)
d_cla.fit(x_train, y_train)
r_cla.fit(x_train, y_train)
s_cla.fit(x_train, y_train)
ks_cla.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [16]:
l_pred = l_cla.predict(x_test)
k_pred = k_cla.predict(x_test)
d_pred = d_cla.predict(x_test)
r_pred = r_cla.predict(x_test)
s_pred = s_cla.predict(x_test)
ks_pred = ks_cla.predict(x_test)

In [17]:
from sklearn.metrics import confusion_matrix

In [18]:
l_c = confusion_matrix(y_test, l_pred)
k_c = confusion_matrix(y_test, k_pred)
d_c = confusion_matrix(y_test, d_pred)
r_c = confusion_matrix(y_test, r_pred)
s_c = confusion_matrix(y_test, s_pred)
ks_c = confusion_matrix(y_test, ks_pred)

In [19]:
l_c

array([[  0,   0,   4,   0,   0,   0],
       [  0,   0,   9,   4,   0,   0],
       [  0,   0, 109,  41,   1,   0],
       [  0,   0,  33,  80,   6,   0],
       [  0,   0,   2,  25,   4,   0],
       [  0,   0,   0,   1,   1,   0]])

In [20]:
k_c

array([[ 0,  0,  4,  0,  0,  0],
       [ 0,  0,  8,  5,  0,  0],
       [ 0,  3, 98, 46,  4,  0],
       [ 0,  2, 38, 73,  6,  0],
       [ 0,  1,  5, 15, 10,  0],
       [ 0,  0,  1,  1,  0,  0]])

In [21]:
d_c

array([[  0,   1,   1,   2,   0,   0],
       [  1,   0,   6,   5,   1,   0],
       [  0,   4, 102,  39,   6,   0],
       [  1,   1,  17,  82,  17,   1],
       [  0,   0,   1,   8,  22,   0],
       [  0,   0,   0,   1,   1,   0]])

In [22]:
r_c

array([[  0,   1,   3,   0,   0,   0],
       [  1,   0,   9,   3,   0,   0],
       [  0,   0, 117,  31,   3,   0],
       [  0,   0,  12,  95,  12,   0],
       [  0,   0,   1,   9,  21,   0],
       [  0,   0,   0,   1,   1,   0]])

In [23]:
s_c

array([[  0,   0,   4,   0,   0,   0],
       [  0,   0,   9,   4,   0,   0],
       [  0,   0, 113,  38,   0,   0],
       [  0,   0,  35,  84,   0,   0],
       [  0,   0,   2,  29,   0,   0],
       [  0,   0,   0,   2,   0,   0]])

In [24]:
ks_c

array([[ 0,  0,  1,  3,  0,  0],
       [ 0,  0,  4,  9,  0,  0],
       [ 0,  0, 69, 82,  0,  0],
       [ 0,  0, 24, 95,  0,  0],
       [ 0,  0,  3, 28,  0,  0],
       [ 0,  0,  1,  1,  0,  0]])

#Result

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
l_a = accuracy_score(y_test, l_pred)
k_a = accuracy_score(y_test, k_pred)
d_a = accuracy_score(y_test, d_pred)
r_a = accuracy_score(y_test, r_pred)
s_a = accuracy_score(y_test, s_pred)
ks_a = accuracy_score(y_test, ks_pred)

In [27]:
print('Logistic Regression: ' + str(l_a) + '\nKNN: ' + str(k_a) + '\nDecision Tree: ' + str(d_a) + '\nRandom Forest: ' + str(r_a) + '\nLinear SVC: ' + str(s_a) + '\nKernel SVC: ' + str(l_a))

Logistic Regression: 0.603125
KNN: 0.565625
Decision Tree: 0.64375
Random Forest: 0.728125
Linear SVC: 0.615625
Kernel SVC: 0.603125




#Conclusion
done by using Classification

* Since RandomForest is giving more efficiency than other algorithm
* We can consider RandomForest as the best algorithm for this dataset