In [101]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [78]:
data = pd.read_csv("Downloads/wine_train.csv")

In [79]:
data.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,9.0,0.245,0.38,5.9,0.045,52.0,159.0,0.995,2.93,0.35,10.2,6
1,8.2,0.42,0.29,4.1,0.03,31.0,100.0,0.9911,3.0,0.32,12.8,7
2,6.4,0.22,0.32,7.2,0.028,15.0,83.0,0.993,3.13,0.55,10.9,8
3,5.0,0.35,0.25,7.8,0.031,24.0,116.0,0.99241,3.39,0.4,11.3,6
4,7.4,0.3,0.3,5.2,0.053,45.0,163.0,0.9941,3.12,0.45,10.3,6


In [80]:
data['quality'].value_counts()

6    1647
5    1071
7     671
8     143
4     123
3      14
9       4
Name: quality, dtype: int64

In [81]:
data['quality'].unique()

array([6, 7, 8, 5, 3, 4, 9])

In [82]:
bins = (2,5,6,9)
group_names = ['bad','normal','good']
data['quality'] = pd.cut(data['quality'], bins = bins, labels = group_names)

In [83]:
data['quality'].value_counts()

normal    1647
bad       1208
good       818
Name: quality, dtype: int64

In [84]:
data['quality'].unique()

['normal', 'good', 'bad']
Categories (3, object): ['bad' < 'normal' < 'good']

In [51]:
# label_quality = LabelEncoder()

In [52]:
#Bad becomes 0 and good becomes 1
# data['quality'] = label_quality.fit_transform(data['quality'])

In [91]:
data['quality'].value_counts()

normal    1647
bad       1208
good       818
Name: quality, dtype: int64

In [85]:
#Now seperate the dataset as response variable and feature variabes
X = data.drop('quality', axis = 1)
y = data['quality']

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [87]:
#Applying Standard scaling to get optimized result
sc = StandardScaler()

In [88]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [89]:
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [90]:
#Let's see how our model performed
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

         bad       0.75      0.71      0.73       246
        good       0.72      0.58      0.64       168
      normal       0.63      0.72      0.67       321

    accuracy                           0.68       735
   macro avg       0.70      0.67      0.68       735
weighted avg       0.69      0.68      0.68       735



In [92]:
print(confusion_matrix(y_test, pred_rfc))

[[175   2  69]
 [  4  97  67]
 [ 54  36 231]]


In [134]:
rfc=RandomForestClassifier(random_state=42)

In [136]:
param_grid = { 
    'n_estimators': [200,250,300,350,400,450,500,550,600,650,700],
    'max_depth' : [10,15,20,25,30,35,40,45,50,None],
}

In [137]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [10, 15, 20, 25, 30, 35, 40, 45, 50,
                                       None],
                         'n_estimators': [200, 250, 300, 350, 400, 450, 500,
                                          550, 600, 650, 700]})

In [138]:
CV_rfc.best_params_

{'max_depth': 25, 'n_estimators': 400}

In [139]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 400, max_depth=25, criterion='gini')

In [140]:
rfc1.fit(X_train, y_train)

RandomForestClassifier(max_depth=25, n_estimators=400, random_state=42)

In [141]:
pred=rfc1.predict(X_test)

In [119]:
rfc = RandomForestClassifier(n_estimators=600)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)

In [142]:
from sklearn.metrics import accuracy_score
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_test,pred))

Accuracy for Random Forest on CV data:  0.6870748299319728


In [131]:
svc =  SVC(C = 1.2, gamma =  0.9, kernel= 'rbf')
svc.fit(X_train, y_train)
pred = svc.predict(X_test)

In [133]:
rfc_eval = cross_val_score(estimator = rfc, X = X_train, y = y_train, cv = 10)
rfc_eval.mean()

0.6861937266374126