# Red wine quality prediction using Random Forest -hyperparameter tuning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('winequality-red.csv')


In [6]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
df['quality'].unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [11]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None


In [12]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [14]:
X = df.iloc[:,:-1]
Y = df.iloc[:,-1]

In [19]:
from sklearn.model_selection import train_test_split as tts

x_train,x_test,y_train,y_test = tts(X,Y,test_size = .2,random_state = 42+1)

Model building

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
classifier = RandomForestClassifier(n_estimators = 10,criterion = 'entropy')
#n_estimators = number of deciosin tree
#criterion[entropy] = a measure of disorder or impurity in a node
#criterion[gini] =randomness or the impurity or entropy in the values of a dataset
classifier.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10)

In [30]:
y_pred = classifier.predict(x_test)

Performance

In [45]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [47]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[  0,   1,   1,   1,   0,   0],
       [  0,   0,   2,   2,   0,   0],
       [  0,   3, 103,  25,   0,   0],
       [  0,   0,  36,  90,  10,   0],
       [  0,   0,   3,  21,  20,   1],
       [  0,   0,   0,   0,   0,   1]], dtype=int64)

In [49]:
score = accuracy_score(y_test,y_pred)
score

0.66875

above is very bad result to enhance the accuracy hypermeter tuning comes into the picture

In [51]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         4
           5       0.71      0.79      0.75       131
           6       0.65      0.66      0.65       136
           7       0.67      0.44      0.53        45
           8       0.50      1.00      0.67         1

    accuracy                           0.67       320
   macro avg       0.42      0.48      0.43       320
weighted avg       0.66      0.67      0.66       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## fine tuning using RandomSearchCV

In [54]:
#It is more computationally efficient than GridSearchCV because it samples a subset of hyperparameter combinations, making it suitable for large hyperparameter spaces.
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.classifier import RandomForest/

In [55]:
from scipy.stats import randint

In [57]:
param_dist ={
   'n_estimators': randint(100,500),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),    
    
}

In [60]:
rf_classifier = RandomForestClassifier()
random_search = RandomizedSearchCV(rf_classifier,
                                  param_distributions = param_dist,
                                  n_iter = 10,cv = 5 
                                   
                                  )

In [61]:
random_search.fit(x_train,y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': [None, 10, 20, 30],
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000200E9F360A0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000200E9C8E1F0>})

model accuracy after tuning

In [64]:
best_params = random_search.best_params_

In [67]:
print(best_params)

{'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 344}


In [68]:
best_score = random_search.best_score_
print(best_score)

0.6841084558823529


In [69]:
best_model = random_search.best_estimator_