In [122]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV,StratifiedKFold,KFold

In [61]:
dataset = pd.read_csv("./data/red-wine-dataset/wineQualityReds.csv")

In [62]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [63]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
Unnamed: 0              1599 non-null int64
fixed.acidity           1599 non-null float64
volatile.acidity        1599 non-null float64
citric.acid             1599 non-null float64
residual.sugar          1599 non-null float64
chlorides               1599 non-null float64
free.sulfur.dioxide     1599 non-null float64
total.sulfur.dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(2)
memory usage: 162.5 KB


In [64]:
dataset.drop(columns='Unnamed: 0',inplace=True)

In [65]:
X=dataset.iloc[:,0:11]
y=dataset.iloc[:,11]

#### Cross Validation with splitting xtrain & xtest in complete data

In [66]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
display(X_train.head())
X_train.shape,X_test.shape

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
140,8.4,0.745,0.11,1.9,0.09,16.0,63.0,0.9965,3.19,0.82,9.6
1232,7.6,0.43,0.29,2.1,0.075,19.0,66.0,0.99718,3.4,0.64,9.5
720,8.4,0.56,0.04,2.0,0.082,10.0,22.0,0.9976,3.22,0.44,9.6
77,6.8,0.785,0.0,2.4,0.104,14.0,30.0,0.9966,3.52,0.55,10.7
39,7.3,0.45,0.36,5.9,0.074,12.0,87.0,0.9978,3.33,0.83,10.5


((1199, 11), (400, 11))

In [67]:
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

In [68]:
X_train.shape

(1199, 11)

In [69]:
classifier = RandomForestClassifier(n_estimators=300, random_state=0)

In [96]:
all_accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5,verbose=1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.6s finished


In [71]:
print(all_accuracies)

[0.6375     0.64166667 0.6875     0.6875     0.69037657]


In [72]:
print(all_accuracies.mean())

0.6689086471408647


In [73]:
print(all_accuracies.std())

0.0240032376491525


####  Cross Validation with complete data

In [75]:
feature_scaler = StandardScaler()
X = feature_scaler.fit_transform(X)
X.shape

(1599, 11)

In [97]:
all_accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=5,verbose=1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s finished


In [77]:
print(all_accuracies)

[0.540625  0.559375  0.628125  0.584375  0.5799373]


In [78]:
print(all_accuracies.mean())

0.5784874608150471


In [79]:
print(all_accuracies.std())

0.029324060281103285


#### Grid Search CrossValdation

In [80]:
grid_param = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [93]:
gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1,verbose=1)

In [94]:
gd_sr.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   55.2s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [85]:
best_parameters = gd_sr.best_params_
print(best_parameters)

{'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 1000}


In [86]:
best_result = gd_sr.best_score_
print(best_result)

0.6739016736401673


#### Grid Search CrossValdation with passing complete Data

In [95]:
gd_sr.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [89]:
best_parameters = gd_sr.best_params_
print(best_parameters)

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 300}


In [90]:
best_result = gd_sr.best_score_
print(best_result)

0.5784874608150471


In [99]:
gd_sr.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

#### Random search 

In [111]:
classifier = RandomForestClassifier(n_estimators=300, random_state=0)

In [112]:
rand_param = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [113]:
rd_sr = RandomizedSearchCV(classifier,
                     hyperparameters,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1,verbose=1)

In [114]:
rd_sr.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   27.6s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [115]:
best_parameters = rd_sr.best_params_
print(best_parameters)

{'n_estimators': 300, 'criterion': 'gini', 'bootstrap': True}


In [116]:
best_result = rd_sr.best_score_
print(best_result)

0.6689086471408647


#### Randomsearch passing all the data

In [117]:
rd_sr.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   33.9s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [118]:
best_parameters = rd_sr.best_params_
print(best_parameters)

{'n_estimators': 300, 'criterion': 'gini', 'bootstrap': True}


In [119]:
best_result = rd_sr.best_score_
print(best_result)

0.5784874608150471


#### STARTIFIED & K FOLD

In [126]:
# prepare cross validation
print(cross_val_score(classifier, X, y, cv=5),cross_val_score(classifier, X, y, cv=5).mean())
print(cross_val_score(classifier, X, y, cv=KFold(n_splits=5)),cross_val_score(classifier, X, y, cv=KFold(n_splits=5)).mean())
print(cross_val_score(classifier, X, y, cv=StratifiedKFold(n_splits=5)),cross_val_score(classifier, X, y, cv=StratifiedKFold(n_splits=5)).mean())


[0.540625  0.559375  0.628125  0.584375  0.5799373] 0.5784874608150471
[0.596875   0.546875   0.575      0.55625    0.63322884] 0.5816457680250784
[0.540625  0.559375  0.628125  0.584375  0.5799373] 0.5784874608150471


In [127]:
# prepare cross validation
print(cross_val_score(classifier,X_train, y_train, cv=5),cross_val_score(classifier,X_train, y_train, cv=5).mean())
print(cross_val_score(classifier,X_train, y_train, cv=KFold(n_splits=5)),cross_val_score(classifier,X_train, y_train, cv=KFold(n_splits=5)).mean())
print(cross_val_score(classifier, X_train, y_train, cv=StratifiedKFold(n_splits=5)),cross_val_score(classifier, X_train, y_train, cv=StratifiedKFold(n_splits=5)).mean())


[0.6375     0.64166667 0.6875     0.6875     0.69037657] 0.6689086471408647
[0.6625     0.62916667 0.6625     0.67083333 0.68619247] 0.6622384937238494
[0.6375     0.64166667 0.6875     0.6875     0.69037657] 0.6689086471408647
