In [90]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Load the red-wine quality table (path is relative to the notebook's working directory).
rw = pd.read_csv('data/winequality-red.csv')
# Binary label: 1 when quality >= 6, else 0. Vectorized comparison instead of a per-row lambda.
rw['target'] = (rw['quality'] >= 6).astype('int64')
# Select the features by name rather than by position, so the matrix cannot silently
# pick up `quality`/`target` if the CSV column order ever changes.
X = rw.drop(columns=['quality', 'target']).values
y = rw['target'].values
# NOTE(review): the scaler is fit on all rows before the train/test split in the next cell,
# so test-set statistics influence the scaling -- consider fitting on the training split only.
X_std = StandardScaler().fit_transform(X)

In [91]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 hold-out split of the standardized features, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_std,
    y,
    test_size=0.2,
    random_state=2023,
    stratify=y,
)

### SVC = 0.790625

In [92]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Baseline SVC (probability estimates enabled, fixed seed); `fit` returns the
# estimator itself, so construction and training are chained.
svc = SVC(probability=True, random_state=2023).fit(X_train, y_train)
# Accuracy on the held-out split.
svc.score(X_test, y_test)

0.790625

### SVC + GSCV = 0.79375

In [93]:
# Narrow sweep over the SVC parameter C, scored by 5-fold cross-validated accuracy.
params = {'C': [0.98, 0.981, 0.982]}
grid_svc = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_svc.fit(X_train, y_train)
grid_svc.best_params_

{'C': 0.981}

In [94]:
# Held-out accuracy of the best SVC found by the grid search.
grid_svc.best_estimator_.score(X=X_test, y=y_test)

0.79375

### KNN = 0.734375

In [95]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours baseline with library defaults, built and trained in one expression.
knn = KNeighborsClassifier().fit(X_train, y_train)
# Accuracy on the held-out split.
knn.score(X_test, y_test)

0.734375

### LRC = 0.778125

In [96]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline with a fixed seed, built and trained in one expression.
lrc = LogisticRegression(random_state=2023).fit(X_train, y_train)
# Accuracy on the held-out split.
lrc.score(X_test, y_test)

0.778125

In [97]:
# Small sweep over the logistic-regression parameter C, scored by 5-fold CV accuracy.
params = {'C': [0.87, 0.89, 0.9]}
grid_lrc = GridSearchCV(
    estimator=lrc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_lrc.fit(X_train, y_train)
grid_lrc.best_params_

{'C': 0.87}

In [98]:
# Held-out accuracy of the refit best logistic regression (same value the
# accuracy-scored search object reports through its own `score`).
grid_lrc.best_estimator_.score(X_test, y_test)

0.778125

### VOC (hard voting: LRC + SVC + KNN) = 0.784375

In [99]:
from sklearn.ensemble import VotingClassifier
# Hard-voting ensemble over the three base classifiers built in the earlier cells.
voc = VotingClassifier(
    voting='hard',
    estimators=[
        ('LRC', lrc),
        ('SVC', svc),
        ('KNN', knn),
    ],
)
voc.fit(X_train, y_train)
# Accuracy on the held-out split.
voc.score(X_test, y_test)

0.784375

### VOC2 (soft voting: LRC + SVC + KNN) = 0.796875

In [100]:
# A second SVC with probability estimates enabled, used as a member of the soft-voting ensemble.
# NOTE(review): VotingClassifier appears to fit its own copies of the members, so this
# standalone fit is probably redundant -- confirm nothing else needs `svc2` fitted.
svc2 = SVC(probability=True, random_state=2023).fit(X_train, y_train)
# Soft-voting ensemble over the same three model families.
voc2 = VotingClassifier(
    voting='soft',
    estimators=[
        ('LRC', lrc),
        ('SVC', svc2),
        ('KNN', knn),
    ],
)
voc2.fit(X_train, y_train)
# Accuracy on the held-out split.
voc2.score(X_test, y_test)

0.796875

### VOC2 + GSCV = 0.803125

In [101]:
# Jointly tune C for the logistic-regression and SVC members of the soft-voting ensemble;
# the `<member name>__<param>` keys use the names given in `estimators`.
params = {'LRC__C': [0.1, 1, 10], 'SVC__C': [0.1, 1, 10]}
grid_voc2 = GridSearchCV(
    estimator=voc2,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 0.1, 'SVC__C': 10}

In [102]:
# Held-out accuracy of the best soft-voting ensemble found by the grid search.
grid_voc2.best_estimator_.score(X=X_test, y=y_test)

0.803125

### RFC = 0.86875

In [118]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest baseline with a fixed seed, built and trained in one expression.
rfc = RandomForestClassifier(random_state=2023).fit(X_train, y_train)
# Accuracy on the held-out split.
rfc.score(X_test, y_test)

0.86875

### XGB = 0.840625

In [106]:
from xgboost import XGBClassifier
# XGBoost baseline with library defaults, built and trained in one expression.
xgb = XGBClassifier().fit(X_train, y_train)
# Accuracy on the held-out split.
xgb.score(X_test, y_test)

0.840625

In [107]:
# Sweep over deep XGBoost trees (max_depth 18-20), scored by 5-fold CV accuracy.
params = {'max_depth': [18, 19, 20]}
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'max_depth': 19}

In [108]:
# Held-out accuracy of the refit best deep-tree XGBoost model (same value the
# accuracy-scored search object reports through its own `score`).
grid_search.best_estimator_.score(X_test, y_test)

0.840625

### XGB + GSCV = 0.840625

In [109]:
# Sweep over shallower XGBoost trees (max_depth 5-7), scored by 5-fold CV accuracy.
params = {'max_depth': [5, 6, 7]}
grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_xgb.fit(X_train, y_train)
grid_xgb.best_params_

{'max_depth': 6}

In [110]:
# Held-out accuracy of the best XGBoost model found by the max_depth 5-7 search.
grid_xgb.best_estimator_.score(X=X_test, y=y_test)

0.840625

### LGB = 0.846875

In [111]:
from lightgbm import LGBMClassifier, log_evaluation
# LightGBM baseline with library defaults.
lgb = LGBMClassifier()
# NOTE(review): the monitored evaluation set is the test split. That is fine for logging,
# but do not use it for early stopping or model selection.
evals = [(X_test, y_test)]
# `fit(..., verbose=True)` is not accepted by LightGBM >= 4.0; per-iteration logging of the
# evaluation metric is requested through the `log_evaluation` callback instead (period=1
# prints every boosting round, matching the old `verbose=True`).
lgb.fit(
    X_train,
    y_train,
    eval_set=evals,
    eval_metric='logloss',
    callbacks=[log_evaluation(period=1)],
)

[1]	valid_0's binary_logloss: 0.653161
[2]	valid_0's binary_logloss: 0.625011
[3]	valid_0's binary_logloss: 0.59498
[4]	valid_0's binary_logloss: 0.570736
[5]	valid_0's binary_logloss: 0.548624
[6]	valid_0's binary_logloss: 0.529952
[7]	valid_0's binary_logloss: 0.517124
[8]	valid_0's binary_logloss: 0.503533
[9]	valid_0's binary_logloss: 0.491252
[10]	valid_0's binary_logloss: 0.480848
[11]	valid_0's binary_logloss: 0.470929
[12]	valid_0's binary_logloss: 0.459038
[13]	valid_0's binary_logloss: 0.450861
[14]	valid_0's binary_logloss: 0.447485
[15]	valid_0's binary_logloss: 0.440419
[16]	valid_0's binary_logloss: 0.438103
[17]	valid_0's binary_logloss: 0.434231
[18]	valid_0's binary_logloss: 0.431824
[19]	valid_0's binary_logloss: 0.429003
[20]	valid_0's binary_logloss: 0.42519
[21]	valid_0's binary_logloss: 0.42258
[22]	valid_0's binary_logloss: 0.419985
[23]	valid_0's binary_logloss: 0.417877
[24]	valid_0's binary_logloss: 0.416492
[25]	valid_0's binary_logloss: 0.412202
[26]	valid_0



In [112]:
# Held-out accuracy of the LightGBM baseline.
lgb.score(X=X_test, y=y_test)

0.846875

### LogisticRegression on MinMax-scaled features (X_mm) = 0.76875

In [113]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the raw features with MinMaxScaler and redo the same stratified 80/20 split.
X_mm = MinMaxScaler().fit_transform(X)
# NOTE(review): this rebinds X_train/X_test/y_train/y_test, so every later cell (and any
# earlier cell re-run afterwards) sees the MinMax-scaled split -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_mm,
    y,
    test_size=0.2,
    random_state=2023,
    stratify=y,
)
# Logistic regression with library defaults on the MinMax-scaled split.
lr = LogisticRegression().fit(X_train, y_train)
# Accuracy on the held-out split.
lr.score(X_test, y_test)

0.76875

### DTC = 0.734375

In [114]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with a fixed seed, trained on whatever split X_train currently holds.
dtc = DecisionTreeClassifier(random_state=2023).fit(X_train, y_train)
# Accuracy on the held-out split.
dtc.score(X_test, y_test)

0.734375