In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv')

In [2]:
wine = df.copy()

### 請嘗試使用 scikit-learn 中「不同基本分類模型」，並且進行比較結果？

In [3]:
from sklearn.linear_model import LogisticRegression , LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
log = LogisticRegression(random_state = 0 , max_iter = 1000)
logcv = LogisticRegressionCV(random_state=0 , max_iter = 3000)
svc = SVC(random_state = 0)
decisiontree = DecisionTreeClassifier()
knn = KNeighborsClassifier(n_neighbors=3)
gaussian = GaussianNB()
rf = RandomForestClassifier(n_estimators=100)
models = [log , logcv , svc , decisiontree , knn , gaussian , rf]

In [5]:
model_names = ['Logistic Regression' , 'Logistic Regression CV' , 'SVC' , 'Decision Tree' , 'KNN' , 'Gaussian Navive' , 'Random Forest']
res = []
from sklearn.model_selection import cross_val_score

def score(model , x , y):
  return cross_val_score(model , x , y , cv=3 , scoring = 'accuracy').mean()

In [6]:
columns_X = list(set(wine.columns) - {'quality'})
columns_y = ['quality']

train_X = wine[columns_X]
train_y = wine[columns_y]

for num in range(len(models)):
    res.append({'Model':model_names[num] , 'Average Score': score(models[num] , train_X , train_y)})
    
res = pd.DataFrame(res)
res.sort_values('Average Score', ascending=False)

Unnamed: 0,Model,Average Score
1,Logistic Regression CV,0.572858
6,Random Forest,0.567855
0,Logistic Regression,0.561601
5,Gaussian Navive,0.541588
2,SVC,0.482802
3,Decision Tree,0.460913
4,KNN,0.424015


### 可以利用「sklearn.model_selection 下的 GridSearchCV(...)」進行參數的調整。

In [7]:
from sklearn.model_selection import GridSearchCV

wine2 = df.copy()

In [8]:
columns_X = list(set(wine2.columns) - {'quality'})
columns_y = ['quality']

train_X = wine2[columns_X]
train_y = wine2[columns_y]

In [10]:
# Logistic Regression
log_reg = LogisticRegression()

# 定義超參數範圍
param_grid = {
    'penalty' : ['l1', 'l2'], # 正則化參數
    'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000], # C參數
    'solver' : ['liblinear'] # 優化算法
}

# 創建 GridSearchCV 物件
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(train_X, train_y)

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Parameters:  {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score:  0.5703564727954972


In [11]:
# RandomForestClassifier

rfc = RandomForestClassifier(random_state=42)

Parameters = {
  'max_depth' : [5, 10, 20],
  'n_estimators': [10, 50, 100, 150],
}

# 創建 GridSearchCV 物件
grid_search = GridSearchCV(estimator=rfc, param_grid=Parameters, cv=3, scoring='accuracy')
grid_search.fit(train_X, train_y)

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Parameters:  {'max_depth': 20, 'n_estimators': 150}
Best Score:  0.5791119449656035
