# Import the dataset

In [1]:
# Import pandas library and the data set
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sb
# %matplotlib inline

df = pd.read_csv('winequality-red.csv')

In [2]:
# Have a look at the first five rows
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Check if any columns have missing data
df.columns[df.isnull().any()]

Index([], dtype='object')

In [4]:
X = df.drop(columns='quality')
y = df.quality

In [14]:
# Retrieve numeric features from columns
numeric_features = X._get_numeric_data().columns

# Retrieve categorical features from columns
categorical_features = list(set(X.columns) - set(numeric_features))

In [15]:
print(numeric_features, categorical_features)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object') []


In [16]:
# StandardScale the dataset and convert it back to dataframe type
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
scaled_X = sc.fit_transform(X)
X = pd.DataFrame(scaled_X, index=X.index, columns=X.columns)

### Split into train and test sets

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

### Use XGBRegressor to predict the quality of wine

In [18]:
# Import XGBGeressor and GridSearchCV libraries
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# parameters = {'objective':['reg:linear'],
#               'learning_rate': [.01, 0.05, 0.1, 0.2], #so called `eta` value
#               'gamma': [0.5, 1, 1.5, 2, 5],
#               'subsample': [0.6, 0.8, 1.0],
#               'min_child_weight': [1, 5, 10],
#               'silent': [1],
#               'n_estimators': [500, 600, 700],
#                 'colsample_bytree': [0.6, 0.8, 1.0],
#                 'max_depth': [3, 4, 5],
#                 'reg_alpha': [1.1, 1.2, 1.3],
#                 'reg_lambda': [1.1, 1.2, 1.3],
#                 'subsample': [0.7, 0.8, 0.9]
#              }

parameters = {'colsample_bytree': [1.0], 'gamma': [0.5], 'learning_rate': [0.2], 'max_depth': [5], 
 'min_child_weight': [10], 'n_estimators': [700], 'objective': ['reg:linear'], 'reg_alpha': [1.3], 
 'reg_lambda': [1.2], 'silent': [1], 'subsample': [0.9]}

grid_search = GridSearchCV(XGBRegressor(),
                        parameters,
                        cv = 2,
                        n_jobs = 2,
                        verbose=True)

# Try fitting training data sets with all parameters
grid_search.fit(X_train,y_train)

# Print the best parameters
print(grid_search.best_params_)

#Fit the training tests using the best parameters
best_grid = XGBRegressor(**grid_search.best_params_)
best_grid.fit(X_train,y_train)

# Print the accuracy of prediction
predictions = best_grid.predict(X_test)

# Since quality is integer, the float value will be rounded
for i in range(len(predictions)):
    predictions[i] = round(predictions[i])

# Output csv file
pd.DataFrame(predictions).to_csv('XGB.csv')

# Print the mean square error of the predicted and the real charges values
mse = mean_squared_error(predictions, y_test)
print(mse)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s finished


{'colsample_bytree': 1.0, 'gamma': 0.5, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 10, 'n_estimators': 700, 'objective': 'reg:linear', 'reg_alpha': 1.3, 'reg_lambda': 1.2, 'silent': 1, 'subsample': 0.9}
0.425


### Use RF + GridSearch to predict y

In [19]:
from sklearn.ensemble import RandomForestRegressor

# Create the parameter grid based on the results of random search 
parameters = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = parameters, 
                          cv = 3, n_jobs = -1, verbose = 2)

# Try fitting training data sets with all parameters
grid_search.fit(X_train,y_train)

# Print the best parameters
print(grid_search.best_params_)

#Fit the training tests using the best parameters
best_grid = RandomForestRegressor(**grid_search.best_params_)
best_grid.fit(X_train,y_train)

# Get the predicted y
predictions = best_grid.predict(X_test)

# Since quality is integer, the float value will be rounded
for i in range(len(predictions)):
    predictions[i] = round(predictions[i])

# Output csv file
pd.DataFrame(predictions).to_csv('RF.csv')

# Print the mean square error of the predicted and the real charges values
mse = mean_squared_error(predictions, y_test)
print(mse)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.9min finished


{'bootstrap': True, 'max_depth': 80, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 1000}
0.44375
