continued from here: https://colab.research.google.com/drive/1-P3yDuArVD4YBRraXKTzqLWwyKAoe-d8

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from IPython.display import display, HTML
from sklearn.tree import DecisionTreeRegressor

# Retrieve the data

In [0]:
red_wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
red_wine_df = pd.read_csv(red_wine_url,';')

white_wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
white_wine_df = pd.read_csv(white_wine_url,';')

combined_wine_df = red_wine_df.append(white_wine_df, ignore_index=True)

DFs = {
    'red_wine': red_wine_df,
    'white_wine': white_wine_df,
    'combined_data': combined_wine_df
}

In [0]:
print(red_wine_df.shape)
print(white_wine_df.shape)
print(combined_wine_df.shape)

(1599, 12)
(4898, 12)
(6497, 12)


# Build pipeline

## Create custom transformer

In [0]:
fixed_acidity_ix, citric_acidity_ix, free_sulfur_dioxide_ix, total_sulfur_dioxide_ix = 0, 2, 5, 6

class AttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_acidity_no_citric = True, add_unbound_sulfur_dioxide = True):
        self.add_acidity_no_citric = add_acidity_no_citric
        self.add_unbound_sulfur_dioxide = add_unbound_sulfur_dioxide
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if self.add_acidity_no_citric:
            acidity_no_citric = X[:,fixed_acidity_ix] - X[:,citric_acidity_ix]
            return np.c_[X, acidity_no_citric]
        elif self.add_unbound_sulfur_dioxide:
            unbound_sulfur_dioxide = X[:,total_sulfur_dioxide_ix] - X[:,free_sulfur_dioxide_ix]
            return np.c_[X, unbound_sulfur_dioxide]
        elif self.add_acidity_no_citric and self.add_unbound_sulfur_dioxide:
            acidity_no_citric = X[:,fixed_acidity_ix] - X[:,citric_acidity_ix]
            unbound_sulfur_dioxide = X[:,total_sulfur_dioxide_ix] - X[:,free_sulfur_dioxide_ix]
            return np.c_[X, acidity_no_citric, unbound_sulfur_dioxide]
        else:
            return X

## Create pipeline

In [0]:
data_prep_pipeline = Pipeline([
    ('attr_adder', AttributesAdder()),
    ('reduce_dim', PCA()),
    ('poly_feat', PolynomialFeatures())
])

# Start GridSearch

In [0]:
all_params = {
    'linear_regression': {
        'dataset': '',
        'ml_model': LinearRegression(),
        'param_grid': [
            {'attr_adder__add_acidity_no_citric': [True, False],
            'attr_adder__add_unbound_sulfur_dioxide': [True, False],
            'reduce_dim': ['passthrough', PCA(n_components=0.90, random_state=42), PCA(n_components=0.95, random_state=42)],
            'poly_feat': ['passthrough', PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)]
            }
        ],
        'ml_scoring': 'neg_mean_squared_error'
    },
    'decision_tree': {
        'dataset': '',
        'ml_model': DecisionTreeRegressor(),
        'param_grid': [
            {'attr_adder__add_acidity_no_citric': [True, False],
            'attr_adder__add_unbound_sulfur_dioxide': [True, False],
            'reduce_dim': ['passthrough', PCA(n_components=0.90, random_state=42), PCA(n_components=0.95, random_state=42)],
            'poly_feat': ['passthrough', PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)],
            'ml_model__random_state': [42]
            }
        ],
        'ml_scoring': 'neg_mean_squared_error'
    }
}

In [0]:
def eval_on_test_set(data_prep_pipeline, ml_model, test_dataset):
    X = test_dataset.drop('quality', axis='columns')
    y = test_dataset['quality']

    X_te_prepared = data_prep_pipeline.transform(X.values)
    final_predictions = ml_model.predict(X_te_prepared)

    final_mse = mean_squared_error(y, final_predictions)
    final_rmse = np.sqrt(final_mse)

    return final_rmse

In [0]:
def build_model_and_predict(all_params, df_key, param_key):
    # separate target values from attributes
    df = all_params['dataset']
    X = df.drop('quality', axis='columns')
    y = df['quality']

    # split data into training-test set
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    # print('----------------------------------')
    # print('X_tr:\n')
    # display(HTML(X_tr.head(3).to_html()))
    # print('')

    # build pipeline
    full_pipeline = Pipeline([
        ('attr_adder', AttributesAdder()),
        ('reduce_dim', PCA()),
        ('poly_feat', PolynomialFeatures()),
        ('ml_model', all_params['ml_model'])
    ])

    # print('----------------------------------')
    # print('full_pipeline:')
    # print(full_pipeline, '\n')

    # grid search
    # print('----------------------------------')
    # print('param_grid:')
    # print(all_params['param_grid'], '\n')

    grid_search = GridSearchCV(full_pipeline, param_grid=all_params['param_grid'], cv=5,
                               scoring=all_params['ml_scoring'],
                               return_train_score=True)
    grid_search = grid_search.fit(X_tr.values, y_tr)

    best_est = grid_search.best_estimator_
    # print('----------------------------------')
    # print('best_estimator_:')
    # print(best_est, '\n')

    best_params = grid_search.best_params_

    # final pipeline and model
    final_pipeline = Pipeline(best_est.steps[:-1])
    final_model = best_est.steps[-1][1]

    # evaluate on test set
    X_te_prepared = final_pipeline.transform(X_te.values)
    final_predictions = final_model.predict(X_te_prepared)

    final_mse = mean_squared_error(y_te, final_predictions)
    final_rmse = np.sqrt(final_mse)

    results = {
        'train_dataset': df_key,
        'ml_model': param_key,
        'test_dataset': 'X_te',
        'final_rmse': final_rmse,
        'best_estimator': best_est,
        'best_params': best_params
    }
    all_results = [results]

    # evaluate other test datasets
    for test_df_key in DFs:
        if df_key == 'combined_data': # do not test on either red/wine because combined_data has already been trained on part of the data
            continue
        
        if test_df_key != df_key:
            final_rmse = eval_on_test_set(final_pipeline, final_model, DFs[test_df_key])
            results = {
                'train_dataset': df_key,
                'ml_model': param_key,
                'test_dataset': test_df_key,
                'final_rmse': final_rmse,
                'best_estimator': best_est,
                'best_params': best_params
            }
            all_results.append(results)
        else:
            pass

    # print
    # for result in all_results:
    #     print('train_dataset:', result['train_dataset'])
    #     print('ml_model:', result['ml_model'])
    #     print('test_dataset:', result['test_dataset'])
    #     print('final_rmse:', result['final_rmse'])
    #     print('best_estimator:', result['best_estimator'])
    #     print('best_params:', result['best_params'])
    #     print('\n----------------------------------------------\n')

    return all_results

In [0]:
all_results_list = []

for df_key in DFs:
    for param_key in all_params:
        all_params[param_key]['dataset'] = DFs[df_key]

        print(df_key, '(', param_key, ')', ':')
        all_results_list.append(build_model_and_predict(all_params[param_key], df_key=df_key, param_key=param_key))
        print('\n===========================================================================================================')
        print('===========================================================================================================\n')

red_wine ( linear_regression ) :






red_wine ( decision_tree ) :


white_wine ( linear_regression ) :






white_wine ( decision_tree ) :


combined_data ( linear_regression ) :






combined_data ( decision_tree ) :




In [0]:
all_results_df = pd.DataFrame(columns=['best_estimator', 'final_rmse', 'ml_model', 'test_dataset', 'train_dataset'])

for i in range(0, len(all_results_list)):
    for j in range(0, len(all_results_list[i])):
        all_results_df = all_results_df.append(all_results_list[i][j], ignore_index=True)

for key in all_results_list[0][0]['best_params']:
    all_results_df[key] = all_results_df.apply(lambda x: x['best_params'][key], axis='columns')

all_results_df[['final_rmse','ml_model','test_dataset','train_dataset','attr_adder__add_acidity_no_citric','attr_adder__add_unbound_sulfur_dioxide','poly_feat','reduce_dim']].sort_values('final_rmse',ascending=True)

Unnamed: 0,final_rmse,ml_model,test_dataset,train_dataset,attr_adder__add_acidity_no_citric,attr_adder__add_unbound_sulfur_dioxide,poly_feat,reduce_dim
0,0.637464,linear_regression,X_te,red_wine,False,True,passthrough,passthrough
12,0.707064,linear_regression,X_te,combined_data,False,False,"PolynomialFeatures(degree=2, include_bias=True...",passthrough
6,0.73271,linear_regression,X_te,white_wine,True,True,"PolynomialFeatures(degree=2, include_bias=True...",passthrough
2,0.765868,linear_regression,combined_data,red_wine,False,True,passthrough,passthrough
3,0.772577,decision_tree,X_te,red_wine,False,True,passthrough,passthrough
1,0.801028,linear_regression,white_wine,red_wine,False,True,passthrough,passthrough
13,0.827415,decision_tree,X_te,combined_data,False,False,"PolynomialFeatures(degree=2, include_bias=True...",passthrough
11,0.845089,decision_tree,combined_data,white_wine,True,True,"PolynomialFeatures(degree=2, include_bias=True...",passthrough
9,0.846963,decision_tree,X_te,white_wine,True,True,"PolynomialFeatures(degree=2, include_bias=True...",passthrough
5,0.977191,decision_tree,combined_data,red_wine,False,True,passthrough,passthrough
