In [32]:
wine_reviews.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,title,variety,winery,year,price_log
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87.0,19.0,Sicily & Sardinia,Etna,Unknown,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013,2.944439
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87.0,15.0,Douro,Unknown,Unknown,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,2.70805
2,US,"Tart and snappy, the flavors of lime flesh and...",Unknown,87.0,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,2.639057
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87.0,13.0,Michigan,Lake Michigan Shore,Unknown,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,2.564949
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87.0,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,4.174387


In [172]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(transform_output="pandas")

## Simple Mean

In [34]:
wr_work_1 = wine_reviews[['country','price','points']]

In [54]:
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(wr_work_1[['country','price']], wr_work_1.points, \
                                                    test_size = 0.25, shuffle = True, random_state = 78)

In [59]:
y_tranformer = MinMaxScaler()
y_train_1 = y_tranformer.fit_transform(pd.DataFrame(y_train_1))
y_test_1 = y_tranformer.transform(pd.DataFrame(y_test_1))

In [61]:
x_train_1['mean_points'] = y_train_1.points.mean()
x_train_1['country_mean_points'] = x_train_1.join(y_train_1).groupby('country')['points'].transform('mean')
x_train_1.query('country == "Spain"').head()

Unnamed: 0,country,price,mean_points,country_mean_points
115022,Spain,12.0,0.422609,0.364758
109530,Spain,30.0,0.422609,0.364758
35307,Spain,10.0,0.422609,0.364758
44983,Spain,8.0,0.422609,0.364758
106790,Spain,40.0,0.422609,0.364758


In [64]:
country_mean = pd.DataFrame(x_train_1[['country', 'country_mean_points']].drop_duplicates())

x_test_1['mean_points'] = y_train_1.points.mean()
x_test_1 = x_test_1.merge(country_mean, on = 'country', how = 'left')
x_test_1.fillna(value = y_train_1.points.mean(), inplace = True)

In [41]:
results_df = pd.DataFrame(columns = ['method', 'variables', 'parameters', 'train_MSE', 'test_MSE'])

In [70]:
train_MSE_1 = mean_squared_error(y_train_1.points, x_train_1.mean_points)
train_MSE_1_country = mean_squared_error(y_train_1.points, x_train_1.country_mean_points)
test_MSE_1 = mean_squared_error(y_test_1.points, x_test_1.mean_points)
test_MSE_1_country = mean_squared_error(y_test_1.points, x_test_1.country_mean_points)
results_df.loc[len(results_df.index)] = (['simple mean', ['mean_points'], [], train_MSE_1, test_MSE_1])
results_df.loc[len(results_df.index)] = (['simple mean', ['country mean_points'], [], train_MSE_1_country, test_MSE_1_country])
results_df                                

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],[],0.023013,0.023359
1,simple mean,[country mean_points],[],0.021859,0.022136


## KNN

In [72]:
wr_work_2 = wine_reviews[['country','price_log','province','region_1','variety','winery','year','points']]
wr_work_2.head()

Unnamed: 0,country,price_log,province,region_1,variety,winery,year,points
0,Italy,2.944439,Sicily & Sardinia,Etna,White Blend,Nicosia,2013,87.0
1,Portugal,2.70805,Douro,Unknown,Portuguese Red,Quinta dos Avidagos,2011,87.0
2,US,2.639057,Oregon,Willamette Valley,Pinot Gris,Rainstorm,2013,87.0
3,US,2.564949,Michigan,Lake Michigan Shore,Riesling,St. Julian,2013,87.0
4,US,4.174387,Oregon,Willamette Valley,Pinot Noir,Sweet Cheeks,2012,87.0


In [77]:
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(wr_work_2.loc[:, wr_work_2.columns != 'points'], wr_work_2.points, \
                                                    test_size = 0.25, shuffle = True, random_state = 78)
y_train_2 = y_tranformer.fit_transform(pd.DataFrame(y_train_2))
y_test_2 = y_tranformer.transform(pd.DataFrame(y_test_2))

In [108]:
categorical_cols = ['country', 'province', 'region_1', 'variety', 'winery', 'year']
numerical_cols = ['price_log']

x_tranformer = ColumnTransformer(
    transformers=[
        ('numerical', MinMaxScaler(), numerical_cols),
        ('categorical', OneHotEncoder(handle_unknown='ignore', sparse_output=False, min_frequency = 100), categorical_cols),
], remainder='drop')

KNN_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('classifier', KNeighborsRegressor(n_jobs = -1))
])

In [109]:
x_tranformer

In [110]:
KNN_pipe

In [111]:
KNN_grid_search = GridSearchCV(KNN_pipe, param_grid = [{'classifier__n_neighbors': list(range(3, 32, 2))}], cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
KNN_grid_search

In [116]:
x_tranformer.fit_transform(x_train_2).shape

(97478, 371)

In [114]:
KNN_grid_search.fit(x_train_2 ,y_train_2)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END ........classifier__n_neighbors=3;, score=-0.015 total time=  14.1s
[CV 2/5] END ........classifier__n_neighbors=3;, score=-0.015 total time=  14.8s
[CV 3/5] END ........classifier__n_neighbors=3;, score=-0.015 total time=  16.0s
[CV 4/5] END ........classifier__n_neighbors=3;, score=-0.015 total time=  16.5s
[CV 5/5] END ........classifier__n_neighbors=3;, score=-0.015 total time=  15.5s
[CV 1/5] END ........classifier__n_neighbors=5;, score=-0.014 total time=  15.7s
[CV 2/5] END ........classifier__n_neighbors=5;, score=-0.014 total time=  16.4s
[CV 3/5] END ........classifier__n_neighbors=5;, score=-0.014 total time=  16.0s
[CV 4/5] END ........classifier__n_neighbors=5;, score=-0.014 total time=  15.7s
[CV 5/5] END ........classifier__n_neighbors=5;, score=-0.014 total time=  14.8s
[CV 1/5] END ........classifier__n_neighbors=7;, score=-0.014 total time=  15.0s
[CV 2/5] END ........classifier__n_neighbors=7;,

In [117]:
KNN_results_df = pd.DataFrame(KNN_grid_search.cv_results_)
KNN_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,{'classifier__n_neighbors': 3},-0.015077,0.000103,15
1,{'classifier__n_neighbors': 5},-0.013974,0.00011,14
2,{'classifier__n_neighbors': 7},-0.013584,0.000121,13
3,{'classifier__n_neighbors': 9},-0.013381,0.000118,12
4,{'classifier__n_neighbors': 11},-0.013271,0.000107,11
5,{'classifier__n_neighbors': 13},-0.01322,0.000116,10
6,{'classifier__n_neighbors': 15},-0.013201,0.000125,9
7,{'classifier__n_neighbors': 17},-0.013176,0.000134,8
8,{'classifier__n_neighbors': 19},-0.01315,0.00013,1
9,{'classifier__n_neighbors': 21},-0.013156,0.000137,4


In [120]:
KNN_params = KNN_results_df.loc[KNN_results_df.rank_test_score == 1, 'params'].values[0]
KNN_train_MSE = -KNN_results_df.loc[KNN_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
KNN_test_MSE = mean_squared_error(y_test_2.values, KNN_grid_search.predict(x_test_2))

results_df.loc[len(results_df.index)] = (['KNN', numerical_cols + categorical_cols, KNN_params, KNN_train_MSE, KNN_test_MSE])

In [121]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],[],0.023013,0.023359
1,simple mean,[country mean_points],[],0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, varie...",{'classifier__n_neighbors': 19},0.01315,0.013031


## Linear Regression

In [124]:
LR_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('classifier', LinearRegression(n_jobs = -1))
])
LR_pipe

In [133]:
LR_grid_search = GridSearchCV(LR_pipe, param_grid = [{'classifier__n_jobs': [-1]}], cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
LR_grid_search

In [134]:
LR_grid_search.fit(x_train_2 ,y_train_2)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END classifier__n_jobs=-1;, score=-1413039919103654144.000 total time=   2.7s
[CV 2/5] END classifier__n_jobs=-1;, score=-51558603937249353728.000 total time=   2.6s
[CV 3/5] END classifier__n_jobs=-1;, score=-148145972307456163840.000 total time=   2.6s
[CV 4/5] END classifier__n_jobs=-1;, score=-5430723145058345984.000 total time=   2.5s
[CV 5/5] END classifier__n_jobs=-1;, score=-434478472621165969408.000 total time=   2.4s


In [151]:
Lasso_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('classifier', Lasso())
])
Lasso_pipe

In [159]:
Lasso_grid_search = GridSearchCV(Lasso_pipe, param_grid = [{'classifier__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}], cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
Lasso_grid_search

In [160]:
Lasso_grid_search.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tranformer',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         MinMaxScaler(),
                                                                         ['price_log']),
                                                                        ('categorical',
                                                                         OneHotEncoder(handle_unknown='ignore',
                                                                                       min_frequency=100,
                                                                                       sparse_output=False),
                                                                         ['country',
                                                                          'province',
                                   

In [161]:
Lasso_grid_search.fit(x_train_2 ,y_train_2)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=  31.2s
[CV 2/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=  31.2s
[CV 3/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=  25.9s
[CV 4/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=  36.2s
[CV 5/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=  36.7s
[CV 1/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   8.6s
[CV 2/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   8.7s
[CV 3/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   6.7s
[CV 4/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   9.4s
[CV 5/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   8.9s
[CV 1/5] END ..........classifier__alpha=0.001;, score=-0.014 total time=   1.3s
[CV 2/5] END ..........classifier__alpha=0.001;, 

In [162]:
Lasso_results_df = pd.DataFrame(Lasso_grid_search.cv_results_)
Lasso_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,{'classifier__alpha': 1e-05},-0.012823,7.1e-05,1
1,{'classifier__alpha': 0.0001},-0.013067,6.5e-05,2
2,{'classifier__alpha': 0.001},-0.01406,8.8e-05,3
3,{'classifier__alpha': 0.01},-0.023013,0.000174,4
4,{'classifier__alpha': 0.1},-0.023013,0.000174,4
5,{'classifier__alpha': 1},-0.023013,0.000174,4
6,{'classifier__alpha': 10},-0.023013,0.000174,4
7,{'classifier__alpha': 100},-0.023013,0.000174,4


In [163]:
Lasso_params = Lasso_results_df.loc[Lasso_results_df.rank_test_score == 1, 'params'].values[0]
Lasso_train_MSE = -Lasso_results_df.loc[Lasso_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
Lasso_test_MSE = mean_squared_error(y_test_2.values, Lasso_grid_search.predict(x_test_2))

results_df.loc[len(results_df.index)] = (['LR Lasso (L1)', numerical_cols + categorical_cols, Lasso_params, Lasso_train_MSE, Lasso_test_MSE])

In [164]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],[],0.023013,0.023359
1,simple mean,[country mean_points],[],0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, varie...",{'classifier__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, varie...",{'classifier__alpha': 1e-05},0.012823,0.012928


In [165]:
Ridge_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('classifier', Ridge())
])
Ridge_pipe

In [166]:
Ridge_grid_search = GridSearchCV(Ridge_pipe, param_grid = [{'classifier__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}], cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
Ridge_grid_search

In [167]:
Ridge_grid_search.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tranformer',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         MinMaxScaler(),
                                                                         ['price_log']),
                                                                        ('categorical',
                                                                         OneHotEncoder(handle_unknown='ignore',
                                                                                       min_frequency=100,
                                                                                       sparse_output=False),
                                                                         ['country',
                                                                          'province',
                                   

In [168]:
Ridge_grid_search.fit(x_train_2 ,y_train_2)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=   1.4s
[CV 2/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=   1.4s
[CV 3/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=   1.3s
[CV 4/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=   1.3s
[CV 5/5] END ..........classifier__alpha=1e-05;, score=-0.013 total time=   1.2s
[CV 1/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   1.3s
[CV 2/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   1.2s
[CV 3/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   1.2s
[CV 4/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   1.6s
[CV 5/5] END .........classifier__alpha=0.0001;, score=-0.013 total time=   1.4s
[CV 1/5] END ..........classifier__alpha=0.001;, score=-0.013 total time=   1.4s
[CV 2/5] END ..........classifier__alpha=0.001;, 

In [169]:
Ridge_results_df = pd.DataFrame(Ridge_grid_search.cv_results_)
Ridge_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,{'classifier__alpha': 1e-05},-0.012853,7.2e-05,7
1,{'classifier__alpha': 0.0001},-0.012853,7.2e-05,6
2,{'classifier__alpha': 0.001},-0.012853,7.2e-05,5
3,{'classifier__alpha': 0.01},-0.012852,7.2e-05,4
4,{'classifier__alpha': 0.1},-0.012852,7.2e-05,2
5,{'classifier__alpha': 1},-0.012851,7.1e-05,1
6,{'classifier__alpha': 10},-0.012852,7.1e-05,3
7,{'classifier__alpha': 100},-0.013043,7.1e-05,8


In [170]:
Ridge_params = Ridge_results_df.loc[Ridge_results_df.rank_test_score == 1, 'params'].values[0]
Ridge_train_MSE = -Ridge_results_df.loc[Ridge_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
Ridge_test_MSE = mean_squared_error(y_test_2.values, Ridge_grid_search.predict(x_test_2))

results_df.loc[len(results_df.index)] = (['LR Ridge (L2)', numerical_cols + categorical_cols, Ridge_params, Ridge_train_MSE, Ridge_test_MSE])

In [171]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],[],0.023013,0.023359
1,simple mean,[country mean_points],[],0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, varie...",{'classifier__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, varie...",{'classifier__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, varie...",{'classifier__alpha': 1},0.012851,0.012933


## Random Forest

In [184]:
RF_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('regressor', RandomForestRegressor(n_jobs = -1,))
])
RF_pipe

In [202]:
RF_param_grid = [{
        'regressor__n_estimators': [100, 500, 1000],
        'regressor__max_depth': [2, 4, 8, 16, 32, 64], 
    }]

RF_grid_search = GridSearchCV(RF_pipe, param_grid = RF_param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
RF_grid_search

In [203]:
RF_grid_search.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tranformer',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         MinMaxScaler(),
                                                                         ['price_log']),
                                                                        ('categorical',
                                                                         OneHotEncoder(handle_unknown='ignore',
                                                                                       min_frequency=100,
                                                                                       sparse_output=False),
                                                                         ['country',
                                                                          'province',
                                   

In [204]:
RF_grid_search.fit(x_train_2 ,y_train_2.points)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=  12.9s
[CV 2/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=   8.4s
[CV 3/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=   9.0s
[CV 4/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=   8.2s
[CV 5/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=   8.4s
[CV 1/5] END regressor__max_depth=2, regressor__n_estimators=500;, score=-0.015 total time=  40.1s
[CV 2/5] END regressor__max_depth=2, regressor__n_estimators=500;, score=-0.015 total time=  38.6s
[CV 3/5] END regressor__max_depth=2, regressor__n_estimators=500;, score=-0.015 total time=  41.4s
[CV 4/5] END regressor__max_depth=2, regressor__n_estimators=500;, score=-0.015 total time=  39.7s
[CV 5/5] END regressor__max_depth=2, regressor__

In [209]:
RF_results_df = pd.DataFrame(RF_grid_search.cv_results_)
RF_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,"{'regressor__max_depth': 2, 'regressor__n_esti...",-0.015238,0.000118,16
1,"{'regressor__max_depth': 2, 'regressor__n_esti...",-0.015243,0.000109,18
2,"{'regressor__max_depth': 2, 'regressor__n_esti...",-0.01524,0.00011,17
3,"{'regressor__max_depth': 4, 'regressor__n_esti...",-0.014332,8.4e-05,15
4,"{'regressor__max_depth': 4, 'regressor__n_esti...",-0.014331,8e-05,14
5,"{'regressor__max_depth': 4, 'regressor__n_esti...",-0.014329,8.1e-05,13
6,"{'regressor__max_depth': 8, 'regressor__n_esti...",-0.013468,7.9e-05,12
7,"{'regressor__max_depth': 8, 'regressor__n_esti...",-0.013462,8.6e-05,10
8,"{'regressor__max_depth': 8, 'regressor__n_esti...",-0.013464,8.7e-05,11
9,"{'regressor__max_depth': 16, 'regressor__n_est...",-0.012406,8.9e-05,6


In [210]:
RF_params = RF_results_df.loc[RF_results_df.rank_test_score == 1, 'params'].values[0]
RF_train_MSE = -RF_results_df.loc[RF_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
RF_test_MSE = mean_squared_error(y_test_2.values, RF_grid_search.predict(x_test_2))

results_df.loc[len(results_df.index)] = (['Random Forest', numerical_cols + categorical_cols, RF_params, RF_train_MSE, RF_test_MSE])

In [211]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],[],0.023013,0.023359
1,simple mean,[country mean_points],[],0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, varie...",{'classifier__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, varie...",{'classifier__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, varie...",{'classifier__alpha': 1},0.012851,0.012933
5,Random Forest,"[price_log, country, province, region_1, varie...","{'regressor__max_depth': 32, 'regressor__n_est...",0.012078,0.012056
