In [5]:
import pandas as pd
import numpy as np
import scipy

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(transform_output="pandas")

TypeError: set_config() got an unexpected keyword argument 'transform_output'

In [7]:
wine_reviews = pd.read_csv("wine_reviews.csv", index_col = 0)

In [30]:
wine_reviews.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,title,variety,winery,year,price_log
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87.0,19.0,Sicily & Sardinia,Etna,Unknown,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013,2.944439
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87.0,15.0,Douro,Unknown,Unknown,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,2.70805
2,US,"Tart and snappy, the flavors of lime flesh and...",Unknown,87.0,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,2.639057
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87.0,13.0,Michigan,Lake Michigan Shore,Unknown,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,2.564949
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87.0,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,4.174387


## Simple Mean

In [32]:
wr_work_1 = wine_reviews[['country','price','points']]

In [33]:
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(wr_work_1[['country','price']], wr_work_1.points, \
                                                    test_size = 0.25, shuffle = True, random_state = 78)

In [34]:
y_tranformer = MinMaxScaler()
y_train_1 = y_tranformer.fit_transform(pd.DataFrame(y_train_1))
y_test_1 = y_tranformer.transform(pd.DataFrame(y_test_1))

In [35]:
x_train_1['mean_points'] = y_train_1.points.mean()
x_train_1['country_mean_points'] = x_train_1.join(y_train_1).groupby('country')['points'].transform('mean')
x_train_1.query('country == "Spain"').head()

Unnamed: 0,country,price,mean_points,country_mean_points
115022,Spain,12.0,0.422609,0.364758
109530,Spain,30.0,0.422609,0.364758
35307,Spain,10.0,0.422609,0.364758
44983,Spain,8.0,0.422609,0.364758
106790,Spain,40.0,0.422609,0.364758


In [36]:
country_mean = pd.DataFrame(x_train_1[['country', 'country_mean_points']].drop_duplicates())

x_test_1['mean_points'] = y_train_1.points.mean()
x_test_1 = x_test_1.merge(country_mean, on = 'country', how = 'left')
x_test_1.fillna(value = y_train_1.points.mean(), inplace = True)

In [37]:
results_df = pd.DataFrame(columns = ['method', 'variables', 'parameters', 'train_MSE', 'test_MSE'])

In [38]:
train_MSE_1 = mean_squared_error(y_train_1.points, x_train_1.mean_points)
train_MSE_1_country = mean_squared_error(y_train_1.points, x_train_1.country_mean_points)
test_MSE_1 = mean_squared_error(y_test_1.points, x_test_1.mean_points)
test_MSE_1_country = mean_squared_error(y_test_1.points, x_test_1.country_mean_points)
results_df.loc[len(results_df.index)] = (['simple mean', ['mean_points'], {}, train_MSE_1, test_MSE_1])
results_df.loc[len(results_df.index)] = (['simple mean', ['country mean_points'], {}, train_MSE_1_country, test_MSE_1_country])
results_df                                

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136


## KNN

In [39]:
wr_work_2 = wine_reviews[['country','price_log','province','region_1','variety','winery','year','points']]
wr_work_2.head()

Unnamed: 0,country,price_log,province,region_1,variety,winery,year,points
0,Italy,2.944439,Sicily & Sardinia,Etna,White Blend,Nicosia,2013,87.0
1,Portugal,2.70805,Douro,Unknown,Portuguese Red,Quinta dos Avidagos,2011,87.0
2,US,2.639057,Oregon,Willamette Valley,Pinot Gris,Rainstorm,2013,87.0
3,US,2.564949,Michigan,Lake Michigan Shore,Riesling,St. Julian,2013,87.0
4,US,4.174387,Oregon,Willamette Valley,Pinot Noir,Sweet Cheeks,2012,87.0


In [40]:
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(wr_work_2.loc[:, wr_work_2.columns != 'points'], wr_work_2.points, \
                                                    test_size = 0.25, shuffle = True, random_state = 78)
y_train_2 = y_tranformer.fit_transform(pd.DataFrame(y_train_2))
y_test_2 = y_tranformer.transform(pd.DataFrame(y_test_2))

In [46]:
categorical_cols = ['country', 'province', 'region_1', 'variety', 'winery', 'year']
numerical_cols = ['price_log']

x_tranformer = ColumnTransformer(
    transformers=[
        ('numerical', MinMaxScaler(), numerical_cols),
        ('categorical', OneHotEncoder(handle_unknown='ignore', sparse_output=False, min_frequency = 100), categorical_cols),
], remainder='drop')

KNN_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('regressor', KNeighborsRegressor(n_jobs = -1))
])

In [47]:
KNN_pipe

In [48]:
KNN_grid_search = GridSearchCV(KNN_pipe, param_grid = [{'regressor__n_neighbors': list(range(3, 32, 2))}], cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
KNN_grid_search

In [49]:
x_tranformer.fit_transform(x_train_2).shape

(97478, 371)

In [50]:
KNN_grid_search.fit(x_train_2 ,y_train_2)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END .........regressor__n_neighbors=3;, score=-0.015 total time=  14.8s
[CV 2/5] END .........regressor__n_neighbors=3;, score=-0.015 total time=  14.5s
[CV 3/5] END .........regressor__n_neighbors=3;, score=-0.015 total time=  14.8s
[CV 4/5] END .........regressor__n_neighbors=3;, score=-0.015 total time=  14.6s
[CV 5/5] END .........regressor__n_neighbors=3;, score=-0.015 total time=  14.4s
[CV 1/5] END .........regressor__n_neighbors=5;, score=-0.014 total time=  16.5s
[CV 2/5] END .........regressor__n_neighbors=5;, score=-0.014 total time=  16.5s
[CV 3/5] END .........regressor__n_neighbors=5;, score=-0.014 total time=  16.1s
[CV 4/5] END .........regressor__n_neighbors=5;, score=-0.014 total time=  16.3s
[CV 5/5] END .........regressor__n_neighbors=5;, score=-0.014 total time=  15.9s
[CV 1/5] END .........regressor__n_neighbors=7;, score=-0.014 total time=  15.2s
[CV 2/5] END .........regressor__n_neighbors=7;,

In [51]:
KNN_results_df = pd.DataFrame(KNN_grid_search.cv_results_)
KNN_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,{'regressor__n_neighbors': 3},-0.015077,0.000103,15
1,{'regressor__n_neighbors': 5},-0.013974,0.00011,14
2,{'regressor__n_neighbors': 7},-0.013584,0.000121,13
3,{'regressor__n_neighbors': 9},-0.013381,0.000118,12
4,{'regressor__n_neighbors': 11},-0.013271,0.000107,11
5,{'regressor__n_neighbors': 13},-0.01322,0.000116,10
6,{'regressor__n_neighbors': 15},-0.013201,0.000125,9
7,{'regressor__n_neighbors': 17},-0.013176,0.000134,8
8,{'regressor__n_neighbors': 19},-0.01315,0.00013,1
9,{'regressor__n_neighbors': 21},-0.013156,0.000137,4


In [52]:
KNN_params = KNN_results_df.loc[KNN_results_df.rank_test_score == 1, 'params'].values[0]
KNN_train_MSE = -KNN_results_df.loc[KNN_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
KNN_test_MSE = mean_squared_error(y_test_2.values, KNN_grid_search.predict(x_test_2))

results_df.loc[len(results_df.index)] = (['KNN', numerical_cols + categorical_cols, KNN_params, KNN_train_MSE, KNN_test_MSE])

In [53]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, varie...",{'regressor__n_neighbors': 19},0.01315,0.013031


## Linear Regression

In [54]:
LR_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('regressor', LinearRegression(n_jobs = -1))
])
LR_pipe

In [55]:
LR_grid_search = GridSearchCV(LR_pipe, param_grid = [{'regressor__n_jobs': [1000, 10000, -1]}], cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
LR_grid_search

In [56]:
LR_grid_search.fit(x_train_2 ,y_train_2)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END regressor__n_jobs=1000;, score=-1413039919103654144.000 total time=   3.8s
[CV 2/5] END regressor__n_jobs=1000;, score=-51558603937249353728.000 total time=   2.9s
[CV 3/5] END regressor__n_jobs=1000;, score=-148145972307456163840.000 total time=   3.0s
[CV 4/5] END regressor__n_jobs=1000;, score=-5430723145058345984.000 total time=   2.8s
[CV 5/5] END regressor__n_jobs=1000;, score=-434478472621165969408.000 total time=   3.0s
[CV 1/5] END regressor__n_jobs=10000;, score=-1413039919103654144.000 total time=   2.8s
[CV 2/5] END regressor__n_jobs=10000;, score=-51558603937249353728.000 total time=   2.7s
[CV 3/5] END regressor__n_jobs=10000;, score=-148145972307456163840.000 total time=   2.6s
[CV 4/5] END regressor__n_jobs=10000;, score=-5430723145058345984.000 total time=   2.7s
[CV 5/5] END regressor__n_jobs=10000;, score=-434478472621165969408.000 total time=   3.1s
[CV 1/5] END regressor__n_jobs=-1;, score=-14

In [57]:
Lasso_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('regressor', Lasso())
])
Lasso_pipe

In [58]:
Lasso_grid_search = GridSearchCV(Lasso_pipe, param_grid = [{'regressor__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}], cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
Lasso_grid_search

In [59]:
Lasso_grid_search.get_params()

<bound method BaseEstimator.get_params of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tranformer',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         MinMaxScaler(),
                                                                         ['price_log']),
                                                                        ('categorical',
                                                                         OneHotEncoder(handle_unknown='ignore',
                                                                                       min_frequency=100,
                                                                                       sparse_output=False),
                                                                         ['country',
                                                                          'province',
                                   

In [60]:
Lasso_grid_search.fit(x_train_2 ,y_train_2)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=  42.6s
[CV 2/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=  40.8s
[CV 3/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=  32.5s
[CV 4/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=  45.6s
[CV 5/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=  49.9s
[CV 1/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=  13.0s
[CV 2/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=  12.6s
[CV 3/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=   8.5s
[CV 4/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=  12.1s
[CV 5/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=  11.6s
[CV 1/5] END ...........regressor__alpha=0.001;, score=-0.014 total time=   1.7s
[CV 2/5] END ...........regressor__alpha=0.001;, 

In [61]:
Lasso_results_df = pd.DataFrame(Lasso_grid_search.cv_results_)
Lasso_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,{'regressor__alpha': 1e-05},-0.012823,7.1e-05,1
1,{'regressor__alpha': 0.0001},-0.013067,6.5e-05,2
2,{'regressor__alpha': 0.001},-0.01406,8.8e-05,3
3,{'regressor__alpha': 0.01},-0.023013,0.000174,4
4,{'regressor__alpha': 0.1},-0.023013,0.000174,4
5,{'regressor__alpha': 1},-0.023013,0.000174,4
6,{'regressor__alpha': 10},-0.023013,0.000174,4
7,{'regressor__alpha': 100},-0.023013,0.000174,4


In [62]:
Lasso_params = Lasso_results_df.loc[Lasso_results_df.rank_test_score == 1, 'params'].values[0]
Lasso_train_MSE = -Lasso_results_df.loc[Lasso_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
Lasso_test_MSE = mean_squared_error(y_test_2.values, Lasso_grid_search.predict(x_test_2))

results_df.loc[len(results_df.index)] = (['LR Lasso (L1)', numerical_cols + categorical_cols, Lasso_params, Lasso_train_MSE, Lasso_test_MSE])

In [63]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, varie...",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, varie...",{'regressor__alpha': 1e-05},0.012823,0.012928


In [64]:
Ridge_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('regressor', Ridge())
])
Ridge_pipe

In [65]:
Ridge_grid_search = GridSearchCV(Ridge_pipe, param_grid = [{'regressor__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}], cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
Ridge_grid_search

In [66]:
Ridge_grid_search.get_params()

<bound method BaseEstimator.get_params of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tranformer',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         MinMaxScaler(),
                                                                         ['price_log']),
                                                                        ('categorical',
                                                                         OneHotEncoder(handle_unknown='ignore',
                                                                                       min_frequency=100,
                                                                                       sparse_output=False),
                                                                         ['country',
                                                                          'province',
                                   

In [67]:
Ridge_grid_search.fit(x_train_2 ,y_train_2)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=   1.6s
[CV 2/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=   1.6s
[CV 3/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=   1.4s
[CV 4/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=   1.4s
[CV 5/5] END ...........regressor__alpha=1e-05;, score=-0.013 total time=   1.6s
[CV 1/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=   1.9s
[CV 2/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=   1.6s
[CV 3/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=   1.6s
[CV 4/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=   1.5s
[CV 5/5] END ..........regressor__alpha=0.0001;, score=-0.013 total time=   1.4s
[CV 1/5] END ...........regressor__alpha=0.001;, score=-0.013 total time=   1.5s
[CV 2/5] END ...........regressor__alpha=0.001;, 

In [68]:
Ridge_results_df = pd.DataFrame(Ridge_grid_search.cv_results_)
Ridge_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,{'regressor__alpha': 1e-05},-0.012853,7.2e-05,7
1,{'regressor__alpha': 0.0001},-0.012853,7.2e-05,6
2,{'regressor__alpha': 0.001},-0.012853,7.2e-05,5
3,{'regressor__alpha': 0.01},-0.012852,7.2e-05,4
4,{'regressor__alpha': 0.1},-0.012852,7.2e-05,2
5,{'regressor__alpha': 1},-0.012851,7.1e-05,1
6,{'regressor__alpha': 10},-0.012852,7.1e-05,3
7,{'regressor__alpha': 100},-0.013043,7.1e-05,8


In [69]:
Ridge_params = Ridge_results_df.loc[Ridge_results_df.rank_test_score == 1, 'params'].values[0]
Ridge_train_MSE = -Ridge_results_df.loc[Ridge_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
Ridge_test_MSE = mean_squared_error(y_test_2.values, Ridge_grid_search.predict(x_test_2))

results_df.loc[len(results_df.index)] = (['LR Ridge (L2)', numerical_cols + categorical_cols, Ridge_params, Ridge_train_MSE, Ridge_test_MSE])

In [70]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, varie...",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, varie...",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, varie...",{'regressor__alpha': 1},0.012851,0.012933


## Random Forest

In [71]:
RF_pipe = Pipeline([
    ('tranformer', x_tranformer),
    ('regressor', RandomForestRegressor(n_jobs = -1,))
])
RF_pipe

In [72]:
RF_param_grid = [{
        'regressor__n_estimators': [100, 500, 1000],
        'regressor__max_depth': [2, 4, 8, 16, 32, 64], 
    }]

RF_grid_search = GridSearchCV(RF_pipe, param_grid = RF_param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
RF_grid_search

In [73]:
RF_grid_search.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('tranformer',
   ColumnTransformer(transformers=[('numerical', MinMaxScaler(), ['price_log']),
                                   ('categorical',
                                    OneHotEncoder(handle_unknown='ignore',
                                                  min_frequency=100,
                                                  sparse_output=False),
                                    ['country', 'province', 'region_1', 'variety',
                                     'winery', 'year'])])),
  ('regressor', RandomForestRegressor(n_jobs=-1))],
 'estimator__verbose': False,
 'estimator__tranformer': ColumnTransformer(transformers=[('numerical', MinMaxScaler(), ['price_log']),
                                 ('categorical',
                                  OneHotEncoder(handle_unknown='ignore',
                                                min_frequency=100,
                                      

In [74]:
RF_grid_search.fit(x_train_2 ,y_train_2.points)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=  11.8s
[CV 2/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=   8.5s
[CV 3/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=   9.3s
[CV 4/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=   8.6s
[CV 5/5] END regressor__max_depth=2, regressor__n_estimators=100;, score=-0.015 total time=   9.2s
[CV 1/5] END regressor__max_depth=2, regressor__n_estimators=500;, score=-0.015 total time=  40.3s
[CV 2/5] END regressor__max_depth=2, regressor__n_estimators=500;, score=-0.015 total time=  39.2s
[CV 3/5] END regressor__max_depth=2, regressor__n_estimators=500;, score=-0.015 total time=  39.1s
[CV 4/5] END regressor__max_depth=2, regressor__n_estimators=500;, score=-0.015 total time=  41.5s
[CV 5/5] END regressor__max_depth=2, regressor__

In [112]:
RF_results_df = pd.DataFrame(RF_grid_search.cv_results_)
RF_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,"{'regressor__max_depth': 2, 'regressor__n_estimators': 100}",-0.01524,0.00011,18
1,"{'regressor__max_depth': 2, 'regressor__n_estimators': 500}",-0.01524,0.000111,16
2,"{'regressor__max_depth': 2, 'regressor__n_estimators': 1000}",-0.01524,0.000109,17
3,"{'regressor__max_depth': 4, 'regressor__n_estimators': 100}",-0.01433,8e-05,15
4,"{'regressor__max_depth': 4, 'regressor__n_estimators': 500}",-0.01433,7.9e-05,14
5,"{'regressor__max_depth': 4, 'regressor__n_estimators': 1000}",-0.014329,8.1e-05,13
6,"{'regressor__max_depth': 8, 'regressor__n_estimators': 100}",-0.013467,8.5e-05,12
7,"{'regressor__max_depth': 8, 'regressor__n_estimators': 500}",-0.013464,8.4e-05,10
8,"{'regressor__max_depth': 8, 'regressor__n_estimators': 1000}",-0.013465,8.5e-05,11
9,"{'regressor__max_depth': 16, 'regressor__n_estimators': 100}",-0.012401,8.8e-05,6


In [88]:
RF_params = RF_results_df.loc[RF_results_df.rank_test_score == 1, 'params'].values[0]
RF_train_MSE = -RF_results_df.loc[RF_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
RF_test_MSE = mean_squared_error(y_test_2.values, RF_grid_search.predict(x_test_2))

results_df.loc[len(results_df.index)] = (['Random Forest', numerical_cols + categorical_cols, RF_params, RF_train_MSE, RF_test_MSE])

In [89]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, varie...",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, varie...",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, varie...",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"[price_log, country, province, region_1, varie...","{'regressor__max_depth': 32, 'regressor__n_est...",0.012073,0.012055


## NLP - Bag of Words with Linear Regression

In [90]:
wr_work_3 = wine_reviews[['description','points']]
wr_work_3.head()

Unnamed: 0,description,points
0,"Aromas include tropical fruit, broom, brimston...",87.0
1,"This is ripe and fruity, a wine that is smooth...",87.0
2,"Tart and snappy, the flavors of lime flesh and...",87.0
3,"Pineapple rind, lemon pith and orange blossom ...",87.0
4,"Much like the regular bottling from 2012, this...",87.0


In [91]:
x_train_3, x_test_3, y_train_3, y_test_3 = train_test_split(wr_work_3.description, wr_work_2.points, \
                                                    test_size = 0.25, shuffle = True, random_state = 78)
y_train_3 = y_tranformer.fit_transform(pd.DataFrame(y_train_3))
y_test_3 = y_tranformer.transform(pd.DataFrame(y_test_3))

In [92]:
BOW_pipe = Pipeline([
    ('vectorizer', CountVectorizer(stop_words = 'english')),
    ('regressor', ['passthrough'])
])
BOW_pipe

In [93]:
BOW_param_grid = [
    {
        'vectorizer__max_features': [300, 600, 1000, 2000, 5000, None],
        'regressor': [Lasso()], 
        'regressor__alpha': [0.00001, 0.0001]
    },
    {
        'vectorizer__max_features': [300, 600, 1000, 2000, 5000, None],
        'regressor': [Ridge()], 
        'regressor__alpha': [0.1, 1, 10, 100]
    }]


BOW_grid_search = GridSearchCV(BOW_pipe, param_grid = BOW_param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
BOW_grid_search

In [94]:
BOW_grid_search.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('vectorizer', CountVectorizer(stop_words='english')),
  ('regressor', ['passthrough'])],
 'estimator__verbose': False,
 'estimator__vectorizer': CountVectorizer(stop_words='english'),
 'estimator__regressor': ['passthrough'],
 'estimator__vectorizer__analyzer': 'word',
 'estimator__vectorizer__binary': False,
 'estimator__vectorizer__decode_error': 'strict',
 'estimator__vectorizer__dtype': numpy.int64,
 'estimator__vectorizer__encoding': 'utf-8',
 'estimator__vectorizer__input': 'content',
 'estimator__vectorizer__lowercase': True,
 'estimator__vectorizer__max_df': 1.0,
 'estimator__vectorizer__max_features': None,
 'estimator__vectorizer__min_df': 1,
 'estimator__vectorizer__ngram_range': (1, 1),
 'estimator__vectorizer__preprocessor': None,
 'estimator__vectorizer__stop_words': 'english',
 'estimator__vectorizer__strip_accents': None,
 'estimator__vectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'esti

In [95]:
BOW_grid_search.fit(x_train_3 ,y_train_3.points)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.012 total time=   4.9s
[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.012 total time=   4.8s
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.012 total time=   4.8s
[CV 4/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.012 total time=   4.7s
[CV 5/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.012 total time=   4.8s
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=600;, score=-0.010 total time=  10.4s
[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=600;, score=-0.010 total time=  10.9s
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=600;, score=-0.010 total

In [96]:
pd.set_option('display.max_colwidth', None)

In [97]:
BOW_results_df = pd.DataFrame(BOW_grid_search.cv_results_)
BOW_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
29,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'vectorizer__max_features': None}",-0.007156,6.4e-05,1
28,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'vectorizer__max_features': 5000}",-0.007374,5.9e-05,2
5,"{'regressor': Lasso(), 'regressor__alpha': 1e-05, 'vectorizer__max_features': None}",-0.007445,6e-05,3
22,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 1, 'vectorizer__max_features': 5000}",-0.007469,6.3e-05,4
4,"{'regressor': Lasso(), 'regressor__alpha': 1e-05, 'vectorizer__max_features': 5000}",-0.007481,5.7e-05,5
16,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",-0.00749,6.4e-05,6
35,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 100, 'vectorizer__max_features': None}",-0.007526,6.6e-05,7
34,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 100, 'vectorizer__max_features': 5000}",-0.007611,6.5e-05,8
23,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 1, 'vectorizer__max_features': None}",-0.007681,6.8e-05,9
27,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'vectorizer__max_features': 2000}",-0.008028,7.7e-05,10


In [98]:
BOW_params = BOW_results_df.loc[BOW_results_df.rank_test_score == 1, 'params'].values[0]
BOW_train_MSE = -BOW_results_df.loc[BOW_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
BOW_test_MSE = mean_squared_error(y_test_3.values, BOW_grid_search.predict(x_test_3))

results_df.loc[len(results_df.index)] = (['NLP Bag of Words + LR', 'description', BOW_params, BOW_train_MSE, BOW_test_MSE])

In [99]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, variety, winery, year]",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"[price_log, country, province, region_1, variety, winery, year]","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156


## NLP - TF IDF with Linear Regression

In [100]:
TFIDF_pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words = 'english')),
    ('regressor', ['passthrough'])
])
TFIDF_pipe

In [101]:
TFIDF_param_grid = BOW_param_grid

TFIDF_grid_search = GridSearchCV(TFIDF_pipe, param_grid = TFIDF_param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
TFIDF_grid_search

In [102]:
TFIDF_grid_search.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('vectorizer', TfidfVectorizer(stop_words='english')),
  ('regressor', ['passthrough'])],
 'estimator__verbose': False,
 'estimator__vectorizer': TfidfVectorizer(stop_words='english'),
 'estimator__regressor': ['passthrough'],
 'estimator__vectorizer__analyzer': 'word',
 'estimator__vectorizer__binary': False,
 'estimator__vectorizer__decode_error': 'strict',
 'estimator__vectorizer__dtype': numpy.float64,
 'estimator__vectorizer__encoding': 'utf-8',
 'estimator__vectorizer__input': 'content',
 'estimator__vectorizer__lowercase': True,
 'estimator__vectorizer__max_df': 1.0,
 'estimator__vectorizer__max_features': None,
 'estimator__vectorizer__min_df': 1,
 'estimator__vectorizer__ngram_range': (1, 1),
 'estimator__vectorizer__norm': 'l2',
 'estimator__vectorizer__preprocessor': None,
 'estimator__vectorizer__smooth_idf': True,
 'estimator__vectorizer__stop_words': 'english',
 'estimator__vectorizer__strip_

In [103]:
TFIDF_grid_search.fit(x_train_3 ,y_train_3.points)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.012 total time=   4.6s
[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.011 total time=   4.8s
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.011 total time=   4.9s
[CV 4/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.012 total time=   4.6s
[CV 5/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=300;, score=-0.012 total time=   4.8s
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=600;, score=-0.010 total time=  10.2s
[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=600;, score=-0.010 total time=  11.1s
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, vectorizer__max_features=600;, score=-0.010 total

In [104]:
TFIDF_results_df = pd.DataFrame(TFIDF_grid_search.cv_results_)
TFIDF_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
16,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",-0.007156,7.6e-05,1
22,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 1, 'vectorizer__max_features': 5000}",-0.007173,6.9e-05,2
23,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 1, 'vectorizer__max_features': None}",-0.007289,6.6e-05,3
17,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': None}",-0.007506,7.7e-05,4
15,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 2000}",-0.007723,7.1e-05,5
21,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 1, 'vectorizer__max_features': 2000}",-0.007731,7.1e-05,6
28,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': 5000}",-0.008122,5.8e-05,7
27,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': 2000}",-0.008256,6.6e-05,8
29,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",-0.00841,5.5e-05,9
3,"{'regressor': Lasso(), 'regressor__alpha': 1e-05, 'vectorizer__max_features': 2000}",-0.008415,6.7e-05,10


In [105]:
TFIDF_params = TFIDF_results_df.loc[TFIDF_results_df.rank_test_score == 1, 'params'].values[0]
TFIDF_train_MSE = -TFIDF_results_df.loc[TFIDF_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
TFIDF_test_MSE = mean_squared_error(y_test_3.values, TFIDF_grid_search.predict(x_test_3))

results_df.loc[len(results_df.index)] = (['NLP TFIDF + LR', 'description', TFIDF_params, TFIDF_train_MSE, TFIDF_test_MSE])

In [106]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, variety, winery, year]",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"[price_log, country, province, region_1, variety, winery, year]","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156
7,NLP TFIDF + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",0.007156,0.007166


Not as expected, Bag of Words and TF-IDF has similar train MSE, and Bag of Word has better test MSE score!

## NLP with Random Forest

In [140]:
NLP_RF_pipe = Pipeline([
    ('vectorizer', ['passthrough']),
    ('regressor', RandomForestRegressor(n_jobs = -1, max_samples = 0.1))
])
NLP_RF_pipe

In [144]:
NLP_RF_param_grid = [{
        'vectorizer': [CountVectorizer(stop_words = 'english'), TfidfVectorizer(stop_words = 'english')],
        'vectorizer__max_features': [1000, 5000],
        'regressor__n_estimators': [100, 500],
        'regressor__max_depth': [16, 32],  
    }]

NLP_RF_grid_search = GridSearchCV(NLP_RF_pipe, param_grid = NLP_RF_param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
NLP_RF_grid_search

In [142]:
NLP_RF_grid_search.get_params()

{'cv': 2,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('vectorizer', ['passthrough']),
  ('regressor', RandomForestRegressor(max_samples=0.1, n_jobs=-1))],
 'estimator__verbose': False,
 'estimator__vectorizer': ['passthrough'],
 'estimator__regressor': RandomForestRegressor(max_samples=0.1, n_jobs=-1),
 'estimator__regressor__bootstrap': True,
 'estimator__regressor__ccp_alpha': 0.0,
 'estimator__regressor__criterion': 'squared_error',
 'estimator__regressor__max_depth': None,
 'estimator__regressor__max_features': 1.0,
 'estimator__regressor__max_leaf_nodes': None,
 'estimator__regressor__max_samples': 0.1,
 'estimator__regressor__min_impurity_decrease': 0.0,
 'estimator__regressor__min_samples_leaf': 1,
 'estimator__regressor__min_samples_split': 2,
 'estimator__regressor__min_weight_fraction_leaf': 0.0,
 'estimator__regressor__n_estimators': 100,
 'estimator__regressor__n_jobs': -1,
 'estimator__regressor__oob_score': False,
 'estimator__regressor__random

In [145]:
NLP_RF_grid_search.fit(x_train_3 ,y_train_3.points)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END regressor__max_depth=16, regressor__n_estimators=100, vectorizer=CountVectorizer(stop_words='english'), vectorizer__max_features=1000;, score=-0.015 total time=  43.4s
[CV 2/5] END regressor__max_depth=16, regressor__n_estimators=100, vectorizer=CountVectorizer(stop_words='english'), vectorizer__max_features=1000;, score=-0.015 total time=  42.0s
[CV 3/5] END regressor__max_depth=16, regressor__n_estimators=100, vectorizer=CountVectorizer(stop_words='english'), vectorizer__max_features=1000;, score=-0.016 total time=  39.6s
[CV 4/5] END regressor__max_depth=16, regressor__n_estimators=100, vectorizer=CountVectorizer(stop_words='english'), vectorizer__max_features=1000;, score=-0.016 total time=  40.3s
[CV 5/5] END regressor__max_depth=16, regressor__n_estimators=100, vectorizer=CountVectorizer(stop_words='english'), vectorizer__max_features=1000;, score=-0.015 total time=  40.4s
[CV 1/5] END regressor__max_depth=

In [146]:
NLP_RF_results_df = pd.DataFrame(NLP_RF_grid_search.cv_results_)
NLP_RF_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
15,"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",-0.012006,5.7e-05,1
11,"{'regressor__max_depth': 32, 'regressor__n_estimators': 100, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",-0.012053,6.1e-05,2
14,"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 1000}",-0.01227,9.3e-05,3
10,"{'regressor__max_depth': 32, 'regressor__n_estimators': 100, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 1000}",-0.012355,8.4e-05,4
12,"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': CountVectorizer(stop_words='english'), 'vectorizer__max_features': 1000}",-0.013009,9.2e-05,5
8,"{'regressor__max_depth': 32, 'regressor__n_estimators': 100, 'vectorizer': CountVectorizer(stop_words='english'), 'vectorizer__max_features': 1000}",-0.013073,9.5e-05,6
13,"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': CountVectorizer(stop_words='english'), 'vectorizer__max_features': 5000}",-0.013103,0.000104,7
9,"{'regressor__max_depth': 32, 'regressor__n_estimators': 100, 'vectorizer': CountVectorizer(stop_words='english'), 'vectorizer__max_features': 5000}",-0.013164,0.00011,8
7,"{'regressor__max_depth': 16, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",-0.013891,8.2e-05,9
3,"{'regressor__max_depth': 16, 'regressor__n_estimators': 100, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",-0.013929,5.8e-05,10


In [147]:
NLP_RF_params = NLP_RF_results_df.loc[NLP_RF_results_df.rank_test_score == 1, 'params'].values[0]
NLP_RF_train_MSE = -NLP_RF_results_df.loc[NLP_RF_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
NLP_RF_test_MSE = mean_squared_error(y_test_3.values, NLP_RF_grid_search.predict(x_test_3))

results_df.loc[len(results_df.index)] = (['Random Forest', ['description'], NLP_RF_params, NLP_RF_train_MSE, NLP_RF_test_MSE])

In [148]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, variety, winery, year]",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"[price_log, country, province, region_1, variety, winery, year]","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156
7,NLP TFIDF + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",0.007156,0.007166
8,Random Forest,[description],"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",0.012006,0.012018


## All variables Bag of Words with Linear Regression

In [166]:
wr_work_4 = wine_reviews[['country','price_log','province','region_1','variety','winery','year','description','points']]
wr_work_4.head()

Unnamed: 0,country,price_log,province,region_1,variety,winery,year,description,points
0,Italy,2.944439,Sicily & Sardinia,Etna,White Blend,Nicosia,2013,"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.",87.0
1,Portugal,2.70805,Douro,Unknown,Portuguese Red,Quinta dos Avidagos,2011,"This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016.",87.0
2,US,2.639057,Oregon,Willamette Valley,Pinot Gris,Rainstorm,2013,"Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.",87.0
3,US,2.564949,Michigan,Lake Michigan Shore,Riesling,St. Julian,2013,"Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.",87.0
4,US,4.174387,Oregon,Willamette Valley,Pinot Noir,Sweet Cheeks,2012,"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew.",87.0


In [205]:
x_train_4, x_test_4, y_train_4, y_test_4 = train_test_split(wr_work_4.loc[:, wr_work_4.columns != 'points'], wr_work_4.points, \
                                                    test_size = 0.25, shuffle = True, random_state = 78)
y_train_4 = y_tranformer.fit_transform(pd.DataFrame(y_train_4))
y_test_4 = y_tranformer.transform(pd.DataFrame(y_test_4))

In [247]:
x_vectorizer_BOW = Pipeline([
    ('squeezer', FunctionTransformer(lambda i: i.squeeze())),
    ('vectorizer', CountVectorizer(stop_words = 'english')),
    ('convertor', FunctionTransformer(lambda i: i.toarray())),
])

x_tranformer_BOW = ColumnTransformer(
    transformers=[
        ('numerical', MinMaxScaler(), numerical_cols),
        ('categorical', OneHotEncoder(handle_unknown='ignore', sparse_output=False, min_frequency = 100), categorical_cols),
        ('text', x_vectorizer_BOW, ['description']),
], remainder='drop')

x_tranformer_BOW

In [256]:
ALL_BOW_LR_pipe = Pipeline([
    ('transformer', x_tranformer_BOW),
    ('regressor',['passthrough'])
])
ALL_BOW_LR_pipe

In [260]:
ALL_BOW_LR_param_grid = [
    {
        'transformer__text__vectorizer__max_features': [300, 600, 1000, 2000, 5000],
        'regressor': [Lasso()], 
        'regressor__alpha': [0.00001, 0.0001]
    },
    {
        'transformer__text__vectorizer__max_features': [300, 600, 1000, 2000, 5000],
        'regressor': [Ridge()], 
        'regressor__alpha': [0.1, 1, 10, 100]
    }]

ALL_BOW_LR_grid_search = GridSearchCV(ALL_BOW_LR_pipe, param_grid = ALL_BOW_LR_param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
ALL_BOW_LR_grid_search

In [261]:
ALL_BOW_LR_grid_search.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('transformer',
   ColumnTransformer(transformers=[('numerical', MinMaxScaler(), ['price_log']),
                                   ('categorical',
                                    OneHotEncoder(handle_unknown='ignore',
                                                  min_frequency=100,
                                                  sparse_output=False),
                                    ['country', 'province', 'region_1', 'variety',
                                     'winery', 'year']),
                                   ('text',
                                    Pipeline(steps=[('squeezer',
                                                     FunctionTransformer(func=<function <lambda> at 0x0000021E8AB34DC0>)),
                                                    ('vectorizer',
                                                     CountVectorizer(stop_words='english')),
                       

In [262]:
ALL_BOW_LR_grid_search.fit(x_train_4 ,y_train_4.points)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time=  53.5s



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.455e-01, tolerance: 1.796e-01



[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time= 1.9min
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time= 1.1min
[CV 4/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time= 1.6min
[CV 5/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time=  42.4s



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.028e-01, tolerance: 1.797e-01



[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=600;, score=-0.007 total time= 2.7min



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.615e-01, tolerance: 1.796e-01



[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=600;, score=-0.007 total time= 2.6min
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=600;, score=-0.007 total time= 1.2min
[CV 4/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=600;, score=-0.007 total time= 2.3min
[CV 5/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=600;, score=-0.007 total time= 1.4min
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=1000;, score=-0.007 total time= 1.9min



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.724e-01, tolerance: 1.796e-01



[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=1000;, score=-0.007 total time= 3.8min
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=1000;, score=-0.006 total time= 2.4min
[CV 4/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=1000;, score=-0.007 total time= 2.2min
[CV 5/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=1000;, score=-0.007 total time= 2.8min
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=2000;, score=-0.006 total time= 2.7min
[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=2000;, score=-0.006 total time=47.7min
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=2000;, score=-0.006 total time= 4.7min
[CV 4/5] END regressor=Lass

In [263]:
ALL_BOW_LR_results_df = pd.DataFrame(ALL_BOW_LR_grid_search.cv_results_)
ALL_BOW_LR_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
24,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 5000}",-0.005578,5.6e-05,1
19,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 1, 'transformer__text__vectorizer__max_features': 5000}",-0.005654,6e-05,2
4,"{'regressor': Lasso(), 'regressor__alpha': 1e-05, 'transformer__text__vectorizer__max_features': 5000}",-0.005661,5.2e-05,3
14,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 0.1, 'transformer__text__vectorizer__max_features': 5000}",-0.005671,6.1e-05,4
29,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 100, 'transformer__text__vectorizer__max_features': 5000}",-0.005765,5.1e-05,5
23,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 2000}",-0.005976,6.1e-05,6
3,"{'regressor': Lasso(), 'regressor__alpha': 1e-05, 'transformer__text__vectorizer__max_features': 2000}",-0.005976,5.7e-05,7
18,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 1, 'transformer__text__vectorizer__max_features': 2000}",-0.005986,6.1e-05,8
13,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 0.1, 'transformer__text__vectorizer__max_features': 2000}",-0.005989,6.2e-05,9
28,"{'regressor': Ridge(alpha=10), 'regressor__alpha': 100, 'transformer__text__vectorizer__max_features': 2000}",-0.006085,5.9e-05,10


In [264]:
ALL_BOW_LR_params = ALL_BOW_LR_results_df.loc[ALL_BOW_LR_results_df.rank_test_score == 1, 'params'].values[0]
ALL_BOW_LR_train_MSE = -ALL_BOW_LR_results_df.loc[ALL_BOW_LR_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
ALL_BOW_LR_test_MSE = mean_squared_error(y_test_4.values, ALL_BOW_LR_grid_search.predict(x_test_4))

results_df.loc[len(results_df.index)] = (['All variables (NLP Bag of Words) + LR', numerical_cols + categorical_cols + ['description'], \
                                          ALL_BOW_LR_params, ALL_BOW_LR_train_MSE, ALL_BOW_LR_test_MSE])

In [265]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, variety, winery, year]",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"[price_log, country, province, region_1, variety, winery, year]","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156
7,NLP TFIDF + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",0.007156,0.007166
8,Random Forest,[description],"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",0.012006,0.012018
9,All variables (NLP Bag of Words) + LR,"[price_log, country, province, region_1, variety, winery, year, description]","{'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 5000}",0.005578,0.005553


## All variables TF-IDF with Linear Regression

In [266]:
x_vectorizer_TFIDF = Pipeline([
    ('squeezer', FunctionTransformer(lambda i: i.squeeze())),
    ('vectorizer', TfidfVectorizer(stop_words = 'english')),
    ('convertor', FunctionTransformer(lambda i: i.toarray())),
])

x_tranformer_TFIDF = ColumnTransformer(
    transformers=[
        ('numerical', MinMaxScaler(), numerical_cols),
        ('categorical', OneHotEncoder(handle_unknown='ignore', sparse_output=False, min_frequency = 100), categorical_cols),
        ('text', x_vectorizer_TFIDF, ['description']),
], remainder='drop')

x_tranformer_TFIDF

In [267]:
ALL_TFIDF_LR_pipe = Pipeline([
    ('transformer', x_tranformer_TFIDF),
    ('regressor',['passthrough'])
])
ALL_TFIDF_LR_pipe

In [268]:
ALL_TFIDF_LR_param_grid = ALL_BOW_LR_param_grid 

ALL_TFIDF_LR_grid_search = GridSearchCV(ALL_TFIDF_LR_pipe, param_grid = ALL_TFIDF_LR_param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 3)
ALL_TFIDF_LR_grid_search

In [270]:
ALL_TFIDF_LR_grid_search.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('transformer',
   ColumnTransformer(transformers=[('numerical', MinMaxScaler(), ['price_log']),
                                   ('categorical',
                                    OneHotEncoder(handle_unknown='ignore',
                                                  min_frequency=100,
                                                  sparse_output=False),
                                    ['country', 'province', 'region_1', 'variety',
                                     'winery', 'year']),
                                   ('text',
                                    Pipeline(steps=[('squeezer',
                                                     FunctionTransformer(func=<function <lambda> at 0x0000021E8AB345E0>)),
                                                    ('vectorizer',
                                                     TfidfVectorizer(stop_words='english')),
                       

In [271]:
ALL_TFIDF_LR_grid_search.fit(x_train_4 ,y_train_4.points)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time=  55.1s
[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time= 1.2min
[CV 3/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time= 1.0min
[CV 4/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time= 1.1min
[CV 5/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=300;, score=-0.008 total time=  56.9s
[CV 1/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=600;, score=-0.007 total time= 2.2min
[CV 2/5] END regressor=Lasso(), regressor__alpha=1e-05, transformer__text__vectorizer__max_features=600;, scor

In [272]:
ALL_TFIDF_LR_results_df = pd.DataFrame(ALL_TFIDF_LR_grid_search.cv_results_)
ALL_TFIDF_LR_results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
14,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'transformer__text__vectorizer__max_features': 5000}",-0.005469,5.8e-05,1
19,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 1, 'transformer__text__vectorizer__max_features': 5000}",-0.005484,5.6e-05,2
13,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'transformer__text__vectorizer__max_features': 2000}",-0.005781,5.5e-05,3
18,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 1, 'transformer__text__vectorizer__max_features': 2000}",-0.005788,5.6e-05,4
24,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 5000}",-0.006114,5.6e-05,5
23,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 2000}",-0.006156,6e-05,6
12,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'transformer__text__vectorizer__max_features': 1000}",-0.006362,5.8e-05,7
17,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 1, 'transformer__text__vectorizer__max_features': 1000}",-0.006363,5.8e-05,8
3,"{'regressor': Lasso(), 'regressor__alpha': 1e-05, 'transformer__text__vectorizer__max_features': 2000}",-0.006439,5.5e-05,9
22,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 1000}",-0.006534,6.2e-05,10


In [273]:
ALL_TFIDF_LR_params = ALL_TFIDF_LR_results_df.loc[ALL_TFIDF_LR_results_df.rank_test_score == 1, 'params'].values[0]
ALL_TFIDF_LR_train_MSE = -ALL_TFIDF_LR_results_df.loc[ALL_TFIDF_LR_results_df.rank_test_score == 1, 'mean_test_score'].values[0]
ALL_TFIDF_LR_test_MSE = mean_squared_error(y_test_4.values, ALL_TFIDF_LR_grid_search.predict(x_test_4))

results_df.loc[len(results_df.index)] = (['All variables (NLP TFIDF) + LR', numerical_cols + categorical_cols + ['description'], \
                                          ALL_TFIDF_LR_params, ALL_TFIDF_LR_train_MSE, ALL_TFIDF_LR_test_MSE])

In [274]:
results_df

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,[mean_points],{},0.023013,0.023359
1,simple mean,[country mean_points],{},0.021859,0.022136
2,KNN,"[price_log, country, province, region_1, variety, winery, year]",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"[price_log, country, province, region_1, variety, winery, year]",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"[price_log, country, province, region_1, variety, winery, year]","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156
7,NLP TFIDF + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",0.007156,0.007166
8,Random Forest,[description],"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",0.012006,0.012018
9,All variables (NLP Bag of Words) + LR,"[price_log, country, province, region_1, variety, winery, year, description]","{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 5000}",0.005578,0.005553


In [275]:
results_df.to_csv("results_df.csv")