## Preparing

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Flatten, TextVectorization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model, pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import activations

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder


In [2]:
pd.set_option('display.max_colwidth', None)

In [3]:
wine_reviews = pd.read_csv("wine_reviews.csv", index_col = 0)

In [4]:
wine_reviews.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,title,variety,winery,year,price_log,desc_len,couotry_codes
0,Italy,"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.",Vulkà Bianco,87.0,19.0,Sicily & Sardinia,Etna,Unknown,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013,2.944439,24,22
1,Portugal,"This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016.",Avidagos,87.0,15.0,Douro,Unknown,Unknown,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,2.70805,38,31
2,US,"Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.",Unknown,87.0,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,2.639057,28,40
3,US,"Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.",Reserve Late Harvest,87.0,13.0,Michigan,Lake Michigan Shore,Unknown,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling (Lake Michigan Shore),Riesling,St. Julian,2013,2.564949,33,40
4,US,"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew.",Vintner's Reserve Wild Child Block,87.0,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pinot Noir (Willamette Valley),Pinot Noir,Sweet Cheeks,2012,4.174387,41,40


In [6]:
results_df_full = pd.read_csv("results_df.csv", index_col = 0)

In [7]:
wr_work = wine_reviews[['description','country','price_log','province','region_1','variety','winery','year','points']]
wr_work.head()

Unnamed: 0,description,country,price_log,province,region_1,variety,winery,year,points
0,"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.",Italy,2.944439,Sicily & Sardinia,Etna,White Blend,Nicosia,2013,87.0
1,"This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016.",Portugal,2.70805,Douro,Unknown,Portuguese Red,Quinta dos Avidagos,2011,87.0
2,"Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.",US,2.639057,Oregon,Willamette Valley,Pinot Gris,Rainstorm,2013,87.0
3,"Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.",US,2.564949,Michigan,Lake Michigan Shore,Riesling,St. Julian,2013,87.0
4,"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew.",US,4.174387,Oregon,Willamette Valley,Pinot Noir,Sweet Cheeks,2012,87.0


In [8]:
categorical_cols = ['country', 'province', 'region_1', 'variety', 'winery', 'year']
numerical_cols = ['price_log']

In [9]:
wr_work.country = pd.Categorical(wr_work.country).codes
wr_work.province = pd.Categorical(wr_work.province).codes
wr_work.region_1 = pd.Categorical(wr_work.region_1).codes
wr_work.variety = pd.Categorical(wr_work.variety).codes
wr_work.winery = pd.Categorical(wr_work.winery).codes
wr_work.year = pd.Categorical(wr_work.year).codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wr_work.country = pd.Categorical(wr_work.country).codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wr_work.province = pd.Categorical(wr_work.province).codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wr_work.region_1 = pd.Categorical(wr_work.region_1).codes
A value is trying to be set on a 

In [10]:
minmax = MinMaxScaler()
wr_work[['price_log', 'points']] = minmax.fit_transform(wr_work[['price_log', 'points']])
wr_work.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wr_work[['price_log', 'points']] = minmax.fit_transform(wr_work[['price_log', 'points']])


Unnamed: 0,description,country,price_log,province,region_1,variety,winery,year,points
0,"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.",22,0.232026,331,424,691,11608,53,0.35
1,"This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016.",31,0.196825,108,1094,450,12956,51,0.35
2,"Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.",40,0.186551,268,1218,436,13018,53,0.35
3,"Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.",40,0.175516,218,549,479,14390,53,0.35
4,"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew.",40,0.41518,268,1218,440,14621,52,0.35


## Embedding Text without pre-train embbedings

In [13]:
max_desc_len = max(wine_reviews.desc_len)
max_desc_len

135

In [14]:
tokenizer_1000 = Tokenizer(num_words=1000)
tokenizer_1000.fit_on_texts(wr_work.description)
desc_1000 = tokenizer_1000.texts_to_sequences(wr_work.description)

In [15]:
desc_1000_max = pad_sequences(desc_1000, maxlen=max_desc_len)
desc_1000_60 = pad_sequences(desc_1000, maxlen=60)
desc_1000_60

array([[  0,   0,   0, ..., 141, 357,  18],
       [  0,   0,   0, ..., 438,  20, 419],
       [  0,   0,   0, ..., 807, 727, 480],
       ...,
       [  0,   0,   0, ...,  23,  20, 312],
       [  0,   0,   0, ...,  23,  20, 588],
       [  0,   0,   0, ..., 266,  23,  45]])

In [17]:
tokenizer_5000 = Tokenizer(num_words=5000)
tokenizer_5000.fit_on_texts(wr_work.description)
desc_5000 = tokenizer_5000.texts_to_sequences(wr_work.description)

In [18]:
desc_5000_max = pad_sequences(desc_5000, maxlen=max_desc_len)
desc_5000_60 = pad_sequences(desc_5000, maxlen=60)
desc_5000_60

array([[  0,   0,   0, ..., 141, 357,  18],
       [  0,   0,   0, ..., 438,  20, 419],
       [  0,   0,   0, ..., 807, 727, 480],
       ...,
       [  0,   0,   0, ...,  23,  20, 312],
       [  0,   0,   0, ...,  23,  20, 588],
       [  0,   0,   0, ..., 266,  23,  45]])

In [19]:
x_train, x_test, y_train, y_test = train_test_split(wr_work[categorical_cols + numerical_cols], wr_work.points, \
                                                    test_size = 0.25, shuffle = True, random_state = 78)

In [27]:
desc_1000_max_train, desc_1000_max_test = train_test_split(desc_1000_max, test_size = 0.25, shuffle = True, random_state = 78)
desc_1000_60_train, desc_1000_60_test = train_test_split(desc_1000_60, test_size = 0.25, shuffle = True, random_state = 78)
desc_5000_max_train, desc_5000_max_test = train_test_split(desc_5000_max, test_size = 0.25, shuffle = True, random_state = 78)
desc_5000_60_train, desc_5000_60_test = train_test_split(desc_5000_60, test_size = 0.25, shuffle = True, random_state = 78)

In [42]:
desc_words = [1000, 5000]
desc_len = [max_desc_len, 60]
dense_activations = ['relu', 'sigmoid']
dense_units_1 =  [8, 16, 32, 64, 128]
dense_units_2 = [4, 8, 16, 32, 64]
model_1_results_df = pd.DataFrame(columns = ['parameters', 'train_MSE', 'test_MSE'])

for a in desc_words:
    for b in desc_len:
        for c in dense_activations:
            for d in dense_units_1:
                for e in dense_units_2:
                    params = {'desc_words' : a, 'desc_len': b, 'activation': c,  'units layer 1': d, 'units layer 2': e}
                    if e > d:
                        print(f'Passing Parameters: {params}')
                        continue  
                    
                    input_1 = Input(shape=(b,))
                    embedding_1 = Embedding(input_dim = a, output_dim=10)(input_1)
                    flatten_1 = Flatten()(embedding_1)
                    dense_1a = Dense(units = d, activation = c)(flatten_1)
                    drop_1 =  Dropout(0.5)(dense_1a)
                    dense_1b = Dense(units = e, activation= c)(drop_1)
                    output_1 = Dense(units = 1, activation= 'linear')(dense_1b)
                    model_1 = Model(inputs=[input_1], outputs=output_1)

                    model_1.compile(optimizer='adam', loss='mean_squared_error')
                    
                    if a == 1000 and b == max_desc_len:
                        x_train_1, x_test_1 = desc_1000_max_train, desc_1000_max_test
                    elif a == 1000 and b == 60:
                        x_train_1, x_test_1 = desc_1000_60_train, desc_1000_60_test
                    elif a == 5000 and b == max_desc_len:
                        x_train_1, x_test_1 = desc_5000_max_train, desc_5000_max_test
                    elif a == 5000 and b == 60:
                        x_train_1, x_test_1 = desc_5000_60_train, desc_5000_60_test
                        
                    print(f'Fitting Model 1, Parameters: {params}')
                    model_1.fit(x_train_1, y_train,
                                batch_size=32,
                                epochs=10,
                                callbacks=EarlyStopping(monitor='val_loss', patience=3),
                                workers = 8,
                                verbose = 0,
                        validation_data=(x_test_1, y_test))
                    print(f'Evaluating Model 1, Parameters: {params}')
                    train_MSE = model_1.evaluate(x_train_1, y_train, verbose = 2)
                    test_MSE = model_1.evaluate(x_test_1, y_test, verbose = 2)
            
                    model_1_results_df.loc[len(model_1_results_df.index)] = ([params, train_MSE, test_MSE])


Fitting Model 1, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'relu', 'units layer 1': 8, 'units layer 2': 4}
Evaluating Model 1, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'relu', 'units layer 1': 8, 'units layer 2': 4}
3047/3047 - 3s - loss: 0.0376 - 3s/epoch - 1ms/step
1016/1016 - 1s - loss: 0.0387 - 1s/epoch - 1ms/step
Fitting Model 1, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'relu', 'units layer 1': 8, 'units layer 2': 8}
Evaluating Model 1, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'relu', 'units layer 1': 8, 'units layer 2': 8}
3047/3047 - 4s - loss: 0.0515 - 4s/epoch - 1ms/step
1016/1016 - 1s - loss: 0.0527 - 1s/epoch - 1ms/step
Passing Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'relu', 'units layer 1': 8, 'units layer 2': 16}
Passing Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'relu', 'units layer 1': 8, 'units layer 2': 32}
Passing Parameters: {'desc_w

In [43]:
model_1_results_df.sort_values(by = 'test_MSE', ascending = True).head(30)

Unnamed: 0,parameters,train_MSE,test_MSE
109,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 4}",0.004404,0.006614
144,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 16}",0.004444,0.006671
105,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 8}",0.004178,0.006674
145,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 32}",0.004514,0.006691
138,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}",0.00448,0.006697
100,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}",0.004406,0.006705
97,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 16, 'units layer 2': 4}",0.004639,0.006736
104,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 4}",0.004432,0.006773
139,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 8}",0.004229,0.006798
142,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 4}",0.004031,0.006803


In [44]:
model_1_results_df.to_csv("model_1_results_df.csv")

For the next Tests: Keep the sigmoid activation, higher numbers of units layer 1, lower numbers or unit layer 2, and 5000 desc words.

In [45]:
model_1_params = model_1_results_df.loc[model_1_results_df.test_MSE == min(model_1_results_df.test_MSE), 'parameters'].values[0]
model_1_train_MSE = model_1_results_df.loc[model_1_results_df.test_MSE == min(model_1_results_df.test_MSE), 'train_MSE'].values[0]
model_1_test_MSE = model_1_results_df.loc[model_1_results_df.test_MSE == min(model_1_results_df.test_MSE), 'test_MSE'].values[0]

results_df_full.loc[len(results_df_full.index)] = ('Embedding without pre-train', ['description'], \
                                                    model_1_params, model_1_train_MSE, model_1_test_MSE)

In [46]:
results_df_full

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,['mean_points'],{},0.023013,0.023359
1,simple mean,['country mean_points'],{},0.021859,0.022136
2,KNN,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156
7,NLP TFIDF + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",0.007156,0.007166
8,Random Forest,['description'],"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",0.012006,0.012018
9,All variables (NLP Bag of Words) + LR,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year', 'description']","{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 5000}",0.005578,0.005553


## Embedding all variables, Text without pre-train embbedings

In [48]:
country_input = Input(shape=(1,))
country_embedding = Embedding(input_dim=max(wr_work.country)+1, output_dim=10)(country_input)
country_flatten = Flatten()(country_embedding)

province_input = Input(shape=(1,))
province_embedding = Embedding(input_dim=max(wr_work.province)+1, output_dim=3)(province_input)
province_flatten = Flatten()(province_embedding)

region_1_input = Input(shape=(1,))
region_1_embedding = Embedding(input_dim=max(wr_work.region_1)+1, output_dim=2)(region_1_input)
region_1_flatten = Flatten()(region_1_embedding)

variety_input = Input(shape=(1,))
variety_embedding = Embedding(input_dim=max(wr_work.variety)+2, output_dim=2)(variety_input)
variety_flatten = Flatten()(variety_embedding)

winery_input = Input(shape=(1,))
winery_embedding = Embedding(input_dim=max(wr_work.winery)+1, output_dim=1)(winery_input)
winery_flatten = Flatten()(winery_embedding)

year_input = Input(shape=(1,))
year_embedding = Embedding(input_dim=max(wr_work.year)+1, output_dim=10)(year_input)
year_flatten = Flatten()(year_embedding)

price_input = Input(shape=(1,))


In [56]:
desc_words = [5000]
desc_len = [max_desc_len, 60]
dense_activations = ['sigmoid']
dense_units_1 = [32, 64, 128, 256]
dense_units_2 = [2, 4, 8, 16, 32]
model_2_results_df = pd.DataFrame(columns = ['parameters', 'train_MSE', 'test_MSE'])

for a in desc_words:
    for b in desc_len:
        for c in dense_activations:
            for d in dense_units_1:
                for e in dense_units_2:
                    params = {'desc_words' : a, 'desc_len': b, 'activation': c,  'units layer 1': d, 'units layer 2': e}

                    text_input_2 = Input(shape=(b,))
                    text_embedding_2 = Embedding(input_dim = a, output_dim=10)(text_input_2)
                    text_flatten_2 = Flatten()(text_embedding_2)
                    
                    concatenated_2 = Concatenate()([text_flatten_2, country_flatten, province_flatten, region_1_flatten, \
                                                    variety_flatten, winery_flatten, year_flatten, price_input])

                    dense_2a = Dense(units = d, activation = c)(concatenated_2)
                    drop_2 =  Dropout(0.5)(dense_2a)
                    dense_2b = Dense(units = d, activation = c)(drop_2)
                    output_2 = Dense(units = 1, activation= 'linear')(dense_2b)
                    model_2 = Model(inputs=[text_input_2, country_input, province_input, region_1_input, \
                                            variety_input, winery_input, year_input, price_input], outputs=output_2)


                    model_2.compile(optimizer='adam', loss='mean_squared_error')

                    if a == 1000 and b == max_desc_len:
                        x_train_2, x_test_2 = desc_1000_max_train, desc_1000_max_test
                    elif a == 1000 and b == 60:
                        x_train_2, x_test_2 = desc_1000_60_train, desc_1000_60_test
                    elif a == 5000 and b == max_desc_len:
                        x_train_2, x_test_2 = desc_5000_max_train, desc_5000_max_test
                    elif a == 5000 and b == 60:
                        x_train_2, x_test_2 = desc_5000_60_train, desc_5000_60_test
                    
                    print(f'Fitting Model 2, Parameters: {params}')
                    model_2.fit([x_train_2, x_train.country, x_train.province, x_train.region_1,\
                                 x_train.variety, x_train.winery, x_train.year, x_train[numerical_cols]], y_train,
                                batch_size=32,
                                epochs=10,
                                callbacks=EarlyStopping(monitor='val_loss', patience=3),
                                workers = 8,
                                verbose = 0,
                                validation_data=([x_test_2, x_test.country, x_test.province, x_test.region_1,\
                                                  x_test.variety, x_test.winery, x_test.year, x_test[numerical_cols]], y_test))
                    print(f'Evaluating Model 2, Parameters: {params}')
                    train_MSE = model_2.evaluate([x_train_2, x_train.country, x_train.province, x_train.region_1,\
                                 x_train.variety, x_train.winery, x_train.year, x_train[numerical_cols]], y_train, verbose = 2)
                    test_MSE = model_2.evaluate([x_test_2, x_test.country, x_test.province, x_test.region_1,\
                                                  x_test.variety, x_test.winery, x_test.year, x_test[numerical_cols]], y_test, verbose = 2)

                    model_2_results_df.loc[len(model_2_results_df.index)] = ([params, train_MSE, test_MSE])



Fitting Model 2, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 2}
Evaluating Model 2, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 2}
3047/3047 - 5s - loss: 0.0029 - 5s/epoch - 2ms/step
1016/1016 - 2s - loss: 0.0054 - 2s/epoch - 2ms/step
Fitting Model 2, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}
Evaluating Model 2, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}
3047/3047 - 5s - loss: 0.0031 - 5s/epoch - 2ms/step
1016/1016 - 2s - loss: 0.0053 - 2s/epoch - 2ms/step
Fitting Model 2, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 8}
Evaluating Model 2, Parameters: {'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'unit

In [57]:
model_2_results_df.sort_values(by = 'test_MSE', ascending = True).head(30)

Unnamed: 0,parameters,train_MSE,test_MSE
4,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 32}",0.0027,0.005142
20,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 2}",0.002566,0.005204
28,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 16}",0.002482,0.005252
26,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 4}",0.002381,0.005253
23,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 16}",0.002779,0.005254
3,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 16}",0.002425,0.005274
1,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}",0.003137,0.005287
21,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}",0.00255,0.005296
14,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 32}",0.002182,0.005303
33,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 16}",0.002448,0.005319


In [58]:
model_2_results_df.to_csv("model_2_results_df.csv")

In [59]:
model_2_params = model_2_results_df.loc[model_2_results_df.test_MSE == min(model_2_results_df.test_MSE), 'parameters'].values[0]
model_2_train_MSE = model_2_results_df.loc[model_2_results_df.test_MSE == min(model_2_results_df.test_MSE), 'train_MSE'].values[0]
model_2_test_MSE = model_2_results_df.loc[model_2_results_df.test_MSE == min(model_2_results_df.test_MSE), 'test_MSE'].values[0]

results_df_full.loc[len(results_df_full.index)] = ('Embedding without pre-train', categorical_cols + numerical_cols + ['description'], \
                                                    model_2_params, model_2_train_MSE, model_2_test_MSE)

In [None]:
results_df_full

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,['mean_points'],{},0.023013,0.023359
1,simple mean,['country mean_points'],{},0.021859,0.022136
2,KNN,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156
7,NLP TFIDF + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",0.007156,0.007166
8,Random Forest,['description'],"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",0.012006,0.012018
9,All variables (NLP Bag of Words) + LR,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year', 'description']","{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 5000}",0.005578,0.005553


## Embedding all variables, Text with GloVe pre-train embbedings Trainbale / Not Trainbale

In [102]:
glove_path = 'C:\\Users\\yibar\\Python_ML_2023\\Exercises\\Final Project\\glove.6B.100d.txt' # Path to the GloVe embedding file
glove_embedding_dim = 100

description_index = {key: value for key, value in tokenizer_5000.word_index.items() if value <= 5000}

glove_embedding_matrix = np.zeros((len(description_index) + 1, glove_embedding_dim))

with open(glove_path, 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        print(values)
        word = values[0]
        if word in description_index:
            glove_embedding_matrix[description_index[word]] = np.array(values[1:], dtype=np.float32)
glove_embedding_matrix.shape

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



(5001, 100)

In [106]:
text_input_3 = Input(shape=(135,))
text_embedding_3 = Embedding(input_dim=glove_embedding_matrix.shape[0], output_dim=glove_embedding_dim, weights=[glove_embedding_matrix], trainable=False)(text_input_3)
text_flatten_3 = Flatten()(text_embedding_3)

In [107]:
            dense_3a = Dense(units = 64, activation = '')(text_flatten_3)
            drop_3 =  Dropout(0.5)(dense_3a)
            dense_3b = Dense(units = 8, activation = 'relu')(drop_3)
            output_3 = Dense(units = 1, activation= 'linear')(dense_3b)
            model_3 = Model(inputs=text_input_3, outputs=output_3)
            
            model_3.compile(optimizer='adam', loss='mean_squared_error')
            
            model_3.summary()



Model: "model_217"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_228 (InputLayer)      [(None, 135)]             0         
                                                                 
 embedding_226 (Embedding)   (None, 135, 100)          500100    
                                                                 
 flatten_224 (Flatten)       (None, 13500)             0         
                                                                 
 dense_651 (Dense)           (None, 64)                864064    
                                                                 
 dropout_217 (Dropout)       (None, 64)                0         
                                                                 
 dense_652 (Dense)           (None, 8)                 520       
                                                                 
 dense_653 (Dense)           (None, 1)                 9 

In [108]:
            model_3.fit(desc_5000_max_train, y_train,
                        batch_size=32,
                        epochs=10,
                        callbacks=EarlyStopping(monitor='val_loss', patience=3),
                        workers = 8,
                        verbose = 1,
                        validation_data=(desc_5000_max_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.callbacks.History at 0x1df61715400>

In [113]:
trainable = [False, True]
desc_words = 5000
desc_len = [max_desc_len, 60]
dense_activations = ['sigmoid']
dense_units_1 = [32, 64, 128]
dense_units_2 = [2, 4, 8, 16, 32]
model_3_results_df = pd.DataFrame(columns = ['parameters', 'train_MSE', 'test_MSE'])

for a in trainable:
    for b in desc_len:
        for c in dense_activations:
            for d in dense_units_1:
                for e in dense_units_2:
                    params = {'desc_words' : desc_words, 'trainable': a , 'desc_len': b, 'activation': c,  'units layer 1': d, 'units layer 2': e}

                    text_input_3 = Input(shape=(b,))
                    if trainable == True:
                        text_embedding_3 = Embedding(input_dim=glove_embedding_matrix.shape[0], output_dim=glove_embedding_dim, weights=[glove_embedding_matrix], trainable=True)(text_input_3)
                    else:
                        text_embedding_3 = Embedding(input_dim=glove_embedding_matrix.shape[0], output_dim=glove_embedding_dim, weights=[glove_embedding_matrix], trainable=False)(text_input_3)
                    text_flatten_3 = Flatten()(text_embedding_3)
                    
                    concatenated_3 = Concatenate()([text_flatten_3, country_flatten, province_flatten, region_1_flatten, \
                                                    variety_flatten, winery_flatten, year_flatten, price_input])

                    dense_3a = Dense(units = d, activation = c)(concatenated_3)
                    drop_3 =  Dropout(0.5)(dense_3a)
                    dense_3b = Dense(units = d, activation = c)(drop_3)
                    output_3 = Dense(units = 1, activation= 'linear')(dense_3b)
                    model_3 = Model(inputs=[text_input_3, country_input, province_input, region_1_input, \
                                            variety_input, winery_input, year_input, price_input], outputs=output_3)


                    model_3.compile(optimizer='adam', loss='mean_squared_error')

                    if b == max_desc_len:
                        x_train_3, x_test_3 = desc_5000_max_train, desc_5000_max_test
                    elif b == 60:
                        x_train_3, x_test_3 = desc_5000_60_train, desc_5000_60_test
                    
                    print(f'Fitting Model 3, Parameters: {params}')
                    model_3.fit([x_train_3, x_train.country, x_train.province, x_train.region_1,\
                                 x_train.variety, x_train.winery, x_train.year, x_train[numerical_cols]], y_train,
                                batch_size=32,
                                epochs=10,
                                callbacks=EarlyStopping(monitor='val_loss', patience=3),
                                workers = 8,
                                verbose = 0,
                                validation_data=([x_test_3, x_test.country, x_test.province, x_test.region_1,\
                                                  x_test.variety, x_test.winery, x_test.year, x_test[numerical_cols]], y_test))
                    print(f'Evaluating Model 3, Parameters: {params}')
                    train_MSE = model_3.evaluate([x_train_3, x_train.country, x_train.province, x_train.region_1,\
                                 x_train.variety, x_train.winery, x_train.year, x_train[numerical_cols]], y_train, verbose = 2)
                    test_MSE = model_3.evaluate([x_test_3, x_test.country, x_test.province, x_test.region_1,\
                                                  x_test.variety, x_test.winery, x_test.year, x_test[numerical_cols]], y_test, verbose = 2)

                    model_3_results_df.loc[len(model_3_results_df.index)] = ([params, train_MSE, test_MSE])



Fitting Model 3, Parameters: {'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 2}
Evaluating Model 3, Parameters: {'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 2}
3047/3047 - 10s - loss: 0.0046 - 10s/epoch - 3ms/step
1016/1016 - 4s - loss: 0.0080 - 4s/epoch - 4ms/step
Fitting Model 3, Parameters: {'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}
Evaluating Model 3, Parameters: {'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}
3047/3047 - 10s - loss: 0.0050 - 10s/epoch - 3ms/step
1016/1016 - 4s - loss: 0.0085 - 4s/epoch - 4ms/step
Fitting Model 3, Parameters: {'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 8}
Evaluating Model 

In [114]:
model_3_results_df.sort_values(by = 'test_MSE', ascending = True).head(30)

Unnamed: 0,parameters,train_MSE,test_MSE
58,"{'desc_words': 5000, 'trainable': True, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 16}",0.003519,0.007485
14,"{'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 32}",0.003343,0.007503
9,"{'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 32}",0.003765,0.00752
6,"{'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 4}",0.003783,0.007536
40,"{'desc_words': 5000, 'trainable': True, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 2}",0.003424,0.007539
28,"{'desc_words': 5000, 'trainable': False, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 16}",0.003508,0.007556
22,"{'desc_words': 5000, 'trainable': False, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 8}",0.003699,0.007611
8,"{'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 16}",0.003783,0.007627
53,"{'desc_words': 5000, 'trainable': True, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 16}",0.003585,0.007673
5,"{'desc_words': 5000, 'trainable': False, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 2}",0.003799,0.007685


In [115]:
model_3_results_df.to_csv("model_3_results_df.csv")

In [117]:
model_3_params = model_3_results_df.loc[model_3_results_df.test_MSE == min(model_3_results_df.test_MSE), 'parameters'].values[0]
model_3_train_MSE = model_3_results_df.loc[model_3_results_df.test_MSE == min(model_3_results_df.test_MSE), 'train_MSE'].values[0]
model_3_test_MSE = model_3_results_df.loc[model_3_results_df.test_MSE == min(model_3_results_df.test_MSE), 'test_MSE'].values[0]

results_df_full.loc[len(results_df_full.index)] = ('Embedding with GloVe pre-train embbedings', categorical_cols + numerical_cols + ['description'], \
                                                    model_3_params, model_3_train_MSE, model_3_test_MSE)

In [118]:
results_df_full

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,['mean_points'],{},0.023013,0.023359
1,simple mean,['country mean_points'],{},0.021859,0.022136
2,KNN,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156
7,NLP TFIDF + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",0.007156,0.007166
8,Random Forest,['description'],"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",0.012006,0.012018
9,All variables (NLP Bag of Words) + LR,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year', 'description']","{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 5000}",0.005578,0.005553


In [57]:
model_2_results_df.sort_values(by = 'test_MSE', ascending = True).head(30)

Unnamed: 0,parameters,train_MSE,test_MSE
4,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 32}",0.0027,0.005142
20,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 2}",0.002566,0.005204
28,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 16}",0.002482,0.005252
26,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 64, 'units layer 2': 4}",0.002381,0.005253
23,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 16}",0.002779,0.005254
3,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 16}",0.002425,0.005274
1,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}",0.003137,0.005287
21,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 32, 'units layer 2': 4}",0.00255,0.005296
14,"{'desc_words': 5000, 'desc_len': 135, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 32}",0.002182,0.005303
33,"{'desc_words': 5000, 'desc_len': 60, 'activation': 'sigmoid', 'units layer 1': 128, 'units layer 2': 16}",0.002448,0.005319


In [58]:
model_2_results_df.to_csv("model_2_results_df.csv")

In [59]:
model_2_params = model_2_results_df.loc[model_2_results_df.test_MSE == min(model_2_results_df.test_MSE), 'parameters'].values[0]
model_2_train_MSE = model_2_results_df.loc[model_2_results_df.test_MSE == min(model_2_results_df.test_MSE), 'train_MSE'].values[0]
model_2_test_MSE = model_2_results_df.loc[model_2_results_df.test_MSE == min(model_2_results_df.test_MSE), 'test_MSE'].values[0]

results_df_full.loc[len(results_df_full.index)] = ('Embedding without pre-train', categorical_cols + numerical_cols + ['description'], \
                                                    model_2_params, model_2_train_MSE, model_2_test_MSE)

In [148]:
results_df_full

Unnamed: 0,method,variables,parameters,train_MSE,test_MSE
0,simple mean,['mean_points'],{},0.023013,0.023359
1,simple mean,['country mean_points'],{},0.021859,0.022136
2,KNN,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__n_neighbors': 19},0.01315,0.013031
3,LR Lasso (L1),"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__alpha': 1e-05},0.012823,0.012928
4,LR Ridge (L2),"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']",{'regressor__alpha': 1},0.012851,0.012933
5,Random Forest,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year']","{'regressor__max_depth': 32, 'regressor__n_estimators': 1000}",0.012073,0.012055
6,NLP Bag of Words + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'vectorizer__max_features': None}",0.007156,0.007156
7,NLP TFIDF + LR,description,"{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'vectorizer__max_features': 5000}",0.007156,0.007166
8,Random Forest,['description'],"{'regressor__max_depth': 32, 'regressor__n_estimators': 500, 'vectorizer': TfidfVectorizer(max_features=5000, stop_words='english'), 'vectorizer__max_features': 5000}",0.012006,0.012018
9,All variables (NLP Bag of Words) + LR,"['price_log', 'country', 'province', 'region_1', 'variety', 'winery', 'year', 'description']","{'regressor': Ridge(alpha=0.1), 'regressor__alpha': 10, 'transformer__text__vectorizer__max_features': 5000}",0.005578,0.005553


In [149]:
results_df_full.to_csv("results_df_full.csv")