In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv('Resources/uswhites.csv')

In [5]:
df = df.drop(['ID', 'category', 'country', 'description', 'designation', 'subsubregion', 'title', 'url', 'winery'], axis=1)
df.head()


Unnamed: 0,alcohol,price,rating,region,subregion,varietal,vintage
0,14.1,50.0,94,California,Sonoma,Chardonnay,2016
1,12.6,30.0,94,Oregon,Willamette Valley,Riesling,2016
2,13.8,60.0,94,California,Napa-Sonoma,Chardonnay,2016
3,13.8,35.0,94,Washington,Columbia Valley,Bordeaux-style White Blend,2017
4,14.7,65.0,94,California,Sonoma,Chardonnay,2016


In [11]:
X_df = pd.DataFrame(df)
X_df = df.drop("price", axis=1)
X_df['subregion'] = X_df['subregion'].astype(str)
X_df['vintage'] = X_df['vintage'].astype(int)
y = df["price"]
X_df.dtypes

alcohol      float64
rating         int64
region        object
subregion     object
varietal      object
vintage        int32
dtype: object

In [12]:
from sklearn.preprocessing import LabelEncoder
X = X_df.apply(LabelEncoder().fit_transform)
X

Unnamed: 0,alcohol,rating,region,subregion,varietal,vintage
0,130,14,2,42,17,22
1,68,14,17,56,87,22
2,118,14,2,28,17,22
3,118,14,24,7,10,23
4,152,14,2,42,17,22
...,...,...,...,...,...,...
23653,147,11,2,3,17,11
23654,147,12,2,3,17,11
23655,159,12,2,3,17,11
23656,176,12,2,42,138,10


In [13]:
X = X.values.astype("float32")
print(X)
y = y.values.astype("float32")
print(y)


[[130.  14.   2.  42.  17.  22.]
 [ 68.  14.  17.  56.  87.  22.]
 [118.  14.   2.  28.  17.  22.]
 ...
 [159.  12.   2.   3.  17.  11.]
 [176.  12.   2.  42. 138.  10.]
 [139.  12.   2.   2.  17.  11.]]
[50. 30. 60. ... 29. 24. 55.]


In [14]:
input_dims = X.shape[1]

In [15]:
def baseline_model():
    model = Sequential()
    model.add(Dense(5, input_dim=input_dims, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [16]:
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=2)
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X, y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Train on 21292 samples
Epoch 1/100
21292/21292 - 3s - loss: 137.7555
Epoch 2/100
21292/21292 - 3s - loss: 99.3115
Epoch 3/100
21292/21292 - 3s - loss: 92.3624
Epoch 4/100
21292/21292 - 3s - loss: 91.2501
Epoch 5/100
21292/21292 - 3s - loss: 91.0060
Epoch 6/100
21292/21292 - 3s - loss: 90.8542
Epoch 7/100
21292/21292 - 3s - loss: 90.7134
Epoch 8/100
21292/21292 - 3s - loss: 90.7637
Epoch 9/100
21292/21292 - 3s - loss: 90.7203
Epoch 10/100
21292/21292 - 3s - loss: 90.6528
Epoch 11/100
21292/21292 - 3s - loss: 90.6461
Epoch 12/100
21292/21292 - 3s - loss: 90.5157
Epoch 13/100
21292/21292 - 3s - loss: 90.6351
Epoch 14/100
21292/21292 - 3s - loss: 90.4576
Epoch 15/100
21292/21292 - 3s - loss: 90.5049
Epoch 16/100
21292/21292 - 3s - loss: 90.5095
Epoch 17/100
21292/21292 - 3s - loss: 90.3657
Epoch 18/100
21292/21292 - 3s - loss: 90.3134
Epoch 19/100
21292/21292 - 3s - loss: 90.4348
Epoch 20/100
21292/21292 - 3s - loss: 90.4044
Epoch 21/100
21292/21292 - 3s - loss: 90.2419
Epoch 22/100
21292/

In [22]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=1)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5)
results = cross_val_score(pipeline, X, y, cv=kfold)
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18927 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18927 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Standardized: -93.88 (9.90) MSE


In [24]:
def larger_model():
    model = Sequential()
    model.add(Dense(10, input_dim=input_dims, kernel_initializer='normal', activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [25]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model, epochs=10, batch_size=5, verbose=1)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5)
results = cross_val_score(pipeline, X, y, cv=kfold)
print("Larger: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18927 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18927 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Larger: -89.62 (9.53) MSE


In [26]:
def larger_model_2():
    model = Sequential()
    model.add(Dense(10, input_dim=input_dims, kernel_initializer='random_uniform', activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [28]:
estimators = []
estimators.append(('standardize', MinMaxScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model_2, epochs=10, batch_size=5, verbose=1)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5)
results = cross_val_score(pipeline, X, y, cv=kfold)
print("Larger_2: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18926 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18927 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 18927 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Larger_2: -103.93 (11.85) MSE
