In [1]:
import pandas as pd
import numpy as np

## encode functions
##### main custom functions
def one_hot_encode(df,columns_for_one_hot):
    '''One hot encode the columns for feeding into the model
    :df: Dataframe of the data
    :columns_for_one_hot: array name of columns to input
    :return: Dataframe of the one hot encoded data with prefix = columns_for_one_hot
    '''
#     for col_index in columns_for_one_hot:
#         df[col_index]=df[col_index].replace(['0',0], 'No_{}'.format(col_index))
    ## one hot encode
    for column_for_one_hot in columns_for_one_hot:
        #  Get one hot encoding of columns B
        one_hot = pd.get_dummies(df[column_for_one_hot],prefix=column_for_one_hot)
        # Drop column B as it is now encoded
        df = df.drop(column_for_one_hot,axis = 1)
        # Join the encoded df
        df = df.join(one_hot,rsuffix=column_for_one_hot)
    return df
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

def evaluate(y_pred,Y_val,y_col):
    print('ycol is:',y_col)
    mse_sep=[round(mean_squared_error(Y_val[:,0], y_pred[:,0]),6),
         round(mean_squared_error(Y_val[:,1], y_pred[:,1]),6)]
    print('mean squared error is:', str(mse_sep))
    mae_sep=[round(mean_absolute_error(Y_val[:,0], y_pred[:,0]),6),
         round(mean_absolute_error(Y_val[:,1], y_pred[:,1]),6)]
    print('mean absolute error is:', str(mae_sep))
    mese_sep=[round(median_absolute_error(Y_val[:,0], y_pred[:,0]),6),
         round(median_absolute_error(Y_val[:,1], y_pred[:,1]),6)]
    print('median_absolute_error is:', str(mese_sep))

In [2]:
## read and encode data
dfg = pd.read_csv(
    './vintage_wine_description_processed.csv',index_col=0).fillna(0)
to_encode=['country','province','region_1','region_2','variety','area']
to_drop=['designation','description','title','Location_description','location','winery']
dfg=dfg.drop(to_drop,axis=1)
dfg=one_hot_encode(dfg,to_encode)
dfg.head()

Unnamed: 0,points,price,year,temp1,temp10,temp11,temp12,temp13,temp14,temp15,...,area_Eastern France,area_Jura,area_Languedoc,area_Languedoc-Roussillon,area_Loire,area_Lyonnais,area_Provence,area_Rhône,area_Savoy,area_South West France
0,87,0.0,2013,11.39,20.302,16.792,12.432,11.375,9.663,13.65,...,0,0,0,0,0,0,1,0,0,0
1,87,15.0,2011,4.69,12.437,7.362,4.967,5.142,6.502,8.337,...,0,0,0,0,1,0,0,0,0,0
2,87,14.0,2013,4.35,11.176,7.606,3.811,2.58,4.904,7.617,...,0,0,0,0,0,0,0,0,0,0
3,87,13.0,2013,-1.763,10.034,3.674,1.283,-2.77,-4.096,-0.363,...,0,0,0,0,0,0,0,0,0,0
4,87,65.0,2012,4.761,10.858,5.631,2.999,4.35,4.755,5.16,...,0,0,0,0,0,0,0,0,0,0


In [3]:
pd.read_csv(
    './vintage_wine_description_processed.csv',index_col=0).fillna(0).filter(regex='^(?!taste)').filter(regex='^(?!temp)').describe()


Unnamed: 0,points,price,year,latitude,longitude
count,125345.0,125345.0,125345.0,125345.0,125345.0
mean,88.485819,33.083857,2010.677578,31.65656,-45.865747
std,3.040517,40.774575,3.715205,23.260992,71.801828
min,80.0,0.0,1904.0,-44.854979,-125.002441
25%,86.0,15.0,2009.0,36.701463,-118.755997
50%,88.0,25.0,2011.0,38.645311,-74.006015
75%,91.0,40.0,2013.0,43.97928,6.158551
max,100.0,3300.0,2017.0,57.151067,178.020649


In [4]:
### split xy and filter out columns that are not needed
y_col=['points','price']
regex_out_taste='^(?!taste)'
X = dfg.filter(regex=regex_out_taste).drop(y_col, axis=1) # Training & Validation data
Y = dfg[y_col]              # Response / Target Variable
### normalize data
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
X=scaler.fit_transform(X)
Y=scaler.fit_transform(Y)
# print(Y)

print(X.shape, Y.shape)

# Split training set so that we validate on 20% of the data
# Note that our algorithms will never have seen the validation 

np.random.seed(5875) # set random seed for reproducibility

from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = \
                train_test_split(X, Y, test_size=0.2)

print('Training Samples:', X_train.shape, Y_train.shape)
print('Validation Samples:', X_val.shape, Y_val.shape)

# train and test with keras

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import optimizers
# Y_train = to_categorical(Y_train)[:,0].squeeze()
model = Sequential()
model.add( Dense(units=2000,kernel_initializer='random_normal' ,activation='relu', input_shape=(2440,) ))
model.add( Dropout(0.5))
model.add( Dense(units=1000, activation='relu'))
model.add( Dropout(0.5))
model.add( Dense(units=500, activation='relu'))
model.add( Dropout(0.5))
model.add( Dense(units=2, activation='tanh') )
# optimizer = optimizers.SGD(lr=0.0001, momentum=0.01, decay=0.01, nesterov=False)
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_squared_error'])
model.fit(X_train, Y_train, epochs = 5, batch_size= 60)
y_pred= model.predict(X_val)
mse_sep=[round(mean_squared_error(Y_val[:,0], y_pred[:,0]),6),
         round(mean_squared_error(Y_val[:,1], y_pred[:,1]),6)]

print('mean squared error is:', str(mse_sep))
evaluate(y_pred,Y_val,y_col)

(125345, 2440) (125345, 2)
Training Samples: (100276, 2440) (100276, 2)
Validation Samples: (25069, 2440) (25069, 2)


Using TensorFlow backend.
W0828 18:02:28.753587 4738500032 deprecation_wrapper.py:119] From /anaconda3/envs/learnai/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0828 18:02:28.781960 4738500032 deprecation_wrapper.py:119] From /anaconda3/envs/learnai/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0828 18:02:28.784711 4738500032 deprecation_wrapper.py:119] From /anaconda3/envs/learnai/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4115: The name tf.random_normal is deprecated. Please use tf.random.normal instead.

W0828 18:02:28.802182 4738500032 deprecation_wrapper.py:119] From /anaconda3/envs/learnai/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.p

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mean squared error is: [0.015985, 0.000145]
ycol is: ['points', 'price']
mean squared error is: [0.015985, 0.000145]
mean absolute error is: [0.100671, 0.006052]
median_absolute_error is: [0.08497, 0.003512]


In [5]:
lower_lim=300
higher_lim=310
print(model.predict(X[lower_lim:higher_lim,:],batch_size=1,verbose=1)*100)
print(Y[lower_lim:higher_lim,:]*100)

[[52.12365     0.54923475]
 [45.096138    0.52003944]
 [37.309452    0.6428896 ]
 [30.78351     0.73345983]
 [34.12419     0.68318707]
 [34.631615    0.69762534]
 [41.430897    0.5506857 ]
 [41.430897    0.5506857 ]
 [42.932915    0.6374854 ]
 [41.781708    0.487818  ]]
[[35.          1.6969697 ]
 [35.          0.        ]
 [35.          0.72727273]
 [35.          0.21212121]
 [35.          0.90909091]
 [35.          0.54545455]
 [35.          0.60606061]
 [35.          0.54545455]
 [35.          0.54545455]
 [35.          1.21212121]]


In [6]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
#lasso model
lasso = Lasso()   # instantiate
lasso.fit(X_train, Y_train)                         # fit
y_pred= lasso.predict(X_val)
evaluate(y_pred,Y_val,y_col)

ycol is: ['points', 'price']
mean squared error is: [0.023469, 0.000141]
mean absolute error is: [0.125139, 0.006185]
median_absolute_error is: [0.124262, 0.004565]


In [7]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
#elasticnet model
elas = ElasticNet()   # instantiate
elas.fit(X_train, Y_train)                         # fit
y_pred = elas.predict(X_val)
evaluate(y_pred,Y_val,y_col)

ycol is: ['points', 'price']
mean squared error is: [0.023469, 0.000141]
mean absolute error is: [0.125139, 0.006185]
median_absolute_error is: [0.124262, 0.004565]


In [8]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
#ridge model
ridge = Ridge()   # instantiate
ridge.fit(X_train, Y_train)                         # fit
y_pred = ridge.predict(X_val)
evaluate(y_pred,Y_val,y_col)

ycol is: ['points', 'price']
mean squared error is: [0.015954, 0.000108]
mean absolute error is: [0.100137, 0.004624]
median_absolute_error is: [0.083973, 0.002749]


In [None]:
import xgboost as xgb
gbm = xgb.XGBClassifier(max_depth=2,n_estimators=10,verbosity=3,objective='reg:squarederror',n_jobs=8)
print('model initiated')
gbm.fit(X_train, Y_train[:,0])  
y_pred = gbm.predict(X_val)

model initiated
[23:35:06] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:35:09] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:35:13] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:35:17] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:35:21] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:35:25] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:35:29] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:35:32] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:35:36] INFO: src/tree/updater_prune.cc:74: t

[23:43:55] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:43:59] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:44:02] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:44:05] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:44:09] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:44:12] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:44:16] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:44:19] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:44:22] INFO: src/tree/updater_prune.cc:74: tree pruning end,

[23:52:13] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:52:16] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:52:20] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:52:23] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:52:27] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:52:30] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:54:01] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:54:04] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[23:54:08] INFO: src/tree/updater_prune.cc:74: tree pruning end,

In [None]:
import xgboost as xgb
gbm2 = xgb.XGBClassifier()
gbm2.fit(X_train, Y_train[:,1])  
y_pred2 = gbm2.predict(X_val)

In [None]:
Y_train[:,0]

In [None]:
from lightgbm.sklearn import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
lgbm = LGBMRegressor()           # create
lgbm_regr= MultiOutputRegressor(lgbm)
lgbm_regr.fit(X_train, Y_train)            # train
y_pred=lgbm_regr.predict(X_val)
evaluate(y_pred,Y_val)