# Wide and Deep

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

In [6]:
data_cols = ['user id', 'movie id', 'rating', 'timestamp']
item_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action',
             'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
             'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller', 'War', 'Western']
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']
user_df = pd.read_csv('mlens_k/u.user', sep='|', names=user_cols, encoding='latin-1')
item_df = pd.read_csv('mlens_k/u.item', sep='|', names=item_cols, encoding='latin-1')
data_df = pd.read_csv('mlens_k/u.data', sep='\t', names=data_cols, encoding='latin-1')
print(user_df.shape, item_df.shape, data_df.shape)
df = pd.merge(pd.merge(item_df, data_df), user_df)
df.shape

(943, 5) (1682, 24) (100000, 4)


(100000, 31)

In [9]:
# Wide
from sklearn.preprocessing import OneHotEncoder
df_wide = df[['gender', 'occupation']]
df_wide['gender_occ'] = df_wide['gender'] + "_" + df_wide['occupation']
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit(df_wide[['gender_occ']])
encoded_df = pd.DataFrame(onehot.transform(df_wide[['gender_occ']]).toarray(), columns=onehot.get_feature_names())
df_wide = df_wide.join(encoded_df)
df_wide.drop(['gender', 'occupation', 'gender_occ'], axis=1, inplace=True)
df_wide.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(100000, 41)

In [11]:
# Deep
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
df_deep = df[['age', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
              'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller', 'War',
              'Western', 'gender', 'occupation']]
df_deep['genre'] = df_deep[['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
                            'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi',
                            'Thriller', 'War', 'Western']].idxmax(1)
df_deep.drop(columns=['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
                      'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller',
                      'War', 'Western'], axis=1, inplace=True)
# Encode categorical feats
for feature in ['gender', 'occupation', 'genre']:
    encoder = LabelEncoder()
    trans_feat = encoder.fit_transform(df_deep[[feature]])
    df_deep[feature] = trans_feat
# Min-max numerical feats
for feature in ['age']:
    encoder = MinMaxScaler()
    trans_feat = encoder.fit_transform(df_deep[[feature]])
    df_deep[feature] = trans_feat

df_deep.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
  return f(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

(100000, 4)

In [13]:
def prepare_ind_var(X):
    wide_inputs = X[['x0_F_administrator', 'x0_F_artist', 'x0_F_educator', 'x0_F_engineer',
                     'x0_F_entertainment', 'x0_F_executive', 'x0_F_healthcare',
                     'x0_F_homemaker', 'x0_F_lawyer', 'x0_F_librarian', 'x0_F_marketing',
                     'x0_F_none', 'x0_F_other', 'x0_F_programmer', 'x0_F_retired',
                     'x0_F_salesman', 'x0_F_scientist', 'x0_F_student', 'x0_F_technician',
                     'x0_F_writer', 'x0_M_administrator', 'x0_M_artist', 'x0_M_doctor',
                     'x0_M_educator', 'x0_M_engineer', 'x0_M_entertainment',
                     'x0_M_executive', 'x0_M_healthcare', 'x0_M_homemaker', 'x0_M_lawyer',
                     'x0_M_librarian', 'x0_M_marketing', 'x0_M_none', 'x0_M_other',
                     'x0_M_programmer', 'x0_M_retired', 'x0_M_salesman', 'x0_M_scientist',
                     'x0_M_student', 'x0_M_technician', 'x0_M_writer']].values
    cat_input1 = X[['gender']].values
    cat_input2 = X[['occupation']].values
    cat_input3 = X[['genre']].values
    num_input = X[['age']].values
    return wide_inputs, cat_input1, cat_input2, cat_input3, num_input

In [14]:
# Split
from sklearn.model_selection import train_test_split
X = pd.concat([df_wide, df_deep], axis=1)
y = df[['rating']]
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
wide_inputs_train, cat_input1_train, cat_input2_train, cat_input3_train, num_input_train = prepare_ind_var(xtrain)
wide_inputs_test, cat_input1_test, cat_input2_test, cat_input3_test, num_input_test = prepare_ind_var(xtest)
ytrain = ytrain.values
ytest = ytest.values


### Model Arch

In [15]:
from keras.layers import Dense, Input, Flatten, Dropout, Embedding, concatenate, merge
# Deep Network
cat_values = 50
emb_dim = 64
max_sequence = 1

cat_input1 = Input(shape=(1,), name='cat_input1')
embedding1 = Embedding(cat_values, emb_dim, input_length=max_sequence, name='emb1')(cat_input1)
embedding1 = Flatten()(embedding1)
cat_input2 = Input(shape=(1,), name='cat_input2')
embedding2 = Embedding(cat_values, emb_dim, input_length=max_sequence, name='emb2')(cat_input2)
embedding2 = Flatten()(embedding2)
cat_input3 = Input(shape=(1,), name='cat_input3')
embedding3 = Embedding(cat_values, emb_dim, input_length=max_sequence, name='emb3')(cat_input3)
embedding3 = Flatten()(embedding3)

num_input = Input(shape=(1,), name='num_input')
concat_emb = concatenate([embedding1, embedding2, embedding3, num_input])
concat_emb = Dropout(0.2)(concat_emb)

x1 = Dense(64, activation='relu')(concat_emb)
x1 = Dropout(0.2)(x1)
x2 = Dense(64, activation='relu')(x1)
x2 = Dropout(0.2)(x2)
x3 = Dense(64, activation='relu')(x2)
x3 = Dropout(0.2)(x3)
x4 = Dense(64, activation='relu')(merge.add([x1, x3]))
x4 = Dropout(0.2)(x4)
x5 = Dense(64, activation='relu')(x4)
x5 = Dropout(0.2)(x5)
x6 = Dense(64, activation='relu')(x5)
x6 = Dropout(0.2)(x6)
x7 = Dense(64, activation='relu')(merge.add([x4, x6]))
x7 = Dropout(0.2)(x7)
x8 = Dense(64, activation='relu')(x7)
x8 = Dropout(0.2)(x8)
x9 = Dense(64, activation='relu')(x8)
x9 = Dropout(0.2)(x9)
deep_output = Dense(64, activation='relu')(x9)


In [16]:
# Wide Network
num_features = len(df_wide.columns)
wide_inputs = Input(shape=(num_features,), name='wide_inputs')

In [17]:
# Combined Network
x = concatenate([wide_inputs, deep_output])
x = Dropout(0.2)(x)
wad_out = Dense(1, activation='relu')(x)
wad_model = keras.Model(inputs=[wide_inputs]+[cat_input1]+[cat_input2]+[cat_input3]+[num_input], outputs=wad_out)
wad_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
cat_input1 (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
cat_input2 (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
cat_input3 (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
emb1 (Embedding)                (None, 1, 64)        3200        cat_input1[0][0]                 
______________________________________________________________________________________________

In [18]:
wad_model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [21]:
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from datetime import datetime, timedelta

timestamp = (datetime.utcnow() + timedelta(hours=8)).strftime('[%Y-%m-%d %H-%M-%S]')
tensorboard = TensorBoard(log_dir=f'./logs/{timestamp}')
early_stopping = EarlyStopping(monitor='val_loss', patience=20)
model_checkpoint = ModelCheckpoint(filepath='./models/wide_and_deep.h5', monitor='val_loss', save_weights_only=True, save_best_only=True)
callbacks = [model_checkpoint, early_stopping, tensorboard]

wad_model.fit(x={'wide_inputs': wide_inputs_train,
                'cat_input1': cat_input1_train,
                'cat_input2': cat_input2_train,
                'cat_input3': cat_input3_train,
                'num_input': num_input_train},
                y=ytrain,
                batch_size=32, epochs=20, verbose=1,
                callbacks=callbacks, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1c9b4ee6588>

In [22]:
wad_model.evaluate(x={'wide_inputs': wide_inputs_test,
                'cat_input1': cat_input1_test,
                'cat_input2': cat_input2_test,
                'cat_input3': cat_input3_test,
                'num_input': num_input_test},
                y=ytest,
                batch_size=32, verbose=1)



[1.2076777219772339, 1.2076777219772339]

In [49]:
test_input = X.iloc[125:126]
wit, cit1, cit2, cit3, nit = prepare_ind_var(test_input)
cit3_alt = np.expand_dims(np.array([1.]), axis=0)
wit, cit1, cit2, cit3, nit

(array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 0.]]),
 array([[1]]),
 array([[15]]),
 array([[4]]),
 array([[0.8030303]]))

In [50]:
cit3_alt

array([[1.]])

In [51]:
preds = wad_model.predict(x={'wide_inputs': wit,
                'cat_input1': cit1,
                'cat_input2': cit2,
                'cat_input3': cit3_alt,
                'num_input': nit},
                verbose=1)



In [41]:
preds.shape

(1, 1)

In [52]:
# Expected rating for user 125 if genre was different
preds

array([[3.4185853]], dtype=float32)