In [1]:
!pip install deepctr-torch



In [48]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

data = pd.read_csv('../data/restaurant_data_reformatted/synthetic_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,rating,food_rating,service_rating,smoker,drink_level,dress_preference,ambience,transport,marital_status,...,Soup_y,Southern_y,Southwestern_y,Spanish_y,Steaks_y,Sushi_y,Thai_y,Turkish_y,Vegetarian_y,Vietnamese_y
0,0,1,1,2,False,social drinker,formal,family,public,single,...,0,0,0,0,0,0,0,0,0,0
1,1,1,2,1,False,social drinker,elegant,family,public,single,...,0,0,0,0,0,0,0,0,0,0
2,2,2,1,1,False,social drinker,elegant,family,public,single,...,0,0,0,0,0,0,0,0,0,0
3,3,2,1,2,False,social drinker,elegant,family,car owner,single,...,0,0,0,0,0,0,0,0,0,0
4,4,2,1,1,False,social drinker,elegant,family,public,single,...,0,0,0,0,0,0,0,0,0,0


In [50]:
list(data.dtypes)

[dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('bool'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('in

In [5]:
data = data.drop(["food_rating", "service_rating"], axis=1)

In [6]:
sparse_features = [
    col for col in data.columns if col not in [
        "rating", "food_rating", "service_rating", "weight", "height"
    ]
]
dense_features = ["weight", "height"]

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['rating']

In [7]:
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [8]:
mms = MinMaxScaler(feature_range=(0,1))
data[dense_features] = mms.fit_transform(data[dense_features])

In [9]:
sparse_feature_columns = [
    SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4) 
    for i,feat in enumerate(sparse_features)
]

dense_feature_columns = [DenseFeat(feat, 1,) for feat in dense_features]

In [10]:
dnn_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = sparse_feature_columns + dense_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [11]:
from deepctr_torch.models import DeepFM
from torch.nn.functional import cross_entropy

train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name] for name in feature_names}

test_model_input = {name:test[name] for name in feature_names}


device = 'cpu'

model = DeepFM(linear_feature_columns,dnn_feature_columns,task='multiclass',device=device)
model.compile("adam", "mse",
              metrics=['mse'], )

history = model.fit(train_model_input,train[target].values,batch_size=256,epochs=10,verbose=2,validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)

cpu
Train on 64000 samples, validate on 16000 samples, 250 steps per epoch
Epoch 1/10
104s - loss:  0.6891 - mse:  0.6891 - val_mse:  0.6491
Epoch 2/10
131s - loss:  0.4996 - mse:  0.4996 - val_mse:  0.6472
Epoch 3/10
118s - loss:  0.2246 - mse:  0.2246 - val_mse:  0.6608
Epoch 4/10
95s - loss:  0.0901 - mse:  0.0901 - val_mse:  0.6461
Epoch 5/10
92s - loss:  0.0602 - mse:  0.0602 - val_mse:  0.6455
Epoch 6/10
109s - loss:  0.0363 - mse:  0.0363 - val_mse:  0.6464
Epoch 7/10
94s - loss:  0.0196 - mse:  0.0196 - val_mse:  0.6523
Epoch 8/10
93s - loss:  0.0100 - mse:  0.0100 - val_mse:  0.6480
Epoch 9/10
93s - loss:  0.0063 - mse:  0.0063 - val_mse:  0.6496
Epoch 10/10
94s - loss:  0.0045 - mse:  0.0045 - val_mse:  0.6494


In [51]:
test_model_input

{'Unnamed: 0': 32827    32827
 58330    58330
 78953    78953
 90339    90339
 55069    55069
          ...  
 36981    36981
 45096    45096
 42171    42171
 64108    64108
 18351    18351
 Name: Unnamed: 0, Length: 20000, dtype: int64,
 'smoker': 32827    1
 58330    0
 78953    0
 90339    1
 55069    0
         ..
 36981    0
 45096    0
 42171    1
 64108    0
 18351    1
 Name: smoker, Length: 20000, dtype: int64,
 'drink_level': 32827    2
 58330    2
 78953    1
 90339    2
 55069    0
         ..
 36981    1
 45096    2
 42171    2
 64108    2
 18351    2
 Name: drink_level, Length: 20000, dtype: int64,
 'dress_preference': 32827    1
 58330    0
 78953    2
 90339    0
 55069    0
         ..
 36981    0
 45096    3
 42171    2
 64108    3
 18351    0
 Name: dress_preference, Length: 20000, dtype: int64,
 'ambience': 32827    0
 58330    0
 78953    0
 90339    0
 55069    0
         ..
 36981    0
 45096    0
 42171    0
 64108    0
 18351    0
 Name: ambience, Length: 20000

In [12]:
pred_ans = model.predict(test_model_input, batch_size=256)

In [13]:
pred_ans

array([[1.2084347 ],
       [1.25326633],
       [0.95081556],
       ...,
       [1.06236434],
       [0.99560601],
       [1.12524641]])

In [40]:
_test_df = data[data["rating"] == 0]
_test_data = {name: _test_df[name] for name in feature_names}

ans = model.predict(_test_data, batch_size=256)

In [41]:
import numpy as np

np.mean(ans)

0.403522241515407

In [45]:
sum((1 if a[0] < 0.5 else 0) for a in ans)

10665

In [46]:
len(ans)

16834