## DeepFM

- Library used: DeepCTR-Torch — https://github.com/shenweichen/DeepCTR-Torch
- DeepFM paper (IJCAI 2017) — https://www.ijcai.org/Proceedings/2017/0239.pdf

In [3]:
!pip install -U deepctr-torch

Collecting deepctr-torch
[?25l  Downloading https://files.pythonhosted.org/packages/66/5c/2b047c03215cebc0a9accfc9621bb963a1d6eba8ab8765b4fabe8d160e84/deepctr_torch-0.2.1-py3-none-any.whl (53kB)
[K    100% |████████████████████████████████| 61kB 579kB/s ta 0:00:011
Installing collected packages: deepctr-torch
Successfully installed deepctr-torch-0.2.1
[33mYou are using pip version 19.0.3, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [36]:
import pandas as pd
import torch
import time
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import DeepFM

In [5]:
import sys, os, json
# Make modules under src/ importable; the path is relative to the current working directory.
sys.path.append("src/")
# NOTE(review): the wildcard import hides which names come from `constants`;
# PREPARED_DATA_DIR and MODEL_DIR are used later and are presumably defined there — confirm.
from constants import *

In [7]:
# Resolve both prepared HDF5 files first, then load the 'stage' table from each
train_fn = os.path.join(PREPARED_DATA_DIR, 'user_train_data_1.h5')
test_fn = os.path.join(PREPARED_DATA_DIR, 'user_test_data_1.h5')

df_train = pd.read_hdf(train_fn, key='stage')
df_test = pd.read_hdf(test_fn, key='stage')

In [8]:
# (rows, columns) of the train and test frames, one per line, then a preview of train
print(df_train.shape, df_test.shape, sep='\n')

df_train.head()

(22851074, 20)
(240538, 20)


Unnamed: 0,User,Rating,Date,Movie,Rating_class,days_since_first_user_rating,sqrt_days_since_first_user_rating,rating_age_days_user,rating_age_weeks_user,rating_age_months_user,mean_ratings_user,num_ratings_user,days_since_first_item_rating,sqrt_days_since_first_item_rating,rating_age_days_item,rating_age_weeks_item,rating_age_months_item,mean_ratings_movie,weighted_mean_ratings_movie,num_ratings_movie
0,161459,4.0,2004-07-17,2138,0,23,4.795832,251,35.857143,8.366667,3.396365,28,1611,40.137264,2143,306.142857,71.433333,3.526814,3.527663,21220
1,87375,2.0,2004-03-14,3253,0,13,3.605551,617,88.142857,20.566667,4.3337,163,395,19.874607,1052,150.285714,35.066667,2.977046,2.979649,59554
2,191296,2.0,2005-12-23,1154,0,453,21.283797,455,65.0,15.166667,3.955031,108,507,22.51666,514,73.428571,17.133333,3.818879,3.790705,1695
3,27266,5.0,2004-09-26,1201,1,15,3.872983,429,61.285714,14.3,3.757806,124,1754,41.880783,2215,316.428571,73.833333,3.771652,3.77108,74899
4,175666,3.0,2004-08-03,4377,0,446,21.118712,835,119.285714,27.833333,3.280928,51,565,23.769729,1080,154.285714,36.0,3.48806,3.518392,670


In [9]:
# Per-column count of missing values in the training frame
df_train.isna().sum()

User                                 0
Rating                               0
Date                                 0
Movie                                0
Rating_class                         0
days_since_first_user_rating         0
sqrt_days_since_first_user_rating    0
rating_age_days_user                 0
rating_age_weeks_user                0
rating_age_months_user               0
mean_ratings_user                    0
num_ratings_user                     0
days_since_first_item_rating         0
sqrt_days_since_first_item_rating    0
rating_age_days_item                 0
rating_age_weeks_item                0
rating_age_months_item               0
mean_ratings_movie                   0
weighted_mean_ratings_movie          0
num_ratings_movie                    0
dtype: int64

In [10]:
# Per-column count of missing values in the test frame
df_test.isna().sum()

User                                 0
Rating                               0
Date                                 0
Movie                                0
Rating_class                         0
days_since_first_user_rating         0
sqrt_days_since_first_user_rating    0
rating_age_days_user                 0
rating_age_weeks_user                0
rating_age_months_user               0
mean_ratings_user                    0
num_ratings_user                     0
days_since_first_item_rating         0
sqrt_days_since_first_item_rating    0
rating_age_days_item                 0
rating_age_weeks_item                0
rating_age_months_item               0
mean_ratings_movie                   0
weighted_mean_ratings_movie          0
num_ratings_movie                    0
dtype: int64

In [12]:
# Categorical id columns (these get embedding tables)
sparse_features = ['User', 'Movie']

# Numeric columns fed to the model as dense inputs; the order here fixes the input order.
# NOTE(review): the *_days / *_weeks / *_months variants look like the same quantity in
# different units (they coincide after min-max scaling) — confirm all three are needed.
dense_features = [
    # user-side features
    'days_since_first_user_rating',
    'sqrt_days_since_first_user_rating',
    'rating_age_days_user',
    'rating_age_weeks_user',
    'rating_age_months_user',
    'mean_ratings_user',
    'num_ratings_user',
    # item-side features
    'days_since_first_item_rating',
    'sqrt_days_since_first_item_rating',
    'rating_age_days_item',
    'rating_age_weeks_item',
    'rating_age_months_item',
    'mean_ratings_movie',
    'weighted_mean_ratings_movie',
    'num_ratings_movie',
]

# Regression target column
target = ['Rating']

### Simple transformation for dense features

In [14]:
# Scale every dense feature with a min-max scaler configured for the (0, 1) range.
mms = MinMaxScaler(feature_range=(0, 1))
print('Train')
# Fit on the training split and overwrite its dense columns in place with the scaled values.
%time df_train[dense_features] = mms.fit_transform(df_train[dense_features])

print('Test')
# Reuse the scaler fitted on train (transform only, no refit) for the test split.
%time df_test[dense_features] = mms.transform(df_test[dense_features])

Train


  return self.partial_fit(X, y)


CPU times: user 1min 14s, sys: 1min 52s, total: 3min 7s
Wall time: 3min 56s
Test
CPU times: user 525 ms, sys: 222 ms, total: 747 ms
Wall time: 934 ms


In [15]:
# Preview the training frame after the dense columns were scaled in place
df_train.head()

Unnamed: 0,User,Rating,Date,Movie,Rating_class,days_since_first_user_rating,sqrt_days_since_first_user_rating,rating_age_days_user,rating_age_weeks_user,rating_age_months_user,mean_ratings_user,num_ratings_user,days_since_first_item_rating,sqrt_days_since_first_item_rating,rating_age_days_item,rating_age_weeks_item,rating_age_months_item,mean_ratings_movie,weighted_mean_ratings_movie,num_ratings_movie
0,161459,4.0,2004-07-17,2138,0,0.010502,0.102481,0.114664,0.114664,0.114664,0.599091,0.006867,0.718555,0.847676,0.954462,0.954462,0.954462,0.659489,0.575193,0.11487
1,87375,2.0,2004-03-14,3253,0,0.005936,0.077046,0.281864,0.281864,0.281864,0.833425,0.0412,0.176182,0.41974,0.452622,0.452622,0.452622,0.495637,0.364382,0.322688
2,191296,2.0,2005-12-23,1154,0,0.206849,0.454807,0.207857,0.207857,0.207857,0.738758,0.027213,0.226137,0.475539,0.205152,0.205152,0.205152,0.746536,0.67638,0.009021
3,27266,5.0,2004-09-26,1201,1,0.006849,0.082761,0.19598,0.19598,0.19598,0.689451,0.031282,0.782337,0.884498,0.98758,0.98758,0.98758,0.73246,0.668831,0.405877
4,175666,3.0,2004-08-03,4377,0,0.203653,0.451279,0.381453,0.381453,0.381453,0.570232,0.012716,0.252007,0.502003,0.465501,0.465501,0.465501,0.647939,0.571627,0.003464


In [16]:
# Globals: vocabulary sizes later passed to the User and Movie SparseFeat columns.
# NOTE(review): assumes the User / Movie ids in the data are already encoded as
# 0..N-1 so they fit these table sizes — confirm against the data preparation step.
N_USERS = 480189
N_ITEMS = 17770

In [19]:
# One SparseFeat per sparse field; vocabulary sizes are matched to the
# field names by position (User -> N_USERS, Movie -> N_ITEMS).
sparse_features_count = [N_USERS, N_ITEMS]

fixlen_feature_columns = [
    SparseFeat(name=field, vocabulary_size=vocab_size, embedding_dim=100)
    for field, vocab_size in zip(sparse_features, sparse_features_count)
]

In [20]:
# Inspect the sparse feature columns built so far
fixlen_feature_columns

[SparseFeat(name='User', vocabulary_size=480189, embedding_dim=100, use_hash=False, dtype='int32', embedding_name='User', group_name='default_group'),
 SparseFeat(name='Movie', vocabulary_size=17770, embedding_dim=100, use_hash=False, dtype='int32', embedding_name='Movie', group_name='default_group')]

In [23]:
# Append one 1-dimensional DenseFeat per dense feature.
# The list is rebuilt from its leading sparse columns rather than extended in place
# with `+=`, so re-running this cell cannot append the dense columns a second time.
fixlen_feature_columns = (
    fixlen_feature_columns[:len(sparse_features)]
    + [DenseFeat(feat, 1) for feat in dense_features]
)

In [24]:
# Inspect the combined sparse + dense feature columns
fixlen_feature_columns

[SparseFeat(name='User', vocabulary_size=480189, embedding_dim=100, use_hash=False, dtype='int32', embedding_name='User', group_name='default_group'),
 SparseFeat(name='Movie', vocabulary_size=17770, embedding_dim=100, use_hash=False, dtype='int32', embedding_name='Movie', group_name='default_group'),
 DenseFeat(name='days_since_first_user_rating', dimension=1, dtype='float32'),
 DenseFeat(name='sqrt_days_since_first_user_rating', dimension=1, dtype='float32'),
 DenseFeat(name='rating_age_days_user', dimension=1, dtype='float32'),
 DenseFeat(name='rating_age_weeks_user', dimension=1, dtype='float32'),
 DenseFeat(name='rating_age_months_user', dimension=1, dtype='float32'),
 DenseFeat(name='mean_ratings_user', dimension=1, dtype='float32'),
 DenseFeat(name='num_ratings_user', dimension=1, dtype='float32'),
 DenseFeat(name='days_since_first_item_rating', dimension=1, dtype='float32'),
 DenseFeat(name='sqrt_days_since_first_item_rating', dimension=1, dtype='float32'),
 DenseFeat(name='rat

In [25]:
# The DNN part and the linear part share the very same column list object
dnn_feature_columns = linear_feature_columns = fixlen_feature_columns

In [26]:
# Names of all model input columns, derived from both column lists
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [27]:
# Inspect the resulting input feature names
feature_names

['User',
 'Movie',
 'days_since_first_user_rating',
 'sqrt_days_since_first_user_rating',
 'rating_age_days_user',
 'rating_age_weeks_user',
 'rating_age_months_user',
 'mean_ratings_user',
 'num_ratings_user',
 'days_since_first_item_rating',
 'sqrt_days_since_first_item_rating',
 'rating_age_days_item',
 'rating_age_weeks_item',
 'rating_age_months_item',
 'mean_ratings_movie',
 'weighted_mean_ratings_movie',
 'num_ratings_movie']

In [28]:
# Build the model inputs: a dict mapping each feature name to the matching
# DataFrame column, once for the train split and once for the test split.
train_model_input = {name: df_train[name] for name in feature_names}
test_model_input = {name: df_test[name] for name in feature_names}

In [30]:
# Sanity check: container type of the model input and of one of its values
type(train_model_input), type(train_model_input['User'])

(dict, pandas.core.series.Series)

In [31]:
    # Default to CPU; switch to the first CUDA device only when it is both
    # requested (use_cuda) and actually available.
    use_cuda = True
    device = 'cpu'
    if use_cuda and torch.cuda.is_available():
        device = 'cuda:0'
        print('cuda ready...')

In [32]:
# Show which device was selected
device

'cpu'

In [33]:
# DeepFM configured for regression, with L2 regularisation on the embeddings,
# created on the selected device.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='regression',
    l2_reg_embedding=1e-5,
    device=device,
)

In [34]:
# Display the model's module structure
model

DeepFM(
  (embedding_dict): ModuleDict(
    (Movie): Embedding(17770, 100)
    (User): Embedding(480189, 100)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (Movie): Embedding(17770, 1)
      (User): Embedding(480189, 1)
    )
  )
  (out): PredictionLayer()
  (fm): FM()
  (dnn): DNN(
    (dropout): Dropout(p=0, inplace=False)
    (linears): ModuleList(
      (0): Linear(in_features=215, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=128, bias=True)
    )
    (activation_layers): ModuleList(
      (0): ReLU(inplace=True)
      (1): ReLU(inplace=True)
    )
  )
  (dnn_linear): Linear(in_features=128, out_features=1, bias=False)
)

In [35]:
# Adam optimizer, MSE as training loss, MSE also reported as the metric
model.compile(optimizer="adam", loss="mse", metrics=['mse'])

In [37]:
# Train for 3 epochs on the full training split (no validation hold-out)
# and report the wall-clock duration in seconds.
start = time.time()
model.fit(
    x=train_model_input,
    y=df_train[target].values,
    batch_size=5000,
    epochs=3,
    validation_split=0.0,
    verbose=2,
)
print('time taken: %0.2f' % (time.time() - start))

cpu
Train on 22851074 samples, validate on 0 samples, 4571 steps per epoch
Epoch 1/3
6518s - loss:  0.9029 - mse:  0.9028
Epoch 2/3
6370s - loss:  0.7517 - mse:  0.7517
Epoch 3/3
6279s - loss:  0.6240 - mse:  0.6240
time taken: 19224.59


In [42]:
# Inspect the metric name -> function mapping held by the model
# (presumably populated by the model.compile call — confirm)
model.metrics.items()

dict_items([('mse', <function mean_squared_error at 0x12de48598>)])

In [43]:
# Score the trained model on the held-out test split
eval_result = model.evaluate(
    x=test_model_input,
    y=df_test[target].values,
    batch_size=256,
)

In [44]:
# Show the evaluation metrics returned for the test split
eval_result

{'mse': 0.7576680043989446}

In [45]:
# Test RMSE = sqrt(test MSE). Plain exponentiation is used so this cell does not
# depend on `np`, which this notebook never imports explicitly.
eval_result['mse'] ** 0.5

0.8704412699309153

In [47]:
# Persist the learned weights (the model's state_dict) under MODEL_DIR.
model_fn = os.path.join(MODEL_DIR, 'NN_DeepFM_FBaseline_E3.pt')
# Create the target directory first: saving into a missing directory would fail
# and throw away a multi-hour training run.
os.makedirs(os.path.dirname(model_fn) or '.', exist_ok=True)
torch.save(model.state_dict(), model_fn)