In [54]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat,get_feature_names

In [42]:
# Load the MovieLens-1M ratings file. The fields are separated by the
# two-character token "::", which the default C parser cannot handle, so the
# python engine is requested explicitly (avoids the ParserWarning fallback).
df_ratings = pd.read_csv(
    "../data/ml-1m/ratings.dat",
    sep="::",
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
    engine="python",
)
df_ratings.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [43]:
# Load the MovieLens-1M users file. The fields are separated by the
# two-character token "::", which the default C parser cannot handle, so the
# python engine is requested explicitly (avoids the ParserWarning fallback).
df_users = pd.read_csv(
    "../data/ml-1m/users.dat",
    sep="::",
    header=None,
    names=['user_id', 'gender', 'age', 'occupation', 'zip'],
    engine="python",
)
df_users.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [45]:
# Attach each user's attributes to every rating row by joining on user_id
# (inner join, which is the pandas default).
df = df_ratings.merge(df_users, how='inner', on='user_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,gender,age,occupation,zip
0,1,1193,5,978300760,F,1,10,48067
1,1,661,3,978302109,F,1,10,48067
2,1,914,3,978301968,F,1,10,48067
3,1,3408,4,978300275,F,1,10,48067
4,1,2355,5,978824291,F,1,10,48067


In [55]:
# Label column (kept as a list so df[target] yields a 2-D selection).
target = ['rating']

# Columns treated as categorical (sparse) model inputs.
sparse_features = [
    "item_id",
    "user_id",
    "gender",
    "age",
    "occupation",
    "zip",
]

In [47]:
# 1. Label-encode every sparse feature: each column of df is overwritten in
#    place with the integer codes produced by a fresh LabelEncoder.
for column in sparse_features:
    encoder = LabelEncoder()
    encoder.fit(df[column])
    df[column] = encoder.transform(df[column])

In [50]:
# 2. Describe each sparse field to the model: one SparseFeat per column, sized
#    by the number of distinct values observed in that column.
fixlen_feature_columns = []
for feat in sparse_features:
    vocabulary_size = df[feat].nunique()
    fixlen_feature_columns.append(SparseFeat(feat, vocabulary_size))

# The linear and DNN parts share the very same column list.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [52]:
# 3. Generate input data for the model.
# The shuffle seed is pinned so that the train/test partition -- and therefore
# the test MSE reported further down -- is identical on every
# Restart & Run All instead of changing from run to run.
SPLIT_SEED = 2020
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Model inputs are dicts mapping each feature name to the matching column.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

In [53]:
# 4. Define the model, then train, predict and evaluate.

# Device selection: first CUDA device when requested and available, else CPU.
use_cuda = True
cuda_ok = use_cuda and torch.cuda.is_available()
if cuda_ok:
    print('cuda ready...')
device = 'cuda:0' if cuda_ok else 'cpu'

# DeepFM in regression mode, optimised with Adam on MSE and tracking MSE.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='regression',
    device=device,
)
model.compile("adam", "mse", metrics=['mse'])

# Fit with validation_split=0.2, i.e. part of the training data is used for
# validation during training.
history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

# Score the untouched test split and report its mean squared error.
pred_ans = model.predict(test_model_input, batch_size=256)
test_mse = mean_squared_error(test[target].values, pred_ans)
print("test MSE", round(test_mse, 4))

cpu
Train on 640133 samples, validate on 160034 samples, 2501 steps per epoch
Epoch 1/10
80s - loss:  1.0082 - mse:  1.0082 - val_mse:  0.8525
Epoch 2/10
90s - loss:  0.8411 - mse:  0.8411 - val_mse:  0.8405
Epoch 3/10
96s - loss:  0.8259 - mse:  0.8259 - val_mse:  0.8236
Epoch 4/10
107s - loss:  0.8132 - mse:  0.8132 - val_mse:  0.8221
Epoch 5/10
107s - loss:  0.8062 - mse:  0.8062 - val_mse:  0.8167
Epoch 6/10
105s - loss:  0.8005 - mse:  0.8006 - val_mse:  0.8106
Epoch 7/10
102s - loss:  0.7906 - mse:  0.7906 - val_mse:  0.8067
Epoch 8/10
110s - loss:  0.7634 - mse:  0.7634 - val_mse:  0.7747
Epoch 9/10
101s - loss:  0.7378 - mse:  0.7378 - val_mse:  0.7657
Epoch 10/10
99s - loss:  0.7178 - mse:  0.7178 - val_mse:  0.7646
test MSE 0.7622
