# Wide&Deep

<img src="../../img/wide_deep.png">

- Wide 部分。让模型具有较强的“记忆能力”。“记忆能力”可以被理解为模型直接学习并利用历史数据中物品或者特征的“共现频率”的能力
- Deep 部分。让模型具有“泛化能力”。利用较少的特征工程，DNN 可以通过稀疏特征学习到的低维稠密向量生成更好的未知特征组合。“泛化能力”可以被理解为模型传递特征的相关性，以及发掘稀疏甚至从未出现过的稀有特征与最终标签相关性的能力

## PyTorch

In [1]:
import pandas as pd
import numpy as np
import json
import torch
from torch import nn
from torch.utils.data.dataset import Dataset
from collections import OrderedDict

In [2]:
# Optimisation hyper-parameters for the PyTorch model.
learning_rate = 1e-4
batch_size = 64
num_epochs = 5
# Run on the GPU when torch reports one as available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [3]:
# Parse the JSON description of the dataset's features into a dict.
with open('../data/movielens/feature_map.json') as feature_map_file:
    feature_map = json.load(feature_map_file)
feature_map

{'dataset_id': 'movielens',
 'num_fields': 26,
 'feature_specs': {'movieId': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 935,
   'index': 0},
  'userId': {'source': 'user',
   'type': 'categorical',
   'vocab_size': 22540,
   'index': 1},
  'rating': {'source': 'user',
   'type': 'numerical',
   'vocab_size': 1,
   'index': 2},
  'timestamp': {'source': 'user',
   'type': 'numerical',
   'vocab_size': 1,
   'index': 3},
  'releaseYear': {'source': 'item',
   'type': 'numerical',
   'vocab_size': 1,
   'index': 4},
  'movieGenre1': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 18,
   'index': 5},
  'movieGenre2': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 18,
   'index': 6},
  'movieGenre3': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 15,
   'index': 7},
  'movieRatingCount': {'source': 'item',
   'type': 'numerical',
   'vocab_size': 1,
   'index': 8},
  'movieAvgRating': {'source': 'item',
   'type': 'numerical',
 

In [4]:
class MovielensDataset(Dataset):
    """Map-style dataset backed by a CSV file held in a pandas DataFrame.

    Each item is a ``(features, target)`` pair: every column except the
    last one forms the feature vector and the last column is the target.
    Both are cast to ``float32``.
    """

    def __init__(self, url):
        # ``url`` is handed unchanged to ``pandas.read_csv``.
        self.df = pd.read_csv(url)

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.df)

    def __getitem__(self, idx):
        features = self.df.iloc[idx, :-1].to_numpy().astype(np.float32)
        target = self.df.iloc[idx, -1].astype(np.float32)
        return features, target

In [5]:
# Wrap the training and test CSV files as datasets.
train_dataset = MovielensDataset(url='../data/movielens/data_for_train.csv')
test_dataset = MovielensDataset(url='../data/movielens/data_for_test.csv')

In [6]:
# Data loaders: the training batches are reshuffled, the test batches keep
# the dataset order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)

In [7]:
class WideDeep(nn.Module):
    """Wide & Deep model producing one sigmoid probability per input row.

    Args:
        feature_map: dict with ``'num_fields'`` (number of input columns)
            and ``'feature_specs'``, a mapping from feature name to a spec
            dict holding ``'type'`` (``'numerical'`` or ``'categorical'``),
            ``'index'`` (column of the feature in the input tensor),
            ``'vocab_size'`` (categorical features only) and an optional
            ``'padding_idx'``.
        embedding_dim: size of the dense vector produced for each feature.
        hidden_units: output widths of the hidden layers of the deep part;
            any sequence of ints is accepted and it is never modified.

    Raises:
        ValueError: if a feature spec declares a type other than
            ``'numerical'`` or ``'categorical'``.

    The wide part is a linear layer applied to the batch-normalised raw
    input, so categorical ids enter it as plain numbers. The deep part is
    an MLP over the concatenated per-feature embeddings. Both logits are
    summed before the sigmoid.
    """

    def __init__(self,
                 feature_map,
                 embedding_dim=10,
                 hidden_units=(256, 128, 64)):
        super().__init__()
        self.feature_map = feature_map
        num_fields = feature_map['num_fields']
        # Embedding: numerical features are projected by a bias-free linear
        # layer, categorical features are looked up in an embedding table.
        self.embedding = nn.ModuleDict()
        for feature, feature_spec in feature_map['feature_specs'].items():
            feature_type = feature_spec['type']
            if feature_type == 'numerical':
                self.embedding[feature] = nn.Linear(1, embedding_dim, bias=False)
            elif feature_type == 'categorical':
                padding_idx = feature_spec.get('padding_idx', None)
                self.embedding[feature] = nn.Embedding(feature_spec['vocab_size'],
                                                       embedding_dim,
                                                       padding_idx=padding_idx)
            else:
                # Fail at construction instead of with an unrelated error
                # in forward().
                raise ValueError(
                    "Unsupported type {!r} for feature {!r}; expected "
                    "'numerical' or 'categorical'".format(feature_type, feature))
        # Wide Part
        self.batch_norm = nn.BatchNorm1d(num_fields)
        self.wide_part = nn.Linear(num_fields, 1)
        # Deep Part: copy hidden_units into a fresh list so the caller's
        # sequence (or the default) is never touched.
        layer_dims = [num_fields * embedding_dim] + list(hidden_units)
        deep_layers = []
        for in_dim, out_dim in zip(layer_dims, layer_dims[1:]):
            deep_layers.append(nn.Linear(in_dim, out_dim))
            deep_layers.append(nn.ReLU())
        deep_layers.append(nn.Linear(layer_dims[-1], 1))
        self.deep_part = nn.Sequential(*deep_layers)
        # Sigmoid
        self.output_activation = nn.Sigmoid()

    def forward(self, X):
        """Return a 1-D tensor with one probability per row of ``X``.

        ``X`` is indexed as ``X[:, index]`` for every feature and fed whole
        to ``BatchNorm1d(num_fields)``, so it must be 2-D with
        ``num_fields`` columns.
        """
        feature_emb_list = []
        for feature, feature_spec in self.feature_map['feature_specs'].items():
            column = X[:, feature_spec['index']]
            if feature_spec['type'] == 'numerical':
                raw_feature = column.float().view(-1, 1)
            else:
                # __init__ only admits 'numerical' and 'categorical'.
                raw_feature = column.long()
            feature_emb_list.append(self.embedding[feature](raw_feature))
        feature_emb = torch.stack(feature_emb_list, dim=1)
        wide_logit = self.wide_part(self.batch_norm(X))
        deep_logit = self.deep_part(feature_emb.flatten(start_dim=1))
        y_pred = self.output_activation(wide_logit + deep_logit).squeeze(1)
        return y_pred

In [8]:
# Build the network on the selected device.
model = WideDeep(feature_map).to(device)
# Loss and optimizer: binary cross-entropy on the model's sigmoid output,
# minimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)
log_template = "Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
for epoch in range(num_epochs):
    for step, (X, y) in enumerate(train_loader, start=1):
        X, y = X.to(device), y.to(device)

        # Forward pass
        output = model(X)
        loss = criterion(output, y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss of the current batch every 300 steps.
        if step % 300 == 0:
            print(log_template.format(
                epoch + 1, num_epochs, step, total_step, loss.item()))

Epoch [1/5], Step [300/1388] Loss: 0.2630
Epoch [1/5], Step [600/1388] Loss: 0.0458
Epoch [1/5], Step [900/1388] Loss: 0.0240
Epoch [1/5], Step [1200/1388] Loss: 0.0078
Epoch [2/5], Step [300/1388] Loss: 0.0144
Epoch [2/5], Step [600/1388] Loss: 0.0547
Epoch [2/5], Step [900/1388] Loss: 0.0086
Epoch [2/5], Step [1200/1388] Loss: 0.0035
Epoch [3/5], Step [300/1388] Loss: 0.0040
Epoch [3/5], Step [600/1388] Loss: 0.0041
Epoch [3/5], Step [900/1388] Loss: 0.0203
Epoch [3/5], Step [1200/1388] Loss: 0.0010
Epoch [4/5], Step [300/1388] Loss: 0.0013
Epoch [4/5], Step [600/1388] Loss: 0.0008
Epoch [4/5], Step [900/1388] Loss: 0.0006
Epoch [4/5], Step [1200/1388] Loss: 0.0006
Epoch [5/5], Step [300/1388] Loss: 0.0004
Epoch [5/5], Step [600/1388] Loss: 0.0001
Epoch [5/5], Step [900/1388] Loss: 0.0018
Epoch [5/5], Step [1200/1388] Loss: 0.0034


In [9]:
# Test the model
# Switch to evaluation mode (the model contains a BatchNorm1d layer) and
# disable gradient tracking while scoring the test batches.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X, y in test_loader:
        X = X.to(device)
        y = y.to(device).bool()
        # A probability above 0.5 counts as a positive prediction.
        y_pred = model(X) > 0.5
        correct += (y_pred == y).sum().item()
        total += y.shape[0]

print('Accuracy of the model on the test images: {:.2f} %'.format(
    100 * correct / total))

Accuracy of the model on the test images: 99.88 %


## TensorFlow

### 读取数据

In [1]:
import tensorflow as tf

In [2]:
from pathlib import Path

# `tf.keras.utils.get_file` expects `origin` to be a URL. A bare relative
# path only works when the file is already in the Keras cache, so the local
# CSV paths are converted to absolute file:// URIs first.

# Training samples path, change to your local path
training_samples_file_path = tf.keras.utils.get_file(
    'train.csv', Path('../data/movielens/train.csv').resolve().as_uri())
# Test samples path, change to your local path
test_samples_file_path = tf.keras.utils.get_file(
    'test.csv', Path('../data/movielens/test.csv').resolve().as_uri())


# load sample as tf dataset
def get_dataset(file_path, batch_size=12):
    """Build a batched tf.data dataset from a CSV file.

    Args:
        file_path: CSV path or pattern passed to ``make_csv_dataset``.
        batch_size: number of rows per batch (12 by default).

    Returns:
        The dataset created by ``tf.data.experimental.make_csv_dataset``
        with the ``'label'`` column as the label, ``'0'`` as the NA marker,
        a single pass over the data (``num_epochs=1``) and
        ``ignore_errors=True``.
    """
    return tf.data.experimental.make_csv_dataset(file_path,
                                                 batch_size=batch_size,
                                                 label_name='label',
                                                 na_value='0',
                                                 num_epochs=1,
                                                 ignore_errors=True)


# Build one dataset from the training samples file and one from the test
# samples file.
train_data, test_data = (
    get_dataset(training_samples_file_path),
    get_dataset(test_samples_file_path),
)

### 特征工程

In [3]:
# genre features vocabulary
genre_vocab = [
    'Film-Noir',
    'Action',
    'Adventure',
    'Horror',
    'Romance',
    'War',
    'Comedy',
    'Western',
    'Documentary',
    'Sci-Fi',
    'Drama',
    'Thriller',
    'Crime',
    'Fantasy',
    'Animation',
    'IMAX',
    'Mystery',
    'Children',
    'Musical',
]

# Every user / movie genre slot maps to the same vocabulary list.
GENRE_FEATURES = {
    genre_slot: genre_vocab
    for genre_slot in (
        'userGenre1',
        'userGenre2',
        'userGenre3',
        'userGenre4',
        'userGenre5',
        'movieGenre1',
        'movieGenre2',
        'movieGenre3',
    )
}

# all categorical features
# One 10-dimensional embedding column per genre slot, in GENRE_FEATURES order.
categorical_columns = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=genre_slot, vocabulary_list=slot_vocab),
        dimension=10)
    for genre_slot, slot_vocab in GENRE_FEATURES.items()
]

# movie id embedding feature
movie_col = tf.feature_column.categorical_column_with_identity(
    key='movieId', num_buckets=1001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, dimension=10)

# user id embedding feature
user_col = tf.feature_column.categorical_column_with_identity(
    key='userId', num_buckets=30001)
user_emb_col = tf.feature_column.embedding_column(user_col, dimension=10)

# The id embeddings follow the genre embeddings: movie first, then user.
categorical_columns += [movie_emb_col, user_emb_col]

# all numerical features
numerical_columns = [
    tf.feature_column.numeric_column(column_name)
    for column_name in (
        'releaseYear',
        'movieRatingCount',
        'movieAvgRating',
        'movieRatingStddev',
        'userRatingCount',
        'userAvgRating',
        'userRatingStddev',
    )
]

# cross feature between current movie and user historical movie
# The (movieId, userRatedMovie1) pair is hashed into 10000 buckets and
# wrapped in an indicator column.
rated_movie = tf.feature_column.categorical_column_with_identity(
    key='userRatedMovie1', num_buckets=1001)
crossed_feature = tf.feature_column.indicator_column(
    tf.feature_column.crossed_column(keys=[movie_col, rated_movie],
                                     hash_bucket_size=10000))

### 模型训练与预测

In [4]:
# define input for keras model
# One scalar-per-example Input per feature, keyed by the feature name and
# created in the order listed here.
inputs = {
    feature_name: tf.keras.layers.Input(name=feature_name,
                                        shape=(),
                                        dtype=feature_dtype)
    for feature_name, feature_dtype in (
        ('movieAvgRating', 'float32'),
        ('movieRatingStddev', 'float32'),
        ('movieRatingCount', 'int32'),
        ('userAvgRating', 'float32'),
        ('userRatingStddev', 'float32'),
        ('userRatingCount', 'int32'),
        ('releaseYear', 'int32'),
        ('movieId', 'int32'),
        ('userId', 'int32'),
        ('userRatedMovie1', 'int32'),
        ('userGenre1', 'string'),
        ('userGenre2', 'string'),
        ('userGenre3', 'string'),
        ('userGenre4', 'string'),
        ('userGenre5', 'string'),
        ('movieGenre1', 'string'),
        ('movieGenre2', 'string'),
        ('movieGenre3', 'string'),
    )
}

# wide and deep model architecture
# deep part: numerical and embedded categorical features go through two
# 128-unit ReLU layers
deep_features = tf.keras.layers.DenseFeatures(numerical_columns +
                                              categorical_columns)
deep = deep_features(inputs)
deep = tf.keras.layers.Dense(units=128, activation='relu')(deep)
deep = tf.keras.layers.Dense(units=128, activation='relu')(deep)
# wide part: the crossed movie feature only
wide = tf.keras.layers.DenseFeatures(crossed_feature)(inputs)
# both parts are concatenated and mapped to a single sigmoid output
both = tf.keras.layers.concatenate([deep, wide])
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid')(both)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# compile the model, set loss function, optimizer and evaluation metrics
evaluation_metrics = [
    'accuracy',
    tf.keras.metrics.AUC(curve='ROC'),
    tf.keras.metrics.AUC(curve='PR'),
]
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=evaluation_metrics)

# train the model
model.fit(train_data, epochs=5)

# evaluate the model; the values come back as loss followed by the metrics
# in the order they were given to compile()
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(test_data)
report_template = '\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'
print(report_template.format(test_loss, test_accuracy, test_roc_auc,
                             test_pr_auc))

Epoch 1/5


  [n for n in tensors.keys() if n not in ref_input_names])


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Test Loss 0.6169010400772095, Test Accuracy 0.6810160279273987, Test ROC AUC 0.7509854435920715, Test PR AUC 0.7776556015014648


In [5]:
# print some predict results
# make_csv_dataset shuffles by default, so iterating test_data once for
# predict() and a second time for the labels pairs predictions with labels
# of other rows. Fetch a single batch and predict on exactly those features
# so each prediction is printed next to its own label. This also avoids
# materialising the whole dataset with list(test_data).
sample_features, sample_labels = next(iter(test_data))
predictions = model.predict(sample_features)
for prediction, goodRating in zip(predictions[:12], sample_labels[:12]):
    print("Predicted good rating: {:.2%}".format(prediction[0]),
          " | Actual rating label: ",
          ("Good Rating" if bool(goodRating) else "Bad Rating"))

Predicted good rating: 81.60%  | Actual rating label:  Good Rating
Predicted good rating: 48.66%  | Actual rating label:  Bad Rating
Predicted good rating: 93.40%  | Actual rating label:  Good Rating
Predicted good rating: 68.30%  | Actual rating label:  Good Rating
Predicted good rating: 91.08%  | Actual rating label:  Good Rating
Predicted good rating: 82.65%  | Actual rating label:  Good Rating
Predicted good rating: 82.66%  | Actual rating label:  Bad Rating
Predicted good rating: 93.71%  | Actual rating label:  Bad Rating
Predicted good rating: 91.90%  | Actual rating label:  Good Rating
Predicted good rating: 14.88%  | Actual rating label:  Good Rating
Predicted good rating: 70.62%  | Actual rating label:  Good Rating
Predicted good rating: 93.95%  | Actual rating label:  Good Rating
