In [1]:
import pandas as pd
import numpy as np
import json
import torch
from torch import nn
from torch.utils.data.dataset import Dataset
from collections import OrderedDict
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Read the training split into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [3]:
# Extract the 'label' column as an array; it is the `y` input for the
# class-weight computation in the next cell.
label_series = train_data['label']
label = label_series.values
label

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [4]:
# Class ids, listed in the order the resulting weights are indexed:
# class_weight[0] belongs to label 0 and class_weight[1] to label 1.
# scikit-learn documents `classes` as an ndarray and recent releases reject a
# plain Python list during parameter validation, so build an array explicitly.
classes = np.array([0, 1])
# 'balanced' weights each class by n_samples / (n_classes * count(class)).
class_weight = compute_class_weight(class_weight='balanced', classes=classes, y=label)
class_weight

array([0.7770762 , 1.40227886])

In [5]:
# Run configuration.
# pretrain: when True, saved weights are loaded instead of training.
pretrain = True
num_epochs = 10
batch_size = 64
learning_rate = 1e-4

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [6]:
# Load the feature specification (field names, types, vocabulary sizes and
# column indices) from its JSON file.
with open('../data/feature_map.json') as obj:
    feature_map = json.loads(obj.read())
feature_map

{'dataset_id': 'bookcrossing',
 'num_fields': 2,
 'feature_specs': {'user_id': {'source': 'user',
   'type': 'categorical',
   'vocab_size': 278855,
   'index': 0},
  'book_id': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 271361,
   'index': 1}}}

In [7]:
# Keep only the user_id and book_id feature specs and record that the model
# therefore sees exactly two input fields.
kept_features = ('user_id', 'book_id')
feature_map['feature_specs'] = {
    name: feature_map['feature_specs'][name] for name in kept_features
}
feature_map['num_fields'] = 2

In [8]:
class BookCrossingDataset(Dataset):
    """Map-style dataset backed by a CSV file whose last column is the target.

    All columns except the last are served as the feature vector; the last
    column is served as the label. Both are converted to ``float32``.

    The conversion to NumPy happens once at construction time. Indexing the
    DataFrame with ``iloc`` on every ``__getitem__`` call (as a DataLoader
    does for each sample) is far slower than indexing a pre-built array.

    Args:
        url: Path or URL accepted by ``pandas.read_csv``.

    Attributes:
        df: The DataFrame as read from ``url``. Samples are served from arrays
            snapshotted in ``__init__``, so later edits to ``df`` are not
            reflected in the items returned.
    """

    def __init__(self, url):
        self.df = pd.read_csv(url, low_memory=False)
        # One-off conversion: features are every column but the last,
        # the target is the last column.
        self._features = self.df.iloc[:, :-1].to_numpy(dtype=np.float32)
        self._targets = self.df.iloc[:, -1].to_numpy(dtype=np.float32)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for row ``idx`` as float32 features and label."""
        # Copy so callers get an independent, contiguous array, matching the
        # fresh array the per-row ``astype`` used to produce.
        x = self._features[idx].copy()
        y = self._targets[idx]
        return x, y

    def __len__(self):
        """Number of rows in the underlying CSV."""
        return self._features.shape[0]

In [9]:
# Wrap the train and test CSV files in dataset objects.
train_dataset = BookCrossingDataset(url='../data/train.csv')
test_dataset = BookCrossingDataset(url='../data/test.csv')

In [10]:
# Mini-batch iterators: the training loader reshuffles its samples,
# the test loader keeps the file order.
loader_cls = torch.utils.data.DataLoader
train_loader = loader_cls(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = loader_cls(test_dataset, batch_size=batch_size, shuffle=False)

In [11]:
class NCF(nn.Module):
    """Embedding + MLP model that outputs one probability per input row.

    Every feature in ``feature_map['feature_specs']`` is turned into an
    ``embedding_dim``-sized vector: categorical features through an
    ``nn.Embedding`` lookup, numerical features through a bias-free
    ``nn.Linear(1, embedding_dim)``. The vectors are concatenated and passed
    through a ReLU MLP ending in a single sigmoid unit.

    Args:
        feature_map: Dict with ``'num_fields'`` (number of features fed to the
            MLP) and ``'feature_specs'`` (feature name -> spec). Each spec
            needs ``'type'`` (``'numerical'`` or ``'categorical'``) and
            ``'index'`` (column of the input tensor); categorical specs also
            need ``'vocab_size'`` and may carry ``'padding_idx'``.
        embedding_dim: Size of every per-feature embedding vector.
        hidden_units: Widths of the hidden layers, in order. Any sequence of
            ints is accepted.

    Raises:
        ValueError: If a feature spec has a type other than ``'numerical'`` or
            ``'categorical'``. Such a feature used to be skipped silently here
            and then break ``forward`` with an unrelated error.
    """

    def __init__(self,
                 feature_map,
                 embedding_dim=10,
                 hidden_units=(256, 128, 64)):
        super(NCF, self).__init__()
        self.feature_map = feature_map
        # Embedding: one sub-module per feature, keyed by feature name.
        self.embedding = nn.ModuleDict()
        for feature, feature_spec in feature_map['feature_specs'].items():
            feature_type = feature_spec['type']
            if feature_type == 'numerical':
                self.embedding[feature] = nn.Linear(
                    1, embedding_dim, bias=False)
            elif feature_type == 'categorical':
                padding_idx = feature_spec.get('padding_idx', None)
                self.embedding[feature] = nn.Embedding(feature_spec['vocab_size'],
                                                       embedding_dim,
                                                       padding_idx=padding_idx)
            else:
                raise ValueError(
                    "Unsupported type {!r} for feature {!r}; expected "
                    "'numerical' or 'categorical'".format(feature_type, feature))
        # DNN: Linear + ReLU per hidden layer, then a single-logit projection.
        input_dim = feature_map['num_fields'] * embedding_dim
        layer_sizes = [input_dim, *hidden_units]
        hidden_layers = []
        for in_dim, out_dim in zip(layer_sizes[:-1], layer_sizes[1:]):
            hidden_layers.append(nn.Linear(in_dim, out_dim))
            hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(layer_sizes[-1], 1))
        self.dnn = nn.Sequential(*hidden_layers)
        # Sigmoid squashes the logit into a probability.
        self.output_activation = nn.Sigmoid()

    def forward(self, X):
        """Score a batch.

        Args:
            X: 2-D tensor indexed as ``X[:, spec['index']]`` for each feature.

        Returns:
            1-D tensor of sigmoid outputs, one per row of ``X``.
        """
        feature_emb_list = []
        for feature, feature_spec in self.feature_map['feature_specs'].items():
            column = X[:, feature_spec['index']]
            if feature_spec['type'] == 'numerical':
                # nn.Linear expects a trailing feature dimension of size 1.
                raw_feature = column.float().view(-1, 1)
            elif feature_spec['type'] == 'categorical':
                # nn.Embedding needs integer indices.
                raw_feature = column.long()
            else:
                # Only reachable if feature_map was edited after construction.
                raise ValueError(
                    "Unsupported type {!r} for feature {!r}; expected "
                    "'numerical' or 'categorical'".format(feature_spec['type'], feature))
            embedding_vec = self.embedding[feature](raw_feature)
            feature_emb_list.append(embedding_vec)
        # (batch, num_features, embedding_dim) -> (batch, num_features * embedding_dim)
        feature_emb = torch.stack(feature_emb_list, dim=1)
        out = self.dnn(feature_emb.flatten(start_dim=1))
        y_pred = self.output_activation(out).squeeze(1)
        return y_pred

In [12]:
class BCEWithWeight(nn.Module):
    """Binary cross-entropy with a separate weight per class.

    ``weight`` is indexed by class id: ``weight[0]`` scales the loss of
    negative targets and ``weight[1]`` the loss of positive targets.
    """

    def __init__(self, weight):
        super().__init__()
        # Indexable pair: (weight for class 0, weight for class 1).
        self.weight = weight

    def forward(self, input, target):
        """Return the mean weighted BCE between probabilities and 0/1 targets."""
        eps = 1e-7
        # Keep probabilities away from 0 and 1 so both logarithms stay finite.
        prob = torch.clamp(input, min=eps, max=1 - eps)
        neg_weight, pos_weight = self.weight[0], self.weight[1]
        pos_term = pos_weight * target * torch.log(prob)
        neg_term = (1 - target) * neg_weight * torch.log(1 - prob)
        return torch.mean(-pos_term - neg_term)

In [13]:
# Build the network and place it on the selected device.
model = NCF(feature_map)
model = model.to(device)
# Class-weighted loss and Adam optimiser over all model parameters.
criterion = BCEWithWeight(weight=class_weight)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [14]:
# Either fit the model from scratch or restore the weights saved in 'ncf.pt',
# depending on the `pretrain` flag.
if not pretrain:
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for step, (X, y) in enumerate(train_loader, start=1):
            X, y = X.to(device), y.to(device)

            # Forward pass: loss between the model output and the labels
            loss = criterion(model(X), y)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Report the current mini-batch loss every 1000 steps
            if step % 1000 == 0:
                print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}".format(
                    epoch + 1, num_epochs, step, total_step, loss.item()))
else:
    model.load_state_dict(torch.load('ncf.pt'))

In [15]:
# Evaluate on the test split: a sample counts as positive when the model
# output exceeds the decision threshold, and accuracy is the share of
# predictions that match the labels.
threshold = 0.7
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X, y in test_loader:
        X, y = X.to(device), y.to(device).bool()
        y_pred = model(X) > threshold
        correct += (y_pred == y).sum().item()
        total += y.shape[0]

print('Accuracy of the model on the test samples: {:.2f} %'.format(
    100 * correct / total))

Accuracy of the model on the test samples: 70.16 %


In [16]:
# Save the weights as CPU tensors so the checkpoint loads on any machine.
# Copy each tensor to the CPU instead of calling `model.cpu()`: that call
# moves the live model in place, so re-running the evaluation cell afterwards
# would feed GPU batches to a CPU model.
state_dict = model.state_dict()
for name, tensor in state_dict.items():
    # Only the entries of this freshly built dict are replaced; the model's
    # own parameters stay on their current device.
    state_dict[name] = tensor.cpu()
torch.save(state_dict, 'ncf.pt')