In [1]:
import os, sys, json, joblib

import torch
import torch.nn as nn

import pandas as pd
import numpy as np

In [2]:
# GLOBALS
LOCAL_DIR = '/Users/varunn/Documents/'
DATA_DIR = os.path.join(LOCAL_DIR, 'AV_Data')
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
INTERIM_DIR = os.path.join(DATA_DIR, 'interim')
CHALLENGE_DATA_FN = os.path.join(TRAIN_DIR, 'challenge_data.csv')
TRAIN_DATA_FN = os.path.join(TRAIN_DIR, 'train.csv')
TEST_DATA_FN = os.path.join(DATA_DIR, 'test.csv')

In [5]:
# read data used for training baseline_model
dev_fn = os.path.join(INTERIM_DIR, 'baseline_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'baseline_val_df.csv')
val_actual_items_dct_fn = os.path.join(INTERIM_DIR,
                                       'val_actual_items_dct.json')
seen_items_dct_fn = os.path.join(INTERIM_DIR, 'seen_items_dct.json')
seen_items_dct_all_fn = os.path.join(INTERIM_DIR,
                                     'seen_items_dct_all.json')
dev_df = pd.read_csv(dev_fn)
val_df = pd.read_csv(val_fn)
val_actual_items_dct = json.load(open(val_actual_items_dct_fn))
seen_items_dct = json.load(open(seen_items_dct_fn))
seen_items_dct_all = json.load(open(seen_items_dct_all_fn))

In [40]:
# convert ids to int in dicts
val_actual_items_dct = {int(idx): val for idx, val in
                        val_actual_items_dct.items()}
seen_items_dct_all = {int(idx): val for idx, val in
                      seen_items_dct_all.items()}

### create item_attr_dct for adding the item features to the data

In [7]:
# read mapping dcts
cat_cols = ['user_id', 'challenge_sequence', 'challenge',
            'programming_language', 'challenge_series_ID',
            'author_ID', 'author_gender', 'author_org_ID',
            'category_id']
d = {}
for col in cat_cols:
    inp_fn = os.path.join(INTERIM_DIR, '{}2idx.json'.format(col))
    d[col] = json.load(open(inp_fn))

print(len(d))

9


In [18]:
# read challenge data
challenge_data_fn = os.path.join(TRAIN_DIR, 'challenge_data.csv')
df_challenge = pd.read_csv(challenge_data_fn)
df_challenge.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [19]:
df_challenge.rename(columns={'challenge_ID': 'challenge'}, inplace=True)
df_challenge.fillna(value={'challenge_series_ID': 'missing',
                           'author_ID': 'missing',
                           'author_gender': 'missing',
                           'author_org_ID': 'missing'}, inplace=True)
df_challenge.fillna(value={
    'total_submissions': 0, 'programming_language': 0,
    'category_id': 0}, inplace=True)

In [20]:
# mapping values to ids
cols = [x for x in list(df_challenge.columns) if x not in
        ('publish_date', 'total_submissions')]

for col in cols:
    print(col)
    df_challenge[col] = df_challenge[col].apply(lambda x: d[col][str(x)])

challenge
programming_language
challenge_series_ID
author_ID
author_gender
author_org_ID
category_id


In [26]:
# item_attr_dct
ordered_cols = ['programming_language', 'challenge_series_ID',
                'author_ID', 'author_gender', 'author_org_ID',
                'category_id', 'total_submissions']
df_challenge['attr_lst'] = df_challenge[ordered_cols].apply(
    lambda x: list(x), axis=1)

item_attr_dct = dict(zip(df_challenge['challenge'],
                         df_challenge['attr_lst']))
len(item_attr_dct)

5606

In [27]:
# save
out_fn = os.path.join(INTERIM_DIR, 'item_attr_dct.json')
json.dump(item_attr_dct, open(out_fn, 'w'))

### Data preparation for MLP model

In [29]:
# mapping challenge_sequence to id
dev_df['challenge_sequence'] = dev_df['challenge_sequence'].apply(
    lambda x: d['challenge_sequence'][str(x)])
val_df['challenge_sequence'] = val_df['challenge_sequence'].apply(
    lambda x: d['challenge_sequence'][str(x)])

# mapping attrs to item id
for i, col in enumerate(ordered_cols):
    dev_df[col] = dev_df['challenge'].apply(lambda x: item_attr_dct[x][i])
    val_df[col] = val_df['challenge'].apply(lambda x: item_attr_dct[x][i])

In [36]:
# save
dev_fn = os.path.join(INTERIM_DIR, 'mlp_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'mlp_val_df.csv')
dev_df.to_csv(dev_fn, index=False)
val_df.to_csv(val_fn, index=False)

### MLP Model Training

In [3]:
# read files
dev_fn = os.path.join(INTERIM_DIR, 'mlp_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'mlp_val_df.csv')
val_actual_items_dct_fn = os.path.join(INTERIM_DIR,
                                       'val_actual_items_dct.json')
seen_items_dct_fn = os.path.join(INTERIM_DIR, 'seen_items_dct.json')
seen_items_dct_all_fn = os.path.join(INTERIM_DIR,
                                     'seen_items_dct_all.json')
item_attr_dct_fn = os.path.join(INTERIM_DIR, 'item_attr_dct.json')
dev_df = pd.read_csv(dev_fn)
val_df = pd.read_csv(val_fn)
val_actual_items_dct = json.load(open(val_actual_items_dct_fn))
seen_items_dct = json.load(open(seen_items_dct_fn))
seen_items_dct_all = json.load(open(seen_items_dct_all_fn))
item_attr_dct = json.load(open(item_attr_dct_fn))

In [4]:
# convert ids to int in dicts
val_actual_items_dct = {int(idx): val for idx, val in
                        val_actual_items_dct.items()}
seen_items_dct = {int(idx): val for idx, val in
                  seen_items_dct.items()}
seen_items_dct_all = {int(idx): val for idx, val in
                      seen_items_dct_all.items()}
item_attr_dct = {int(idx): val for idx, val in item_attr_dct.items()}

In [5]:
# read mapping dcts
cat_cols = ['user_id', 'challenge_sequence', 'challenge',
            'programming_language', 'challenge_series_ID',
            'author_ID', 'author_gender', 'author_org_ID',
            'category_id']
d = {}
for col in cat_cols:
    inp_fn = os.path.join(INTERIM_DIR, '{}2idx.json'.format(col))
    d[col] = json.load(open(inp_fn))

print(len(d))

9


In [6]:
# GLOBALS
N_USERS = len(d['user_id'])
N_ITEMS = len(d['challenge'])
N_CHALLENGES_PER_USER = len(d['challenge_sequence'])
N_PL = len(d['programming_language'])
N_CSI = len(d['challenge_series_ID'])
N_AID = len(d['author_ID'])
N_AG = len(d['author_gender'])
N_AOID = len(d['author_org_ID'])
N_CID = len(d['category_id'])
BATCH_SIZE = 500

In [7]:
# Define data loader

from torchmlp import PairwiseInteractionsMLP
from torch.utils.data import DataLoader

dev_loader = PairwiseInteractionsMLP(
    dev_df, N_ITEMS, N_CHALLENGES_PER_USER, seen_items_dct_all,
    item_attr_dct, seed=1)
dev_loader = DataLoader(dev_loader, batch_size=BATCH_SIZE, shuffle=True)

val_loader = PairwiseInteractionsMLP(
    val_df, N_ITEMS, N_CHALLENGES_PER_USER, seen_items_dct_all,
    item_attr_dct, seed=1)
val_loader = DataLoader(val_loader, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
"""
from itertools import islice

for pos_cat, pos_num, neg_cat, neg_num in islice(dev_loader, 1):
    print(pos_cat)
    print('\n')
    print(pos_num)
    print('\n')
    print(neg_cat)
    print('\n')
    print(neg_num)
    print('\n')
    user = pos_cat.numpy()[:, 0]
    print(user)
"""

"\nfrom itertools import islice\n\nfor pos_cat, pos_num, neg_cat, neg_num in islice(dev_loader, 1):\n    print(pos_cat)\n    print('\n')\n    print(pos_num)\n    print('\n')\n    print(neg_cat)\n    print('\n')\n    print(neg_num)\n    print('\n')\n    user = pos_cat.numpy()[:, 0]\n    print(user)\n"

In [9]:
# Define network

def choose_embedding_size(cat_cols, cat_num_values, min_emb_dim=100):
    """
    cat_cols: list of categorical columns
    cat_num_values: list of number of unique values for each categorical column
    """

    embedded_cols = dict(zip(cat_cols, cat_num_values))
    print(embedded_cols)
    embedding_sizes = [(n_categories,
                        min(min_emb_dim, (n_categories+1)//2))
                       for _, n_categories in embedded_cols.items()]
    return embedding_sizes

In [10]:
cat_cols = ['user_id', 'challenge', 'programming_language',
            'challenge_series_ID', 'author_ID', 'author_gender',
            'author_org_ID', 'category_id', 'challenge_sequence']
cat_num_values = [N_USERS, N_ITEMS, N_PL, N_CSI, N_AID, N_AG, N_AOID,
                  N_CID, N_CHALLENGES_PER_USER+1]
embedding_sizes = choose_embedding_size(cat_cols, cat_num_values, 50)
embedding_sizes

{'user_id': 109264, 'challenge': 5606, 'programming_language': 3, 'challenge_series_ID': 436, 'author_ID': 3485, 'author_gender': 3, 'author_org_ID': 1718, 'category_id': 195, 'challenge_sequence': 14}


[(109264, 50),
 (5606, 50),
 (3, 2),
 (436, 50),
 (3485, 50),
 (3, 2),
 (1718, 50),
 (195, 50),
 (14, 7)]

In [11]:
from torchmlp import BPRModuleMLP
model = BPRModuleMLP(embedding_sizes, 1)

In [12]:
model

BPRModuleMLP(
  (embeddings): ModuleList(
    (0): Embedding(109264, 50)
    (1): Embedding(5606, 50)
    (2): Embedding(3, 2)
    (3): Embedding(436, 50)
    (4): Embedding(3485, 50)
    (5): Embedding(3, 2)
    (6): Embedding(1718, 50)
    (7): Embedding(195, 50)
    (8): Embedding(14, 7)
  )
  (lin1): Linear(in_features=312, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=1, bias=True)
  (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.6, inplace=False)
)

In [13]:
# Define training class

from trainingmlp import StepMLP
from torchmf import bpr_loss
net = StepMLP(model=model, n_items=N_ITEMS, item_attr_dct=item_attr_dct,
              n_challenges_per_user=N_CHALLENGES_PER_USER,
              actual_items_dct=val_actual_items_dct,
              seen_items_dct=seen_items_dct, loss_function=bpr_loss,
              lr=0.03, weight_decay=0.05, batch_size=BATCH_SIZE,
              num_predictions=3)

In [14]:
train_size, test_size = dev_df.shape[0], val_df.shape[0]
print(train_size, '\t', test_size, '\t', train_size//BATCH_SIZE)

1280377 	 20859 	 2560


In [15]:
import time

start = time.time()
net.batch_fit(train_loader=dev_loader, test_loader=val_loader,
              train_size=train_size, test_size=test_size, calc_mapk=True,
              epochs=2)
print('time taken: %0.2f' % ((time.time() - start)/60.))

  0%|          | 0/2560 [00:00<?, ?it/s]

Training begins...


2561it [1:41:43,  2.21s/it]                          
  0%|          | 0/41 [00:00<?, ?it/s]

Validation begins...
validation with mapk


 12%|█▏        | 5/41 [39:31<4:45:03, 475.10s/it]


KeyboardInterrupt: 

In [36]:
from torch import tensor


def recommend(model_class, user_lst, k:int = 10):
    """Recommends the top-k items to a specific user."""
    model_class.model.eval()

    for user in user_lst:
        item_lst = [x for x in range(model_class.n_items) if x not in
                    model_class.seen_items_dct[user]]
        user_value = [user]
        cat_values, num_values = [], []
        start = time.time()
        for i, item in enumerate(item_lst):
            if i % 1000 == 0:
                print('num completed: ', i)
                print('time taken: %0.2f' % ((time.time() - start)/60.))
            item_value = [model_class.item_attr_dct[item][i] for i, col in
                          enumerate(model_class.item_cols)]
            num_value = [model_class.item_attr_dct[item][6]]
            joint_value = [model_class.n_challenges_per_user]
            cat_value = user_value + [item] + item_value + joint_value
            cat_values.append(cat_value)
            num_values.append(num_value)

    cat_values = tensor(cat_values)
    cat_values = cat_values.long()
    num_values = tensor(num_values)
    num_values = num_values.double()
    scores = model_class.model.calc_pred(cat_values, num_values)
    scores = scores.squeeze()
    sorted_scores = scores.argsort().tolist()
    return sorted_scores[::-1][:k]

In [37]:
%time rec_items = recommend(net, 0, k=3)

num completed:  0
time taken: 0.00
num completed:  1000
time taken: 0.00
num completed:  2000
time taken: 0.00
num completed:  3000
time taken: 0.00
num completed:  4000
time taken: 0.00
num completed:  5000
time taken: 0.00
CPU times: user 264 ms, sys: 54.9 ms, total: 319 ms
Wall time: 123 ms


In [38]:
rec_items

[5592, 1860, 1861]

In [41]:
(0.150*40000)/3600.

1.6666666666666667