In [1]:
import os, sys, json, joblib

import torch
import torch.nn as nn

import pandas as pd
import numpy as np

In [2]:
# GLOBALS
LOCAL_DIR = '/Users/varunn/Documents/'
DATA_DIR = os.path.join(LOCAL_DIR, 'AV_Data')
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
INTERIM_DIR = os.path.join(DATA_DIR, 'interim')
CHALLENGE_DATA_FN = os.path.join(TRAIN_DIR, 'challenge_data.csv')
TRAIN_DATA_FN = os.path.join(TRAIN_DIR, 'train.csv')
TEST_DATA_FN = os.path.join(DATA_DIR, 'test.csv')

In [3]:
# read data used for training baseline_model
dev_fn = os.path.join(INTERIM_DIR, 'baseline_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'baseline_val_df.csv')
val_actual_items_dct_fn = os.path.join(INTERIM_DIR,
                                       'val_actual_items_dct.json')
seen_items_dct_fn = os.path.join(INTERIM_DIR, 'seen_items_dct.json')
seen_items_dct_all_fn = os.path.join(INTERIM_DIR,
                                     'seen_items_dct_all.json')
dev_df = pd.read_csv(dev_fn)
val_df = pd.read_csv(val_fn)
val_actual_items_dct = json.load(open(val_actual_items_dct_fn))
seen_items_dct = json.load(open(seen_items_dct_fn))
seen_items_dct_all = json.load(open(seen_items_dct_all_fn))

In [40]:
# convert ids to int in dicts
val_actual_items_dct = {int(idx): val for idx, val in
                        val_actual_items_dct.items()}
seen_items_dct_all = {int(idx): val for idx, val in
                      seen_items_dct_all.items()}

### create item_attr_dct for adding the item features to the data

In [7]:
# read mapping dcts
cat_cols = ['user_id', 'challenge_sequence', 'challenge',
            'programming_language', 'challenge_series_ID',
            'author_ID', 'author_gender', 'author_org_ID',
            'category_id']
d = {}
for col in cat_cols:
    inp_fn = os.path.join(INTERIM_DIR, '{}2idx.json'.format(col))
    d[col] = json.load(open(inp_fn))

print(len(d))

9


In [18]:
# read challenge data
challenge_data_fn = os.path.join(TRAIN_DIR, 'challenge_data.csv')
df_challenge = pd.read_csv(challenge_data_fn)
df_challenge.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [19]:
df_challenge.rename(columns={'challenge_ID': 'challenge'}, inplace=True)
df_challenge.fillna(value={'challenge_series_ID': 'missing',
                           'author_ID': 'missing',
                           'author_gender': 'missing',
                           'author_org_ID': 'missing'}, inplace=True)
df_challenge.fillna(value={
    'total_submissions': 0, 'programming_language': 0,
    'category_id': 0}, inplace=True)

In [20]:
# mapping values to ids
cols = [x for x in list(df_challenge.columns) if x not in
        ('publish_date', 'total_submissions')]

for col in cols:
    print(col)
    df_challenge[col] = df_challenge[col].apply(lambda x: d[col][str(x)])

challenge
programming_language
challenge_series_ID
author_ID
author_gender
author_org_ID
category_id


In [26]:
# item_attr_dct
ordered_cols = ['programming_language', 'challenge_series_ID',
                'author_ID', 'author_gender', 'author_org_ID',
                'category_id', 'total_submissions']
df_challenge['attr_lst'] = df_challenge[ordered_cols].apply(
    lambda x: list(x), axis=1)

item_attr_dct = dict(zip(df_challenge['challenge'],
                         df_challenge['attr_lst']))
len(item_attr_dct)

5606

In [27]:
# save
out_fn = os.path.join(INTERIM_DIR, 'item_attr_dct.json')
json.dump(item_attr_dct, open(out_fn, 'w'))

### Data preparation for MLP model

In [29]:
# mapping challenge_sequence to id
dev_df['challenge_sequence'] = dev_df['challenge_sequence'].apply(
    lambda x: d['challenge_sequence'][str(x)])
val_df['challenge_sequence'] = val_df['challenge_sequence'].apply(
    lambda x: d['challenge_sequence'][str(x)])

# mapping attrs to item id
for i, col in enumerate(ordered_cols):
    dev_df[col] = dev_df['challenge'].apply(lambda x: item_attr_dct[x][i])
    val_df[col] = val_df['challenge'].apply(lambda x: item_attr_dct[x][i])

In [36]:
# save
dev_fn = os.path.join(INTERIM_DIR, 'mlp_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'mlp_val_df.csv')
dev_df.to_csv(dev_fn, index=False)
val_df.to_csv(val_fn, index=False)

In [91]:
print(dev_df['sample'].value_counts())
print(dev_df[dev_df['sample'] == 'test']['user_id'].nunique())
dev_df.head()

train    883057
test     397320
Name: sample, dtype: int64
39732


Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge,sample,programming_language,challenge_series_ID,author_ID,author_gender,author_org_ID,category_id,total_submissions
0,4576_1,0,0,236,train,1.0,39.0,190.0,0.0,128.0,28.0,14723.0
1,4576_2,0,1,377,train,1.0,25.0,247.0,0.0,5.0,28.0,20993.0
2,4576_3,0,2,1439,train,1.0,111.0,1012.0,2.0,580.0,56.0,43409.0
3,4576_4,0,3,185,train,1.0,27.0,148.0,0.0,97.0,36.0,8897.0
4,4576_5,0,4,455,train,1.0,25.0,207.0,0.0,148.0,37.0,15086.0


### MLP Model Training

In [3]:
# read files
dev_fn = os.path.join(INTERIM_DIR, 'mlp_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'mlp_val_df.csv')
val_actual_items_dct_fn = os.path.join(INTERIM_DIR,
                                       'val_actual_items_dct.json')
seen_items_dct_fn = os.path.join(INTERIM_DIR, 'seen_items_dct.json')
seen_items_dct_all_fn = os.path.join(INTERIM_DIR,
                                     'seen_items_dct_all.json')
item_attr_dct_fn = os.path.join(INTERIM_DIR, 'item_attr_dct.json')
dev_df = pd.read_csv(dev_fn)
val_df = pd.read_csv(val_fn)
val_actual_items_dct = json.load(open(val_actual_items_dct_fn))
seen_items_dct = json.load(open(seen_items_dct_fn))
seen_items_dct_all = json.load(open(seen_items_dct_all_fn))
item_attr_dct = json.load(open(item_attr_dct_fn))

In [4]:
# convert ids to int in dicts
val_actual_items_dct = {int(idx): val for idx, val in
                        val_actual_items_dct.items()}
seen_items_dct = {int(idx): val for idx, val in
                  seen_items_dct.items()}
seen_items_dct_all = {int(idx): val for idx, val in
                      seen_items_dct_all.items()}
item_attr_dct = {int(idx): val for idx, val in item_attr_dct.items()}

In [5]:
# read mapping dcts
cat_cols = ['user_id', 'challenge_sequence', 'challenge',
            'programming_language', 'challenge_series_ID',
            'author_ID', 'author_gender', 'author_org_ID',
            'category_id']
d = {}
for col in cat_cols:
    inp_fn = os.path.join(INTERIM_DIR, '{}2idx.json'.format(col))
    d[col] = json.load(open(inp_fn))

print(len(d))

9


In [6]:
# GLOBALS
N_USERS = len(d['user_id'])
N_ITEMS = len(d['challenge'])
N_CHALLENGES_PER_USER = len(d['challenge_sequence'])
USE_ITEM_ATTR = False
N_PL = len(d['programming_language'])
N_CSI = len(d['challenge_series_ID'])
N_AID = len(d['author_ID'])
N_AG = len(d['author_gender'])
N_AOID = len(d['author_org_ID'])
N_CID = len(d['category_id'])
BATCH_SIZE = 500

In [7]:
# Define data loader

from torchmlp import PairwiseInteractionsMLP
from torch.utils.data import DataLoader

dev_loader = PairwiseInteractionsMLP(
    dev_df, N_ITEMS, N_CHALLENGES_PER_USER, USE_ITEM_ATTR, 
    seen_items_dct_all, item_attr_dct, seed=1)
dev_loader = DataLoader(dev_loader, batch_size=BATCH_SIZE, shuffle=True)

val_loader = PairwiseInteractionsMLP(
    val_df, N_ITEMS, N_CHALLENGES_PER_USER, USE_ITEM_ATTR,
    seen_items_dct_all, item_attr_dct, seed=1)
val_loader = DataLoader(val_loader, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
"""
from itertools import islice

for pos_cat, pos_num, neg_cat, neg_num in islice(dev_loader, 1):
    print(pos_cat)
    print('\n')
    print(pos_num)
    print('\n')
    print(neg_cat)
    print('\n')
    print(neg_num)
    print('\n')
    user = pos_cat.numpy()[:, 0]
    print(user)
"""

"\nfrom itertools import islice\n\nfor pos_cat, pos_num, neg_cat, neg_num in islice(dev_loader, 1):\n    print(pos_cat)\n    print('\n')\n    print(pos_num)\n    print('\n')\n    print(neg_cat)\n    print('\n')\n    print(neg_num)\n    print('\n')\n    user = pos_cat.numpy()[:, 0]\n    print(user)\n"

In [9]:
# Define network

def choose_embedding_size(cat_cols, cat_num_values, min_emb_dim=100):
    """
    cat_cols: list of categorical columns
    cat_num_values: list of number of unique values for each categorical column
    """

    embedded_cols = dict(zip(cat_cols, cat_num_values))
    print(embedded_cols)
    embedding_sizes = [(n_categories,
                        min(min_emb_dim, (n_categories+1)//2))
                       for _, n_categories in embedded_cols.items()]
    return embedding_sizes

In [10]:
if USE_ITEM_ATTR:
    cat_cols = ['user_id', 'challenge', 'programming_language',
                'challenge_series_ID', 'author_ID', 'author_gender',
                'author_org_ID', 'category_id', 'challenge_sequence']
    cat_num_values = [N_USERS, N_ITEMS, N_PL, N_CSI, N_AID, N_AG, N_AOID,
                      N_CID, N_CHALLENGES_PER_USER+1]
else:
    cat_cols = ['user_id', 'challenge']
    cat_num_values = [N_USERS, N_ITEMS]
embedding_sizes = choose_embedding_size(cat_cols, cat_num_values, 50)
embedding_sizes

{'user_id': 109264, 'challenge': 5606}


[(109264, 50), (5606, 50)]

In [11]:
from torchmlp import BPRModuleMLP
model = BPRModuleMLP(embedding_sizes, 1)

In [12]:
model

BPRModuleMLP(
  (embeddings): ModuleList(
    (0): Embedding(109264, 50)
    (1): Embedding(5606, 50)
  )
  (lin1): Linear(in_features=101, out_features=1, bias=True)
  (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
)

In [13]:
# Define training class

from trainingmlp import StepMLP
from torchmf import bpr_loss
net = StepMLP(model=model, n_items=N_ITEMS, item_attr_dct=item_attr_dct,
              n_challenges_per_user=N_CHALLENGES_PER_USER,
              use_item_attr=USE_ITEM_ATTR,
              actual_items_dct=val_actual_items_dct,
              seen_items_dct=seen_items_dct, loss_function=bpr_loss,
              lr=0.03, weight_decay=0.05, batch_size=BATCH_SIZE,
              num_predictions=3)

In [14]:
train_size, test_size = dev_df.shape[0], val_df.shape[0]
print(train_size, '\t', test_size, '\t', train_size//BATCH_SIZE)

1280377 	 20859 	 2560


In [15]:
import time

start = time.time()
net.batch_fit(train_loader=dev_loader, test_loader=val_loader,
              train_size=train_size, test_size=test_size, calc_mapk=True,
              epochs=1)
print('time taken: %0.2f' % ((time.time() - start)/60.))

  0%|          | 0/2560 [00:00<?, ?it/s]

Training begins...


2561it [1:29:34,  1.96s/it]                          
  0%|          | 0/41 [00:00<?, ?it/s]

Validation begins...
validation with mapk


42it [04:17,  5.70s/it]                        

{'epoch': 1, 'train_loss': tensor([0.0012]), 'val_mapk': 0.00015181296642536395, 'val_loss': tensor(0.0004)}
time taken: 93.87





In [83]:
from torch import tensor


def recommend(model_class, user, k:int = 10):
    """Recommends the top-k items to a specific user."""
    model_class.model.eval()

    item_lst = [x for x in range(model_class.n_items) if x not in
                model_class.seen_items_dct[user]]
    user_value = [user]
    cat_values, num_values = [], []
    start = time.time()
    for i, item in enumerate(item_lst):
        item_value = [model_class.item_attr_dct[item][i] for i, col in
                      enumerate(model_class.item_cols)]
        num_value = [model_class.item_attr_dct[item][6]]
        if model_class.use_item_attr:
            joint_value = [model_class.n_challenges_per_user]
            cat_value = user_value + [item] + item_value + joint_value
        else:
            cat_value = user_value + [item]
        cat_values.append(cat_value)
        num_values.append(num_value)

    cat_values = tensor(cat_values)
    cat_values = cat_values.long()
    num_values = tensor(num_values)
    num_values = num_values.double()
    scores = model_class.model.calc_pred(cat_values, num_values)
    scores = torch.sigmoid(scores)
    scores = scores.squeeze()
    sorted_scores = scores.argsort().tolist()
    return sorted_scores[::-1][:k]


def recommend_dot(model_class, user, k:int = 10):
    """Recommends the top-k items to a specific user."""
    model_class.model.eval()
    
    u = model_class.model.embeddings[0].weight[user, :]
    u = u.reshape((1, u.shape[0]))
    x_ui = torch.mm(u, model_class.model.embeddings[1].weight.t())
    pred = x_ui.squeeze().argsort().tolist()
    items_seen = model_class.seen_items_dct[user]
    sorted_pred = [x for x in pred if x not in items_seen]
    return sorted_pred[::-1][:k]

In [84]:
"""
scores, sorted_scores, rec_items = recommend(net, 19, 3)
print(rec_items)
scores = scores.detach().numpy()
scores.sort()
print(scores[::-1])
"""

'\nscores, sorted_scores, rec_items = recommend(net, 19, 3)\nprint(rec_items)\nscores = scores.detach().numpy()\nscores.sort()\nprint(scores[::-1])\n'

In [85]:
from metrics import mapk


def get_mapk(actual_items_dct, recommended_items_dct, k):

    actuals, preds = [], []
    for user in actual_items_dct:
        actual_item_lst = actual_items_dct[user]
        pred_item_lst = recommended_items_dct[user]
        actuals.append(actual_item_lst)
        preds.append(pred_item_lst)

    return mapk(actuals, preds, k)

In [86]:
# recommend
val_users = list(val_actual_items_dct.keys())

recommended_items_dct = {}
start = time.time()
for i, user in enumerate(val_users):
    if i % 500 == 0:
        print('num completed: ', i)
        print('time taken: %0.2f' % ((time.time() - start)/60.))
    recommended_items_dct[user] = recommend(net, user, 3)

num completed:  0
time taken: 0.00
num completed:  500
time taken: 0.25
num completed:  1000
time taken: 0.51
num completed:  1500
time taken: 0.77
num completed:  2000
time taken: 1.08
num completed:  2500
time taken: 1.35
num completed:  3000
time taken: 1.63
num completed:  3500
time taken: 1.89
num completed:  4000
time taken: 2.14
num completed:  4500
time taken: 2.42
num completed:  5000
time taken: 2.68
num completed:  5500
time taken: 2.96
num completed:  6000
time taken: 3.23
num completed:  6500
time taken: 3.51


In [81]:
# recommend_dot
val_users = list(val_actual_items_dct.keys())

recommended_items_dct2 = {}
start = time.time()
for i, user in enumerate(val_users):
    if i % 500 == 0:
        print('num completed: ', i)
        print('time taken: %0.2f' % ((time.time() - start)/60.))
    recommended_items_dct2[user] = recommend_dot(net, user, 3)

num completed:  0
time taken: 0.00
num completed:  500
time taken: 0.01
num completed:  1000
time taken: 0.03
num completed:  1500
time taken: 0.05
num completed:  2000
time taken: 0.06
num completed:  2500
time taken: 0.08
num completed:  3000
time taken: 0.10
num completed:  3500
time taken: 0.12
num completed:  4000
time taken: 0.13
num completed:  4500
time taken: 0.15
num completed:  5000
time taken: 0.16
num completed:  5500
time taken: 0.18
num completed:  6000
time taken: 0.19
num completed:  6500
time taken: 0.21


In [87]:
print(get_mapk(val_actual_items_dct, recommended_items_dct, 3))
print(get_mapk(val_actual_items_dct, recommended_items_dct2, 3))

0.00015181296642536395
0.0005593109289355513


In [18]:
val_actual_items_dct[19]

[1647, 1646, 497]