## Goal
1. To predict the next 3 challenges the user will be interested in solving

## Methodology
1. A simple feedforward NN with entity embeddings for challenge and user
2. Objective/Loss function - BPR (good for ranking problems)
3. Features - Only user id and challenge
4. Sampling - Combine train & test samples. Create a val sample by keeping aside the last 3 challenges taken by x% of the users in the train sample

In [1]:
import os, sys, json, joblib

import torch
import torch.nn as nn

import pandas as pd
import numpy as np

In [2]:
# GLOBALS
LOCAL_DIR = '/Users/varunn/Documents/'
DATA_DIR = os.path.join(LOCAL_DIR, 'AV_Data')
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
INTERIM_DIR = os.path.join(DATA_DIR, 'interim')
CHALLENGE_DATA_FN = os.path.join(TRAIN_DIR, 'challenge_data.csv')
TRAIN_DATA_FN = os.path.join(TRAIN_DIR, 'train.csv')
TEST_DATA_FN = os.path.join(DATA_DIR, 'test.csv')

In [3]:
# read data
df_train = pd.read_csv(TRAIN_DATA_FN)
df_test = pd.read_csv(TEST_DATA_FN)
df_challenge = pd.read_csv(CHALLENGE_DATA_FN)

### Exploration

In [4]:
print('Train\n')
print(df_train.shape)
print(df_train.head())
print('\n')

n_users = df_train['user_id'].nunique()
n_challenges = df_train['challenge'].nunique()
n_challenge_per_user = df_train.groupby('user_id')['challenge'].nunique(
    ).mean()
print('Num Users: %d' % (n_users))
print('Num Challenges: %d' % (n_challenges))
print('Num Challenges per User: %d' % (n_challenge_per_user))

Train

(903916, 4)
  user_sequence  user_id  challenge_sequence challenge
0        4576_1     4576                   1   CI23714
1        4576_2     4576                   2   CI23855
2        4576_3     4576                   3   CI24917
3        4576_4     4576                   4   CI23663
4        4576_5     4576                   5   CI23933


Num Users: 69532
Num Challenges: 5348
Num Challenges per User: 13


In [5]:
print('Test\n')
print(df_test.shape)
print(df_test.head())
print('\n')

n_users = df_test['user_id'].nunique()
n_challenges = df_test['challenge'].nunique()
n_challenge_per_user = df_test.groupby('user_id')['challenge'].nunique(
    ).mean()
print('Num Users: %d' % (n_users))
print('Num Challenges: %d' % (n_challenges))
print('Num Challenges per User: %d' % (n_challenge_per_user))

Test

(397320, 4)
  user_sequence  user_id  challenge_sequence challenge
0        4577_1     4577                   1   CI23855
1        4577_2     4577                   2   CI23933
2        4577_3     4577                   3   CI24917
3        4577_4     4577                   4   CI24915
4        4577_5     4577                   5   CI23714


Num Users: 39732
Num Challenges: 4477
Num Challenges per User: 10


In [6]:
print('check if the users in test are in train')
train_users = set(df_train['user_id'].unique().tolist())
test_users = set(df_test['user_id'].unique().tolist())

print(len(test_users - train_users), '\n')

print('check if the challenges in test are in train')
train_users = set(df_train['challenge'].unique().tolist())
test_users = set(df_test['challenge'].unique().tolist())

print(len(test_users - train_users), '\n')

check if the users in test are in train
39732 

check if the challenges in test are in train
154 



In [7]:
# challenge data
n_challenges = df_challenge['challenge_ID'].nunique()
print('num challenges: %d' % (n_challenges))
print('shape: ', df_challenge.shape)
df_challenge.head()

num challenges: 5606
shape:  (5606, 9)


Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [8]:
df_challenge.isnull().sum()

challenge_ID               0
programming_language       0
challenge_series_ID       12
total_submissions        352
publish_date               0
author_ID                 39
author_gender             97
author_org_ID            248
category_id             1841
dtype: int64

In [9]:
# number of unique values
cols = list(df_challenge.columns)
for col in cols:
    print(col)
    print('num unique values: %d' % (df_challenge[col].nunique()))
    print('\n')

challenge_ID
num unique values: 5606


programming_language
num unique values: 3


challenge_series_ID
num unique values: 435


total_submissions
num unique values: 1067


publish_date
num unique values: 1145


author_ID
num unique values: 3484


author_gender
num unique values: 2


author_org_ID
num unique values: 1717


category_id
num unique values: 194




In [10]:
# number of interactions between a user and a challenge
tmp = df_train.groupby(['user_id', 'challenge'])['user_sequence'].count(
    ).rename('num_times_challenge_taken').reset_index()
print(tmp.shape)
tmp['num_times_challenge_taken'].describe()

(903916, 3)


count    903916.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: num_times_challenge_taken, dtype: float64

### Data Preparation
1. id mapping for categorical variables
2. Normalization for numeric variables

In [20]:
# GLOBALS
CATEGORICAL_VARS = ['user_id', 'challenge_sequence', 'challenge',
                    'programming_language', 'challenge_series_ID',
                    'author_ID', 'author_gender', 'author_org_ID',
                    'category_id']
NUMERIC_VARS = ['total_submissions']

In [21]:
# append train and test data to create a unified dataset
cols = ['user_sequence', 'user_id', 'challenge_sequence', 'challenge',
        'sample']
df_train['sample'] = 'train'
df_test['sample'] = 'test'
df = pd.concat([df_train[cols], df_test[cols]], axis=0)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

print(df.shape)

(1301236, 5)


In [22]:
# id mapping for categorical variables
df_challenge.rename(columns={'challenge_ID': 'challenge'}, inplace=True)
df_challenge.fillna(value={'challenge_series_ID': 'missing',
                           'author_ID': 'missing',
                           'author_gender': 'missing',
                           'author_org_ID': 'missing'}, inplace=True)
df_challenge.fillna(value={
    'total_submissions': 0, 'programming_language': 0,
    'category_id': 0}, inplace=True)
print(df_challenge.isnull().sum())

for col in CATEGORICAL_VARS:
    print('Col: %s' % (col))
    if col in ('user_id', 'challenge_sequence'):
        values = df[col].unique().tolist()
    else:
        values = df_challenge[col].unique().tolist()
    value2idx = {value: idx for idx, value in enumerate(values)}
    idx2value = {idx: value for idx, value in enumerate(values)}
    
    print(len(value2idx), '\t', len(idx2value))
    
    print('save')
    value2idx_fn = os.path.join(INTERIM_DIR, '{}2idx.json'.format(col))
    idx2value_fn = os.path.join(INTERIM_DIR, 'idx2{}.json'.format(col))
    json.dump(value2idx, open(value2idx_fn, 'w'))
    json.dump(idx2value, open(idx2value_fn, 'w'))
    print('\n')

challenge               0
programming_language    0
challenge_series_ID     0
total_submissions       0
publish_date            0
author_ID               0
author_gender           0
author_org_ID           0
category_id             0
dtype: int64
Col: user_id
109264 	 109264
save


Col: challenge_sequence
13 	 13
save


Col: challenge
5606 	 5606
save


Col: programming_language
3 	 3
save


Col: challenge_series_ID
436 	 436
save


Col: author_ID
3485 	 3485
save


Col: author_gender
3 	 3
save


Col: author_org_ID
1718 	 1718
save


Col: category_id
195 	 195
save




### Sampling for baseline model

In [7]:
# GLOBALS
TEST_USER_PROPORTION = 0.1
SEED = 1

In [8]:
# mapping discrete cols to mapped id cols

baseline_df = df.copy()

for col in ['user_id', 'challenge']:
    print('Col: %s' % (col))
    value2idx_fn = os.path.join(INTERIM_DIR, '{}2idx.json'.format(col))
    d = json.load(open(value2idx_fn))
    baseline_df[col] = baseline_df[col].apply(lambda x: d[str(x)])

Col: user_id
Col: challenge


In [9]:
# sampling

mask = baseline_df['sample'] == 'train'
users = baseline_df.loc[mask, 'user_id'].unique().tolist()
np.random.seed(SEED)
users = list(np.random.permutation(users))
num_val_users = int(TEST_USER_PROPORTION*len(users))
val_users = users[:num_val_users]

mask1 = baseline_df['user_id'].isin(val_users)
mask2 = baseline_df['challenge_sequence'].isin([11, 12, 13])
val_df = baseline_df.loc[mask1&mask2, :]
val_df.reset_index(drop=True, inplace=True)
dev_df = baseline_df.loc[~(mask1&mask2), :]
dev_df.reset_index(drop=True, inplace=True)

print(dev_df.shape, '\t', val_df.shape)
print(dev_df['user_id'].nunique(), '\t', val_df['user_id'].nunique())

(1280377, 5) 	 (20859, 5)
109264 	 6953


In [10]:
# get actual items dct on val sample for mapk calculation
tmp = val_df.groupby('user_id')['challenge'].apply(list).rename(
    'challenge_lst').reset_index()
val_actual_items_dct = dict(zip(tmp['user_id'], tmp['challenge_lst']))
print(len(val_actual_items_dct))
del tmp

6953


In [11]:
# get seen items dct for mapk calculation
tmp = dev_df.groupby('user_id')['challenge'].apply(list).rename(
    'challenge_lst').reset_index()
seen_items_dct = dict(zip(tmp['user_id'], tmp['challenge_lst']))
print(len(seen_items_dct))
del tmp

109264


In [12]:
# get seen items dct for mapk calculation
tmp = baseline_df.groupby('user_id')['challenge'].apply(list).rename(
    'challenge_lst').reset_index()
seen_items_dct_all = dict(zip(tmp['user_id'], tmp['challenge_lst']))
print(len(seen_items_dct_all))
del tmp

109264


In [13]:
# save
dev_fn = os.path.join(INTERIM_DIR, 'baseline_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'baseline_val_df.csv')
val_actual_items_dct_fn = os.path.join(INTERIM_DIR,
                                       'val_actual_items_dct.json')
seen_items_dct_fn = os.path.join(INTERIM_DIR, 'seen_items_dct.json')
seen_items_dct_all_fn = os.path.join(INTERIM_DIR, 'seen_items_dct_all.json')
dev_df.to_csv(dev_fn, index=False)
val_df.to_csv(val_fn, index=False)
json.dump(val_actual_items_dct, open(val_actual_items_dct_fn, 'w'))
json.dump(seen_items_dct, open(seen_items_dct_fn, 'w'))
json.dump(seen_items_dct_all, open(seen_items_dct_all_fn, 'w'))

### Baseline Model Training

In [5]:
# GLOBALS
N_USERS = 109264
N_ITEMS = 5606
BATCH_SIZE = 500

In [6]:
# read files
dev_fn = os.path.join(INTERIM_DIR, 'baseline_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'baseline_val_df.csv')
val_actual_items_dct_fn = os.path.join(INTERIM_DIR,
                                       'val_actual_items_dct.json')
seen_items_dct_fn = os.path.join(INTERIM_DIR, 'seen_items_dct.json')
dev_df = pd.read_csv(dev_fn)
val_df = pd.read_csv(val_fn)
val_actual_items_dct = json.load(open(val_actual_items_dct_fn))
seen_items_dct = json.load(open(seen_items_dct_fn))

In [7]:
# convert ids to int in dicts
val_actual_items_dct = {int(idx): val for idx, val in
                        val_actual_items_dct.items()}
seen_items_dct = {int(idx): val for idx, val in seen_items_dct.items()}

In [8]:
# Define data loader

from torchmf import PairwiseInteractions
from scipy import sparse as sp

def get_interactions(data, n_users, n_items):

    interactions = np.zeros((n_users, n_items))
    for _, row in data.iterrows():
        interactions[row['user_id'], row['challenge']] = 1
    return sp.coo_matrix(interactions)

In [9]:
%time dev_interactions = get_interactions(dev_df, N_USERS, N_ITEMS)
%time val_interactions = get_interactions(val_df, N_USERS, N_ITEMS)

CPU times: user 2min 39s, sys: 3.74 s, total: 2min 42s
Wall time: 2min 43s
CPU times: user 8.28 s, sys: 1.96 s, total: 10.2 s
Wall time: 10.2 s


In [10]:
from torch.utils.data import DataLoader

%time dev_loader = PairwiseInteractions(dev_interactions)
%time dev_loader = DataLoader(dev_loader, batch_size=BATCH_SIZE, shuffle=True)

%time val_loader = PairwiseInteractions(val_interactions)
%time val_loader = DataLoader(val_loader, batch_size=BATCH_SIZE, shuffle=True)

CPU times: user 21.1 ms, sys: 9.04 ms, total: 30.1 ms
Wall time: 28.4 ms
CPU times: user 91 µs, sys: 1 µs, total: 92 µs
Wall time: 97 µs
CPU times: user 1.65 ms, sys: 450 µs, total: 2.1 ms
Wall time: 1.88 ms
CPU times: user 45 µs, sys: 1 µs, total: 46 µs
Wall time: 49.1 µs


In [27]:
# Define network

from torchmf import BPRModule
model = BPRModule(n_users=N_USERS, n_items=N_ITEMS, n_factors=100,
                  dropout_p=0.7)

In [28]:
model

BPRModule(
  (pred_model): BaseModule(
    (user_biases): Embedding(109264, 1)
    (item_biases): Embedding(5606, 1)
    (user_embeddings): Embedding(109264, 100)
    (item_embeddings): Embedding(5606, 100)
    (dropout): Dropout(p=0.7, inplace=False)
  )
)

In [29]:
# Define training class

from training import Step
from torchmf import bpr_loss
net = Step(model=model, actual_items_dct=val_actual_items_dct,
           seen_items_dct=seen_items_dct, loss_function=bpr_loss,
           lr=0.03, weight_decay=0.05, batch_size=BATCH_SIZE,
           num_predictions=3)

In [30]:
train_size, test_size = dev_df.shape[0], val_df.shape[0]
print(train_size, '\t', test_size, '\t', train_size//BATCH_SIZE)

1280377 	 20859 	 2560


In [31]:
import time

start = time.time()
net.batch_fit(train_loader=dev_loader, test_loader=val_loader,
              train_size=train_size, test_size=test_size, calc_mapk=True,
              epochs=2)
print('time taken: %0.2f' % (time.time() - start))

  0%|          | 0/2560 [00:00<?, ?it/s]

Training begins...


2561it [19:04,  1.48it/s]                          
  0%|          | 0/41 [00:00<?, ?it/s]

Validation begins...
validation with mapk


42it [00:23,  1.82it/s]                        
  0%|          | 0/2560 [00:00<?, ?it/s]

{'epoch': 1, 'train_loss': tensor([0.1106]), 'val_mapk': 0.01197724403534845, 'val_loss': tensor(0.1179)}
Training begins...


2561it [28:37,  1.37it/s]                          
  0%|          | 0/41 [00:00<?, ?it/s]

Validation begins...
validation with mapk


42it [00:22,  1.89it/s]                        

{'epoch': 2, 'train_loss': tensor([0.1024]), 'val_mapk': 0.011697588570880674, 'val_loss': tensor(0.1177)}
time taken: 2907.69





In [16]:
net.metrics

[{'epoch': 1,
  'train_loss': tensor([0.1308]),
  'val_mapk': 0.012025184972114354,
  'val_loss': tensor(0.1374)},
 {'epoch': 2,
  'train_loss': tensor([0.1229]),
  'val_mapk': 0.011841411381178387,
  'val_loss': tensor(0.1384)},
 {'epoch': 3,
  'train_loss': tensor([0.1228]),
  'val_mapk': 0.011681608258625372,
  'val_loss': tensor(0.1381)}]

In [33]:
row = dev_df.iloc[0]
row['challenge']

236