## Goal
1. To predict the next 3 challenges the user will be interested in solving

## Methodology
1. A simple feedforward NN with entity embeddings for challenge and user
2. Objective/Loss function - BPR (good for ranking problems)
3. Features - Only user id and challenge
4. Sampling - Combine the train & test samples. Create a validation (val) sample by holding out the last 3 challenges (challenge_sequence 11, 12 and 13) taken by a random 10% of the users in the train sample

In [2]:
import os, sys, json, joblib

import torch
import torch.nn as nn

import pandas as pd
import numpy as np

In [3]:
# GLOBALS
# Root folder that contains the AV_Data directory. The default is the
# original location; set the AV_LOCAL_DIR environment variable to run the
# notebook against a copy of the data stored elsewhere without editing code.
LOCAL_DIR = os.environ.get('AV_LOCAL_DIR', '/Users/varunn/Documents/')
DATA_DIR = os.path.join(LOCAL_DIR, 'AV_Data')
# Raw inputs: train.csv and challenge_data.csv sit under train/, while
# test.csv sits directly under DATA_DIR.
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
# Intermediate artefacts (id mappings, dev/val samples) are written here.
INTERIM_DIR = os.path.join(DATA_DIR, 'interim')
CHALLENGE_DATA_FN = os.path.join(TRAIN_DIR, 'challenge_data.csv')
TRAIN_DATA_FN = os.path.join(TRAIN_DIR, 'train.csv')
TEST_DATA_FN = os.path.join(DATA_DIR, 'test.csv')

In [4]:
# Load the three raw CSV files (train interactions, test interactions and
# challenge metadata) into DataFrames, in that order.
df_train, df_test, df_challenge = (
    pd.read_csv(csv_fn)
    for csv_fn in (TRAIN_DATA_FN, TEST_DATA_FN, CHALLENGE_DATA_FN)
)

### Exploration

In [8]:
# Train sample overview: shape, first rows, then headline counts.
print('Train\n')
print(df_train.shape, df_train.head(), '\n', sep='\n')

n_users = df_train['user_id'].nunique()
n_challenges = df_train['challenge'].nunique()
# Mean number of distinct challenges per user (printed with %d, so the
# fractional part, if any, is truncated).
n_challenge_per_user = (
    df_train.groupby('user_id')['challenge'].nunique().mean()
)
print(
    'Num Users: %d' % (n_users),
    'Num Challenges: %d' % (n_challenges),
    'Num Challenges per User: %d' % (n_challenge_per_user),
    sep='\n',
)

Train

(903916, 4)
  user_sequence  user_id  challenge_sequence challenge
0        4576_1     4576                   1   CI23714
1        4576_2     4576                   2   CI23855
2        4576_3     4576                   3   CI24917
3        4576_4     4576                   4   CI23663
4        4576_5     4576                   5   CI23933


Num Users: 69532
Num Challenges: 5348
Num Challenges per User: 13


In [9]:
# Test sample overview: shape, first rows, then headline counts.
print('Test\n')
print(df_test.shape, df_test.head(), '\n', sep='\n')

n_users = df_test['user_id'].nunique()
n_challenges = df_test['challenge'].nunique()
# Mean number of distinct challenges per user (printed with %d, so the
# fractional part, if any, is truncated).
n_challenge_per_user = (
    df_test.groupby('user_id')['challenge'].nunique().mean()
)
print(
    'Num Users: %d' % (n_users),
    'Num Challenges: %d' % (n_challenges),
    'Num Challenges per User: %d' % (n_challenge_per_user),
    sep='\n',
)

Test

(397320, 4)
  user_sequence  user_id  challenge_sequence challenge
0        4577_1     4577                   1   CI23855
1        4577_2     4577                   2   CI23933
2        4577_3     4577                   3   CI24917
3        4577_4     4577                   4   CI24915
4        4577_5     4577                   5   CI23714


Num Users: 39732
Num Challenges: 4477
Num Challenges per User: 10


In [19]:
# Overlap checks between the test and train samples. Each printed number is
# the count of distinct test-sample values that never occur in the train
# sample (0 would mean the test values are fully covered by train).
print('check if the users in test are in train')
train_users = set(df_train['user_id'].unique().tolist())
test_users = set(df_test['user_id'].unique().tolist())

print(len(test_users - train_users), '\n')

print('check if the challenges in test are in train')
# Dedicated names for the challenge sets, so train_users / test_users still
# hold user ids (not challenge ids) after this cell has run.
train_challenges = set(df_train['challenge'].unique().tolist())
test_challenges = set(df_test['challenge'].unique().tolist())

print(len(test_challenges - train_challenges), '\n')

check if the users in test are in train
39732 

check if the challenges in test are in train
154 



In [18]:
# Challenge metadata: count the distinct challenge ids, then preview the
# first rows (the last expression is rendered as the cell output).
challenge_ids = df_challenge['challenge_ID']
n_challenges = challenge_ids.nunique()
print('num challenges: %d' % (n_challenges))
df_challenge.head()

num challenges: 5606


Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [24]:
# Cardinality of every column in the challenge metadata table.
cols = df_challenge.columns.tolist()
for col in cols:
    n_unique = df_challenge[col].nunique()
    # One print call emits the column name, its count and the blank spacer.
    print(col, 'num unique values: %d' % (n_unique), '\n', sep='\n')

challenge_ID
num unique values: 5606


programming_language
num unique values: 3


challenge_series_ID
num unique values: 435


total_submissions
num unique values: 1067


publish_date
num unique values: 1145


author_ID
num unique values: 3484


author_gender
num unique values: 2


author_org_ID
num unique values: 1717


category_id
num unique values: 194




In [22]:
# How often does each (user, challenge) pair occur in the train sample?
pair_groups = df_train.groupby(['user_id', 'challenge'])
pair_counts = pair_groups['user_sequence'].count()
tmp = pair_counts.rename('num_times_challenge_taken').reset_index()
print(tmp.shape)
# Summary statistics of the per-pair counts are shown as the cell output.
tmp['num_times_challenge_taken'].describe()

(903916, 3)


count    903916.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: num_times_challenge_taken, dtype: float64

### Data Preparation
1. id mapping for categorical variables
2. Normalization for numeric variables

In [30]:
# GLOBALS
# Columns that get an integer id mapping. The first three appear in the
# train/test interaction files; the rest are challenge metadata columns.
CATEGORICAL_VARS = (
    ['user_id', 'challenge_sequence', 'challenge']
    + [
        'programming_language',
        'challenge_series_ID',
        'author_ID',
        'author_gender',
        'author_org_ID',
        'category_id',
    ]
)
# Columns treated as numeric.
NUMERIC_VARS = ['total_submissions']

In [37]:
# Build one unified interaction frame: tag each source frame with the sample
# it came from, stack them, drop exact duplicate rows and renumber the index.
df_train['sample'] = 'train'
df_test['sample'] = 'test'
cols = ['user_sequence', 'user_id', 'challenge_sequence', 'challenge',
        'sample']
df = (
    pd.concat([df_train[cols], df_test[cols]], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

print(df.shape)

(1301236, 5)


In [38]:
# id mapping for categorical variables
# Rename the challenge key so it matches the column name used in the
# train/test interaction data.
df_challenge.rename(columns={'challenge_ID': 'challenge'}, inplace=True)

for col in CATEGORICAL_VARS:
    print('Col: %s' % (col))
    # user_id and challenge_sequence are enumerated from the combined
    # train+test frame; every other column from the challenge table.
    if col in ('user_id', 'challenge_sequence'):
        values = df[col].unique().tolist()
    else:
        values = df_challenge[col].unique().tolist()
    # Index = position of the value in order of first appearance.
    value2idx = {value: idx for idx, value in enumerate(values)}
    idx2value = dict(enumerate(values))

    print(len(value2idx), '\t', len(idx2value))

    print('save')
    value2idx_fn = os.path.join(INTERIM_DIR, '{}2idx.json'.format(col))
    idx2value_fn = os.path.join(INTERIM_DIR, 'idx2{}.json'.format(col))
    # Context managers make sure each file is flushed and closed as soon as
    # it is written, instead of leaving the handle open.
    # Note: json.dump stores non-string dict keys as strings, so readers of
    # these files must look values up by their string form.
    with open(value2idx_fn, 'w') as fh:
        json.dump(value2idx, fh)
    with open(idx2value_fn, 'w') as fh:
        json.dump(idx2value, fh)
    print('\n')

Col: user_id
109264 	 109264
save


Col: challenge_sequence
13 	 13
save


Col: challenge
5606 	 5606
save


Col: programming_language
3 	 3
save


Col: challenge_series_ID
436 	 436
save


Col: author_ID
3485 	 3485
save


Col: author_gender
3 	 3
save


Col: author_org_ID
1718 	 1718
save


Col: category_id
195 	 195
save




### Sampling for baseline model

In [42]:
# GLOBALS
# Seed passed to numpy before the train users are shuffled.
SEED = 1
# Share of train-sample users whose held-out rows form the validation sample.
TEST_USER_PROPORTION = 0.1

In [40]:
# mapping discrete cols to mapped id cols

# Work on a copy so the unified frame keeps its original ids.
baseline_df = df.copy()

for col in ['user_id', 'challenge']:
    print('Col: %s' % (col))
    value2idx_fn = os.path.join(INTERIM_DIR, '{}2idx.json'.format(col))
    # Read through a context manager so the file handle is closed promptly.
    with open(value2idx_fn) as fh:
        d = json.load(fh)
    # Keys loaded from JSON are strings, hence the str() around each value.
    # A value missing from the mapping raises KeyError.
    baseline_df[col] = baseline_df[col].apply(lambda x: d[str(x)])

Col: user_id
Col: challenge


In [43]:
# sampling

# Shuffle the distinct users of the train sample reproducibly and take the
# leading TEST_USER_PROPORTION share of them as validation users.
mask = baseline_df['sample'] == 'train'
users = baseline_df.loc[mask, 'user_id'].unique().tolist()
np.random.seed(SEED)
users = list(np.random.permutation(users))
num_val_users = int(TEST_USER_PROPORTION*len(users))
val_users = users[:num_val_users]

# Validation rows: interactions of the validation users at challenge
# sequence 11, 12 or 13. Every other row goes to the dev sample.
mask1 = baseline_df['user_id'].isin(val_users)
mask2 = baseline_df['challenge_sequence'].isin([11, 12, 13])
val_rows = mask1 & mask2
val_df = baseline_df.loc[val_rows, :].reset_index(drop=True)
dev_df = baseline_df.loc[~val_rows, :].reset_index(drop=True)

print(dev_df.shape, '\t', val_df.shape)
print(dev_df['user_id'].nunique(), '\t', val_df['user_id'].nunique())

(1280377, 5) 	 (20859, 5)
109264 	 6953


In [52]:
# Actual (held-out) challenges per validation user, for the mapk calculation:
# user_id -> list of challenge ids found in val_df.
val_challenge_lists = val_df.groupby('user_id')['challenge'].apply(list)
tmp = val_challenge_lists.rename('challenge_lst').reset_index()
val_actual_items_dct = {
    user: challenge_lst
    for user, challenge_lst in zip(tmp['user_id'], tmp['challenge_lst'])
}
print(len(val_actual_items_dct))
del tmp

6953


In [54]:
# Challenges already seen per user, for the mapk calculation:
# user_id -> list of challenge ids found in dev_df.
dev_challenge_lists = dev_df.groupby('user_id')['challenge'].apply(list)
tmp = dev_challenge_lists.rename('challenge_lst').reset_index()
seen_items_dct = {
    user: challenge_lst
    for user, challenge_lst in zip(tmp['user_id'], tmp['challenge_lst'])
}
print(len(seen_items_dct))
del tmp

109264


In [53]:
# save
# Persist the dev/val samples (CSV) and the two lookup dicts (JSON) in the
# interim directory.
dev_fn = os.path.join(INTERIM_DIR, 'baseline_dev_df.csv')
val_fn = os.path.join(INTERIM_DIR, 'baseline_val_df.csv')
val_actual_items_dct_fn = os.path.join(INTERIM_DIR,
                                       'val_actual_items_dct.json')
seen_items_dct_fn = os.path.join(INTERIM_DIR, 'seen_items_dct.json')
dev_df.to_csv(dev_fn, index=False)
val_df.to_csv(val_fn, index=False)
# Context managers guarantee the JSON files are flushed and closed once
# written, instead of leaving the handles open.
with open(val_actual_items_dct_fn, 'w') as fh:
    json.dump(val_actual_items_dct, fh)
with open(seen_items_dct_fn, 'w') as fh:
    json.dump(seen_items_dct, fh)