In order to facilitate your work with the data we provide several dumb benchmarks in this notebook.

In [6]:
import numpy as np 
import pandas as pd
import scipy.sparse
import matplotlib.pyplot as plt
%matplotlib inline

# Custom metric is implemented here
#from scorer import scorer
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression

In [7]:
#!pip install tqdm

In [8]:
#!pip install joblib

# Load data

In [9]:
# Use custom dtypes for efficiency
dtypes = {'id1': np.int16, 'id2': np.int16, 'id3': np.int16, 'user_id': np.int32, 'date': np.int16}

train = pd.read_csv('train.csv', dtype=dtypes)
train.head(5)

Unnamed: 0,id3,user_id,id2,date,id1
0,714,464300,34,1,4
1,714,915655,34,1,4
2,316,262696,42,1,2
3,52,354280,4,1,10
4,581,218912,14,1,10


# Simulate train/test split

Select last 7 days to be validation set

In [10]:
date_validation_start = train.date.max() - 6
print (date_validation_start)

48


In [11]:
def calculate_target(data, date_test_start):
    '''
        This function returns a dictionary of type {user: items_list}
        Such that user viewed an item in testing period, 
        but did not view it within the last 3 weeks of train period.
    '''
    
    test_mask = (data.date >= date_test_start) & (data.date < date_test_start + 7)
    last_3weeks_mask = (data.date >= date_test_start - 21 + 1) & (data.date < date_test_start)
    
    # Items that used viewed during test period
    items_test = data[test_mask].groupby('user_id').id3.apply(set)
    
    # Items, that user viewd in last 3 weeks
    user_last_3weeks = data[last_3weeks_mask].groupby('user_id').id3.apply(set)
    
    # Get table, where for each `user_id` we have both items from test period and 3 weeks
    joined = items_test.reset_index().merge(user_last_3weeks.reset_index(), on=['user_id'], how='left')
    joined.set_index('user_id', inplace=True)
    
    # Remove the items, which the user viewed during last 3 weeks 
    target = {}
    for user_id, (id3_x, id3_y) in joined.iterrows():   
        items = id3_x if id3_y is np.nan else id3_x - id3_y
        if items != set(): target.update({user_id: items})

    return target

# This function may take several minutes to finish
y_val_dict = calculate_target(train, date_validation_start)

In [12]:
y_val_dict

{2: {142, 481, 528},
 5: {269, 630},
 7: {129, 581},
 9: {97, 134, 218, 872, 911},
 10: {185, 581, 619, 875},
 11: {271, 749, 908},
 14: {224, 650},
 18: {176, 377},
 23: {901},
 24: {547, 569, 695},
 27: {20, 125, 277, 447, 451, 481, 545, 775, 853, 876, 920},
 28: {152},
 30: {51,
  58,
  76,
  77,
  81,
  88,
  93,
  105,
  109,
  134,
  136,
  137,
  151,
  195,
  222,
  223,
  295,
  296,
  318,
  366,
  385,
  392,
  393,
  419,
  571,
  572,
  587,
  604,
  637,
  640,
  645,
  662,
  685,
  712,
  750,
  755,
  760,
  773,
  798,
  800,
  817,
  841,
  843,
  875,
  890,
  903,
  904,
  908},
 31: {581, 685, 875},
 32: {700},
 33: {38, 190, 735},
 36: {237},
 41: {92, 145},
 45: {41, 145, 204, 224, 328, 610, 648, 725, 883},
 51: {93,
  129,
  215,
  274,
  283,
  398,
  425,
  441,
  594,
  610,
  612,
  662,
  746,
  788,
  817},
 53: {286},
 54: {410, 660},
 56: {263, 669, 725},
 58: {113},
 59: {753},
 62: {221, 545, 740},
 64: {134},
 66: {69, 92, 132, 320, 488, 581, 610, 86

# Benchmarks

### Random guess

The most simple benchmark: select 5% users at random and assign them items randomly.

In [15]:
ids = train.id3.unique()
users = train.user_id[train.date < date_validation_start].unique()
num_users = len(users)

# Select random users
users_random_subset = np.random.choice(users, int(np.ceil(num_users * .05)), replace=False)
 
# Select 5 random items for each user 
y_pred_dict = {user: np.random.choice(ids, 5) for user in users_random_subset}

# Compute score 
#score = scorer(y_val_dict, y_pred_dict, num_users)
#print ("Random benchmark's score: %f" % score)

### ML benchmark

We will create a simple benchmark using some machine learning. 

In [16]:
mask_train = train.date < date_validation_start #обучают на всех данных
mask_test = (train.date < date_validation_start) & (train.date >= train.date.min() + 7) #последняя неделя

# For the sake of speed select only first 10k users to train on
users_mask = train.user_id < 10000
mask_train = mask_train & users_mask

In [17]:
def get_feats(data):
    '''
        Builds sparse matrix using users' history.
    '''
    return scipy.sparse.coo_matrix(([1] * data.shape[0], (data.user_id, data.id3)), 
                                    shape =[data.user_id.max()+1, data.id3.max()+1]).tocsr()

def get_target_matrix(X, target_dict):
    '''
        Builds sparse matrix using dictionary.
    '''
    indptr = [0]
    indices = []
    data = []
    vocabulary = {}

    ks = []
    for k in tqdm(range(X.user_id.max()+1)):
        d = target_dict.get(k, [])
        for y in d:
            indices.append(y)
            data.append(1)
        indptr.append(len(indices))

    return scipy.sparse.csr_matrix((data, indices, indptr), dtype=int, shape =[X.user_id.max()+1, X.id3.max()+1])

In [18]:
# For each user count how many items he viewed
X_train = get_feats(train.loc[mask_train])
X_test = get_feats(train.loc[mask_test])

y_train_dict = calculate_target(train.loc[users_mask], date_validation_start - 7)
y_train = get_target_matrix(train.loc[mask_train], y_train_dict)
y_test = get_target_matrix(train.loc[mask_test], y_val_dict)

100%|██████████| 10000/10000 [00:00<00:00, 931115.75it/s]
100%|██████████| 1179572/1179572 [00:01<00:00, 1152622.00it/s]


In [20]:
X_train[15].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0

In [21]:
y_train[0].todense().sum()

2

In [22]:
target = y_train[:, 610].toarray().ravel()
print (len(target), target)

10000 [0 0 0 ..., 0 0 0]


In [23]:
np.zeros((X_test.shape[0], )) - 1 

array([-1., -1., -1., ..., -1., -1., -1.])

#### For every id3 fit a separate Logistic Regression model

In [24]:
def fit(i):
    target = y_train[:, i].toarray().ravel()
    
    if target.mean() == 0:
        return np.zeros((X_test.shape[0], )) - 1 
    
    d = LogisticRegression(max_iter=10)
    d.fit(X_train, target)
    return (d.predict_proba(X_test)[:, 1])

preds = Parallel(n_jobs = 8, verbose=50)(delayed(fit)(i) for i in range(y_train.shape[1]))

preds = np.vstack(preds).T

# To reduce memory usage
preds = preds.astype(np.float16)

[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Batch computation too fast (0.0751s.) Setting batch_size=4.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done  15 tasks      | elapsed:    1.2s
[Pa

[Parallel(n_jobs=8)]: Done 213 tasks      | elapsed:   11.3s
[Parallel(n_jobs=8)]: Done 214 tasks      | elapsed:   11.3s
[Parallel(n_jobs=8)]: Done 215 tasks      | elapsed:   11.3s
[Parallel(n_jobs=8)]: Done 216 tasks      | elapsed:   11.4s
[Parallel(n_jobs=8)]: Done 217 tasks      | elapsed:   11.5s
[Parallel(n_jobs=8)]: Done 218 tasks      | elapsed:   11.5s
[Parallel(n_jobs=8)]: Done 219 tasks      | elapsed:   11.5s
[Parallel(n_jobs=8)]: Done 220 tasks      | elapsed:   11.6s
[Parallel(n_jobs=8)]: Done 221 tasks      | elapsed:   11.6s
[Parallel(n_jobs=8)]: Done 222 tasks      | elapsed:   11.7s
[Parallel(n_jobs=8)]: Done 223 tasks      | elapsed:   11.7s
[Parallel(n_jobs=8)]: Done 224 tasks      | elapsed:   11.7s
[Parallel(n_jobs=8)]: Done 225 tasks      | elapsed:   11.7s
[Parallel(n_jobs=8)]: Done 226 tasks      | elapsed:   11.8s
[Parallel(n_jobs=8)]: Done 227 tasks      | elapsed:   11.8s
[Parallel(n_jobs=8)]: Done 228 tasks      | elapsed:   11.9s
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 351 tasks      | elapsed:   18.1s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:   18.1s
[Parallel(n_jobs=8)]: Done 353 tasks      | elapsed:   18.2s
[Parallel(n_jobs=8)]: Done 354 tasks      | elapsed:   18.2s
[Parallel(n_jobs=8)]: Done 355 tasks      | elapsed:   18.3s
[Parallel(n_jobs=8)]: Done 356 tasks      | elapsed:   18.3s
[Parallel(n_jobs=8)]: Done 357 tasks      | elapsed:   18.5s
[Parallel(n_jobs=8)]: Done 358 tasks      | elapsed:   18.5s
[Parallel(n_jobs=8)]: Done 359 tasks      | elapsed:   18.5s
[Parallel(n_jobs=8)]: Done 360 tasks      | elapsed:   18.6s
[Parallel(n_jobs=8)]: Done 361 tasks      | elapsed:   18.6s
[Parallel(n_jobs=8)]: Done 362 tasks      | elapsed:   18.7s
[Parallel(n_jobs=8)]: Done 363 tasks      | elapsed:   18.7s
[Parallel(n_jobs=8)]: Done 364 tasks      | elapsed:   18.8s
[Parallel(n_jobs=8)]: Done 365 tasks      | elapsed:   19.0s
[Parallel(n_jobs=8)]: Done 366 tasks      | elapsed:   19.0s
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 486 tasks      | elapsed:   24.9s
[Parallel(n_jobs=8)]: Done 487 tasks      | elapsed:   24.9s
[Parallel(n_jobs=8)]: Done 488 tasks      | elapsed:   25.0s
[Parallel(n_jobs=8)]: Done 489 tasks      | elapsed:   25.0s
[Parallel(n_jobs=8)]: Done 490 tasks      | elapsed:   25.1s
[Parallel(n_jobs=8)]: Done 491 tasks      | elapsed:   25.1s
[Parallel(n_jobs=8)]: Done 492 tasks      | elapsed:   25.1s
[Parallel(n_jobs=8)]: Done 493 tasks      | elapsed:   25.2s
[Parallel(n_jobs=8)]: Done 494 tasks      | elapsed:   25.2s
[Parallel(n_jobs=8)]: Done 495 tasks      | elapsed:   25.2s
[Parallel(n_jobs=8)]: Done 496 tasks      | elapsed:   25.3s
[Parallel(n_jobs=8)]: Done 497 tasks      | elapsed:   25.5s
[Parallel(n_jobs=8)]: Done 498 tasks      | elapsed:   25.5s
[Parallel(n_jobs=8)]: Done 499 tasks      | elapsed:   25.5s
[Parallel(n_jobs=8)]: Done 500 tasks      | elapsed:   25.6s
[Parallel(n_jobs=8)]: Done 501 tasks      | elapsed:   25.6s
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 622 tasks      | elapsed:   31.6s
[Parallel(n_jobs=8)]: Done 623 tasks      | elapsed:   31.7s
[Parallel(n_jobs=8)]: Done 624 tasks      | elapsed:   31.8s
[Parallel(n_jobs=8)]: Done 625 tasks      | elapsed:   31.8s
[Parallel(n_jobs=8)]: Done 626 tasks      | elapsed:   31.9s
[Parallel(n_jobs=8)]: Done 627 tasks      | elapsed:   31.9s
[Parallel(n_jobs=8)]: Done 628 tasks      | elapsed:   31.9s
[Parallel(n_jobs=8)]: Done 629 tasks      | elapsed:   32.0s
[Parallel(n_jobs=8)]: Done 630 tasks      | elapsed:   32.0s
[Parallel(n_jobs=8)]: Done 631 tasks      | elapsed:   32.0s
[Parallel(n_jobs=8)]: Done 632 tasks      | elapsed:   32.1s
[Parallel(n_jobs=8)]: Done 633 tasks      | elapsed:   32.2s
[Parallel(n_jobs=8)]: Done 634 tasks      | elapsed:   32.2s
[Parallel(n_jobs=8)]: Done 635 tasks      | elapsed:   32.3s
[Parallel(n_jobs=8)]: Done 636 tasks      | elapsed:   32.3s
[Parallel(n_jobs=8)]: Done 637 tasks      | elapsed:   32.3s
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 757 tasks      | elapsed:   38.6s
[Parallel(n_jobs=8)]: Done 758 tasks      | elapsed:   38.6s
[Parallel(n_jobs=8)]: Done 759 tasks      | elapsed:   38.6s
[Parallel(n_jobs=8)]: Done 760 tasks      | elapsed:   38.7s
[Parallel(n_jobs=8)]: Done 761 tasks      | elapsed:   38.7s
[Parallel(n_jobs=8)]: Done 762 tasks      | elapsed:   38.8s
[Parallel(n_jobs=8)]: Done 763 tasks      | elapsed:   38.8s
[Parallel(n_jobs=8)]: Done 764 tasks      | elapsed:   38.8s
[Parallel(n_jobs=8)]: Done 765 tasks      | elapsed:   38.9s
[Parallel(n_jobs=8)]: Done 766 tasks      | elapsed:   38.9s
[Parallel(n_jobs=8)]: Done 767 tasks      | elapsed:   39.0s
[Parallel(n_jobs=8)]: Done 768 tasks      | elapsed:   39.0s
[Parallel(n_jobs=8)]: Done 769 tasks      | elapsed:   39.1s
[Parallel(n_jobs=8)]: Done 770 tasks      | elapsed:   39.1s
[Parallel(n_jobs=8)]: Done 771 tasks      | elapsed:   39.1s
[Parallel(n_jobs=8)]: Done 772 tasks      | elapsed:   39.2s
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 893 tasks      | elapsed:   44.7s
[Parallel(n_jobs=8)]: Done 894 tasks      | elapsed:   44.8s
[Parallel(n_jobs=8)]: Done 895 tasks      | elapsed:   44.8s
[Parallel(n_jobs=8)]: Done 896 tasks      | elapsed:   44.8s
[Parallel(n_jobs=8)]: Done 897 tasks      | elapsed:   44.9s
[Parallel(n_jobs=8)]: Done 898 tasks      | elapsed:   45.0s
[Parallel(n_jobs=8)]: Done 899 tasks      | elapsed:   45.1s
[Parallel(n_jobs=8)]: Done 900 tasks      | elapsed:   45.1s
[Parallel(n_jobs=8)]: Done 901 tasks      | elapsed:   45.1s
[Parallel(n_jobs=8)]: Done 902 tasks      | elapsed:   45.2s
[Parallel(n_jobs=8)]: Done 903 tasks      | elapsed:   45.2s
[Parallel(n_jobs=8)]: Done 904 tasks      | elapsed:   45.3s
[Parallel(n_jobs=8)]: Done 905 tasks      | elapsed:   45.3s
[Parallel(n_jobs=8)]: Done 906 tasks      | elapsed:   45.3s
[Parallel(n_jobs=8)]: Done 907 tasks      | elapsed:   45.4s
[Parallel(n_jobs=8)]: Done 908 tasks      | elapsed:   45.5s
[Parallel(n_jobs=8)]: Do

In [25]:
print (preds)

[[  7.36117363e-05   6.48498535e-05  -1.00000000e+00 ...,   1.61743164e-03
    3.04460526e-04   1.61767006e-04]
 [  9.93728638e-04   9.51290131e-04  -1.00000000e+00 ...,   7.27176666e-04
    9.55581665e-04   1.05953217e-03]
 [  1.58309937e-03   1.48200989e-03  -1.00000000e+00 ...,   1.14536285e-03
    8.95977020e-04   1.24549866e-03]
 ..., 
 [  1.56688690e-03   1.59454346e-03  -1.00000000e+00 ...,   9.47952271e-04
    9.46998596e-04   1.08528137e-03]
 [  8.29696655e-04   3.74555588e-04  -1.00000000e+00 ...,   8.95500183e-04
    3.67164612e-04   6.68525696e-04]
 [  1.30462646e-03   1.21402740e-03  -1.00000000e+00 ...,   1.03950500e-03
    8.19683075e-04   1.27601624e-03]]


#### Get item predictions

In [26]:
num = int(np.ceil(num_users * 0.05))

# Let's take not random users, but the ones who viewed a lot 
users = train.loc[mask_test].user_id.value_counts().index[:num]
ans_inds =  np.argsort(preds[users])

In [None]:
test_inds_dict =  {k: list(ans_inds[i, -5:]) for i,k in enumerate(users)}
scorer(y_val_dict, test_inds_dict, num_users=num_users)

#### Try to remove the id's, that user saw during last 3 weeks 

In [None]:
# For each user find the categories, which we do not want to predict
last_3weeks = train.loc[mask_test].loc[train.loc[mask_test].date >= train.loc[mask_test].date.max() - 21 + 1]
y_not = last_3weeks.groupby('user_id').id3.apply(set)

In [None]:
y_pred = {}

for u_idx, user in tqdm(enumerate(users)):
    items_not = y_not.get(user, [])
    items_pred = []
    i = 1
    while len(items_pred) < 5:
        if not ans_inds[u_idx, -i] in items_not:
            items_pred += [ans_inds[u_idx, -i]]
    
        i += 1
    y_pred.update({user: items_pred})
    
print scorer(y_val_dict, y_pred, num_users)

These are just very very dumb and simplistic benchmarks. It is possible to do much better on this task. Good luck!

Finally, here is a snippet that will convert `y_pred` to (compressed) `.csv` file:

In [None]:
y_pred_df = pd.DataFrame.from_records(y_pred).T.reset_index()
y_pred_df.columns = ['user_id', 'id3_1', 'id3_2', 'id3_3', 'id3_4', 'id3_5']

y_pred_df.to_csv('y_pred.csv', index=False)

!rm y_pred.csv.zip; zip y_pred.csv.zip y_pred.csv

<a href="./y_pred.csv.zip">Link to download the submission from browser</a>