In [None]:
# install cuml

import sys
!cp ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [2]:
# Get test data
import numpy as np
import pandas as pd
import polars as pl

from pathlib import Path

data_path = Path('/kaggle/input/recsys-dataset/')

test_sessions = pd.DataFrame()
chunks = pd.read_json(data_path / 'otto-recsys-test.jsonl', lines=True, chunksize=100_000)

for e, chunk in enumerate(chunks):
    event_dict = {
        'session': [],
        'aid': [],
        'ts': [],
        'type': [],
    }
    if e < 2:
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_dict['session'].append(session)
                event_dict['aid'].append(event['aid'])
                event_dict['ts'].append(event['ts'])
                event_dict['type'].append(event['type'])
        chunk_session = pd.DataFrame(event_dict)
        test_sessions = pd.concat([test_sessions, chunk_session])
    else:
        break
        

test_sessions = pl.from_pandas(test_sessions.reset_index(drop=True))
test_sessions = test_sessions.groupby('session').agg(pl.all()).sort(by='session')

# Split test data into testA (session up to certain point) and testB (prediction)
dictA = {'session': [], 'aid': [], 'ts': [], 'type': []}
dictB = {'session': [], 'aid': [], 'ts': [], 'type': []}

for row in test_sessions.iter_rows():
    split_idx = np.random.randint(1,len(row[1]))
    dictA['session'].append(row[0])
    dictA['aid'].append(row[1][:split_idx])
    dictA['ts'].append(row[2][:split_idx])
    dictA['type'].append(row[3][:split_idx])

    dictB['session'].append(row[0])
    dictB['aid'].append(row[1][split_idx:])
    dictB['ts'].append(row[2][split_idx:])
    dictB['type'].append(row[3][split_idx:])
    
testA = pl.DataFrame(data=dictA).head(100)
testB = pl.DataFrame(data=dictB)

actual_events = testB.head(100).explode(['aid', 'ts', 'type'])

shape: (1_323, 4)
┌──────────┬─────────┬───────────────┬────────┐
│ session  ┆ aid     ┆ ts            ┆ type   │
│ ---      ┆ ---     ┆ ---           ┆ ---    │
│ i64      ┆ i64     ┆ i64           ┆ str    │
╞══════════╪═════════╪═══════════════╪════════╡
│ 12899779 ┆ 875854  ┆ 1661724026702 ┆ clicks │
│ 12899780 ┆ 260305  ┆ 1661724221170 ┆ clicks │
│ 12899781 ┆ 199008  ┆ 1662060064706 ┆ clicks │
│ 12899781 ┆ 918667  ┆ 1662060160406 ┆ clicks │
│ …        ┆ …       ┆ …             ┆ …      │
│ 12899876 ┆ 1807283 ┆ 1661725293874 ┆ orders │
│ 12899876 ┆ 1055835 ┆ 1661725293874 ┆ orders │
│ 12899877 ┆ 904917  ┆ 1661724304410 ┆ clicks │
│ 12899878 ┆ 1487095 ┆ 1661724187890 ┆ clicks │
└──────────┴─────────┴───────────────┴────────┘
shape: (200_000, 4)
┌──────────┬──────────────────────────────┬───────────────────┬────────────────────────────────────┐
│ session  ┆ aid                          ┆ ts                ┆ type                               │
│ ---      ┆ ---                        

In [3]:

train = pl.read_parquet('../input/otto-train-and-test-data-for-local-validation/train.parquet')

# Get subset of data 
fraction_of_sessions = 1

train_sessions = train['session'].sample(fraction=fraction_of_sessions, seed=42)
train = train.filter(pl.col("session").is_in(train_sessions))
train = train.sort("session")


In [4]:
# decrease aid amount
# simple method based on counts
def top_n_interacted_aids(df, n):
    return df['aid'].value_counts().head(n)['aid'].to_list()
    
candidates = top_n_interacted_aids(train, 500000)
candidate_train = train.filter(pl.col('aid').is_in(candidates))


500000


In [5]:
# interaction matrix

# count click,cart,buy for each (session, aid) pair
counts = candidate_train.groupby(['session', 'aid', 'type']) \
       .agg(click_count = pl.col('type').filter(pl.col('type') == 0).count(),
            cart_count = pl.col('type').filter(pl.col('type') == 1).count(),
            buy_count = pl.col('type').filter(pl.col('type') == 2).count())

# calculate a value based on counts
counts = counts.with_columns(value = 1 * pl.col('click_count') + 10 * pl.col('cart_count') + 50 * pl.col('buy_count'))
counts = counts.with_columns(counts['value'].cast(pl.datatypes.UInt16)) \
       .drop(['type', 'click_count', 'cart_count', 'buy_count'])
counts = counts.sort(by='value')

# convert to sparse matrix

from scipy.sparse import coo_matrix

row = counts.get_column('session').to_numpy()
col = counts.get_column('aid').to_numpy()
data = counts.get_column('value').to_numpy().astype(np.float32)
interaction_matrix = coo_matrix((data, (row, col)))
aid_dimension = interaction_matrix.shape[1]


shape: (31_440_193, 3)
┌─────────┬─────────┬───────┐
│ session ┆ aid     ┆ value │
│ ---     ┆ ---     ┆ ---   │
│ i32     ┆ i32     ┆ u16   │
╞═════════╪═════════╪═══════╡
│ 0       ┆ 1152674 ┆ 1     │
│ 0       ┆ 1841388 ┆ 1     │
│ 0       ┆ 1604396 ┆ 1     │
│ 0       ┆ 1407538 ┆ 1     │
│ …       ┆ …       ┆ …     │
│ 3348204 ┆ 877444  ┆ 1300  │
│ 3779040 ┆ 1032810 ┆ 1370  │
│ 5895467 ┆ 309666  ┆ 1860  │
│ 48627   ┆ 1672890 ┆ 3840  │
└─────────┴─────────┴───────┘
3840.0




In [10]:
import cuml
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=50000, metric='manhattan')
interaction_matrix = interaction_matrix.tocsr()
model.fit(interaction_matrix)

def predict_20(user, csr_mat, model):
    # get n nearest users
    distances, neighbour_indices = model.kneighbors(user)

    # get items from neighbours that user has not interacted with
    neighbour_aids = dict()
    for idx in neighbour_indices[0]:
        row = csr_mat.getrow(idx)

        for i, idx in enumerate(row.indices):
            if idx in neighbour_aids:
                neighbour_aids[idx] += row.data[i]
            else:
                neighbour_aids[idx] = row.data[i]
    
    top_aids = sorted(neighbour_aids, key=lambda x: neighbour_aids[x], reverse=True)
    return top_aids[:20]


In [13]:
# hyperparameter tuning

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

def average_distance_metric(estimator, X):
    # Fit the estimator on the data and get the distances and indices of the nearest neighbors
    estimator.fit(X)
    distances, _ = estimator.kneighbors(X)
    
    # Calculate the average distance of the nearest neighbors
    avg_distance = np.mean(np.mean(distances, axis=1))
    return avg_distance

n_neighbors = [10, 100, 1000, 5000, 10000, 50000]
metrics = ['cosine', 'manhattan', 'euclidean']
best_score = np.inf
best_n_neighbors = None
best_metric = None

k_folds = 5
for n in n_neighbors:
    for metric in metrics:
        nn_model = NearestNeighbors(n_neighbors=n, metric=metric)
        nn_model.fit(interaction_matrix)
        
        scores = cross_val_score(model, interaction_matrix, cv=k_folds, scoring=‘neg_mean_squared_error’)
        average_distance = np.mean(scores)
     
        distances, _ = nn_model.kneighbors(interaction_matrix[:100, :])
        average_distance = np.mean(np.mean(distances, axis=1))
        
        print(average_distance)
        
        if average_distance < best_score:
            best_score = average_distance
            best_n_neighbors = n
            best_metric = metric


0.9858172
30.78375196282546
64.9861679


In [11]:
# predicting test

action_weights = {'clicks': 1, 'carts': 10, 'orders': 50}

sessions = []
pred = []
types = [['clicks', 'carts', 'orders'] for _ in range(100)]

# predict for each user in test set
for user in testA.rows(named=True):
    # calculate user - aid interaction value
    values = dict()
    for aid, action in zip(user['aid'], user['type']):
        if aid in values:
            values[aid] += action_weights[action]
        else:
            values[aid] = action_weights[action]
            
    col = []
    data = []
    for aid, value in values.items():
        col.append(aid)
        data.append(value)
    row = [0 for _ in range(len(col))]
    
    coo_row = coo_matrix((data, (row, col)), shape=(1, aid_dimension))
    recommendations = predict_20(coo_row, interaction_matrix, model)
    
    sessions.append(user['session'])
    pred.append(recommendations)

# prediction df for evaluation
pred = pl.DataFrame({'session': sessions, 'pred_labels': pred, 'type': types})
pred = pred.explode('type')


shape: (100, 4)
┌──────────┬─────────────────────────────────┬───────────────────┬─────────────────────────────────┐
│ session  ┆ aid                             ┆ ts                ┆ type                            │
│ ---      ┆ ---                             ┆ ---               ┆ ---                             │
│ i64      ┆ list[i64]                       ┆ list[i64]         ┆ list[str]                       │
╞══════════╪═════════════════════════════════╪═══════════════════╪═════════════════════════════════╡
│ 12899779 ┆ [59625]                         ┆ [1661724000278]   ┆ ["clicks"]                      │
│ 12899780 ┆ [1142000, 582732, … 1142000]    ┆ [1661724000378,   ┆ ["clicks", "clicks", …          │
│          ┆                                 ┆ 1661724058352, …… ┆ "clicks"]                       │
│ 12899781 ┆ [141736, 199008, … 199008]      ┆ [1661724000559,   ┆ ["clicks", "clicks", …          │
│          ┆                                 ┆ 1661724022851, …… ┆ "clicks"

In [12]:
# evaluate score
preds = pred

id2type = {0: 'clicks', 1: 'carts', 2: 'orders'}
type2id = {'clicks': 0, 'carts': 1, 'orders': 2}

gt = actual_events.groupby(['session', 'type']).agg(pl.col('aid')).rename({'aid': 'gt_labels'}).sort(by='session').with_columns(pl.col('type'))
gt = gt.to_pandas()
gt.loc[gt.type == 'clicks', 'gt_labels'] = gt.loc[gt.type == 'clicks', 'gt_labels'].str[:1]
gt = pl.from_pandas(gt)

preds_and_gt = gt.join(preds, how='left', on=['session', 'type']).with_columns(
    hits = pl.col('pred_labels').list.intersection('gt_labels').list.lengths(),
    gt_count = pl.col('gt_labels').list.lengths()
)

preds_and_gt = preds_and_gt.to_pandas()
preds_and_gt.loc[preds_and_gt.type == 'carts', 'gt_labels'] = preds_and_gt.loc[preds_and_gt.type == 'carts', 'gt_labels'].str[:20]
preds_and_gt.loc[preds_and_gt.type == 'orders', 'gt_labels'] = preds_and_gt.loc[preds_and_gt.type == 'orders', 'gt_labels'].str[:20]
preds_and_gt = pl.from_pandas(preds_and_gt)
preds_and_gt = preds_and_gt.with_columns(
    gt_count = pl.col('gt_labels').list.lengths()
)

recall_per_type = preds_and_gt.groupby('type').agg(recall = pl.col('hits').sum() / pl.col('gt_count').sum())
local_validation_score = 0
weights = {'clicks': 0.1, 'carts': 0.3, 'orders': 0.6}
for row in recall_per_type.rows(named=True):
    local_validation_score += row['recall'] * weights[row['type']]

print(f': {local_validation_score}')

: 0.06737070707070707
