# Create Submission File

In [1]:
import polars as pl

test_df = pl.read_parquet('../input/comp9417-preprocessing-and-test-data/test_df.parquet')

In [2]:
id2type = {0: 'clicks', 1: 'carts', 2: 'orders'}
type2id = {'clicks': 0, 'carts': 1, 'orders': 2}

In [3]:
# Get test data
import polars as pl
import numpy as np
import pandas as pd

from pathlib import Path

data_path = Path('/kaggle/input/otto-recommender-system/')

test_sessions = pd.DataFrame()
chunks = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=100_000)

for e, chunk in enumerate(chunks):
    event_dict = {
        'session': [],
        'aid': [],
        'ts': [],
        'type': [],
    }
    if e < 2:
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_dict['session'].append(session)
                event_dict['aid'].append(event['aid'])
                event_dict['ts'].append(event['ts'])
                event_dict['type'].append(event['type'])
        chunk_session = pd.DataFrame(event_dict)
        test_sessions = pd.concat([test_sessions, chunk_session])
    else:
        break
        

test_sessions = pl.from_pandas(test_sessions.reset_index(drop=True))
test_sessions = test_sessions.groupby('session').agg(pl.all()).sort(by='session')

In [4]:
import pickle

def load_model(name):
    with open(name, 'rb') as file:  
        model = pickle.load(file)
        
    return model

In [5]:
def get_df(model_long, model_short):
    event_types = ['click', 'cart', 'order']
    models = []
    for event in event_types:
        models.append(load_model(f'../input/comp9417-training-{model_long}/{model_short}_{event}.pkl'))

    model_click = models[0]
    model_cart = models[1]
    model_order = models[2]

    click_scores = model_click.predict(test_df.to_pandas())
    cart_scores = model_cart.predict(test_df.to_pandas())
    order_scores = model_order.predict(test_df.to_pandas())

    pred_df = test_df.with_columns(click_scores = pl.lit(click_scores), cart_scores = pl.lit(cart_scores), order_scores = pl.lit(order_scores))
    
    click_preds = pred_df.groupby('session').agg(pl.col('aid').sort_by('click_scores', descending=True).slice(0,20))
    cart_preds = pred_df.groupby('session').agg(pl.col('aid').sort_by('cart_scores', descending=True).slice(0,20))
    order_preds = pred_df.groupby('session').agg(pl.col('aid').sort_by('order_scores', descending=True).slice(0,20))

    click_new = click_preds.with_columns(
        pl.lit('clicks').alias('type'),
        (pl.col('session').cast(pl.Utf8) + '_clicks').alias('session_type'),
        pl.col('aid').cast(pl.List(pl.Utf8))
    ).rename({'aid':'labels'}).drop('session').drop('type')
    cart_new = cart_preds.with_columns(
        pl.lit('carts').alias('type'),
        (pl.col('session').cast(pl.Utf8) + '_carts').alias('session_type'),
        pl.col('aid').cast(pl.List(pl.Utf8))
    ).rename({'aid':'labels'}).drop('session').drop('type')
    order_new = order_preds.with_columns(
        pl.lit('orders').alias('type'),
        (pl.col('session').cast(pl.Utf8) + '_orders').alias('session_type'),
        pl.col('aid').cast(pl.List(pl.Utf8))
    ).rename({'aid':'labels'}).drop('session').drop('type')

    preds = pl.concat([click_new, cart_new, order_new]).to_pandas()
    print(preds)
    preds['labels'] = preds['labels'].apply(lambda x: ' '.join(x))
    preds = preds[['session_type', 'labels']]
    preds = preds.sort_values(by='session_type')

    print(preds)
    return preds

In [6]:
# output
preds = get_df('lightgbm', 'lgbm')
preds.to_csv('/kaggle/working/lgbm_submission.csv', index=False)  



                                                   labels     session_type
0                                      [1236775, 1500315]  12973588_clicks
1                                               [1102222]  12906676_clicks
2                                               [1187898]  12927668_clicks
3                                                [475026]  12928824_clicks
4       [109279, 215743, 333489, 616967, 1699076, 8116...  12976864_clicks
...                                                   ...              ...
599995                                          [1300613]  13002683_orders
599996                                          [1069472]  13038687_orders
599997  [263827, 458127, 591254, 1798365, 1440544, 495...  13077239_orders
599998                                          [1305957]  13080071_orders
599999  [420907, 297694, 452981, 535399, 1277541, 1058...  12911763_orders

[600000 rows x 2 columns]
           session_type                                             label

In [7]:
preds = get_df('xgboost', 'xgb')
preds.to_csv('/kaggle/working/xgb_submission.csv', index=False)  

                                                   labels     session_type
0                                               [1071714]  13087764_clicks
1       [206418, 206418, 390163, 274783, 1338481, 1593...  13046712_clicks
2                                               [1140474]  13025652_clicks
3                                                [549503]  12925728_clicks
4                                       [1005261, 429620]  13034928_clicks
...                                                   ...              ...
599995                                 [1487224, 1282212]  12926843_orders
599996  [142737, 1289500, 1568011, 1389374, 444620, 21...  13041223_orders
599997                                          [1200369]  13060791_orders
599998                         [1678311, 1658167, 133083]  13089863_orders
599999                                          [1319345]  12917531_orders

[600000 rows x 2 columns]
           session_type                                             label