### Step 1 - Generate Candidates

https://www.kaggle.com/code/cdeotte/compute-validation-score-cv-565

### Step 2 - LGBMRanker

https://www.kaggle.com/code/radek1/polars-proof-of-concept-lgbm-ranker

In [1]:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# personal_access_token = user_secrets.get_secret("personal_access_token")

# !rm -rf /kaggle/working/kaggle_otto
# !git clone -b covis-matrix https://$personal_access_token@github.com/coffeemountain/kaggle_otto.git
    
# import sys
# sys.path.append('/kaggle/working/kaggle_otto/src')

# from covis_matrix_generator import *

Cloning into 'kaggle_otto'...
remote: Enumerating objects: 110, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 110 (delta 51), reused 60 (delta 17), pack-reused 0[K
Receiving objects: 100% (110/110), 63.61 KiB | 1.16 MiB/s, done.
Resolving deltas: 100% (51/51), done.


# Step 1 - Candidate Generation

In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

We will use RAPIDS version 21.10.01


In [2]:
%%time
# CACHE FUNCTIONS
def read_file(f):
    return cudf.DataFrame( data_cache[f] )
def read_file_to_cache(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(type_labels).astype('int8')
    return df

# CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU
data_cache = {}
type_labels = {'clicks':0, 'carts':1, 'orders':2}
files = glob.glob('../input/otto-validation/*_parquet/*')
for f in files: data_cache[f] = read_file_to_cache(f)

# CHUNK PARAMETERS
READ_CT = 5
CHUNK = int( np.ceil( len(files)/6 ))
print(f'We will process {len(files)} files, in groups of {READ_CT} and chunks of {CHUNK}.')

We will process 120 files, in groups of 5 and chunks of 20.
CPU times: user 32.5 s, sys: 5.19 s, total: 37.7 s
Wall time: 43.5 s


## 1) "Carts Orders" Co-visitation Matrix - Type Weighted

In [3]:
%%time
type_weight = {0:1, 1:6, 2:3}

# USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
DISK_PIECES = 4
SIZE = 1.86e6/DISK_PIECES

# COMPUTE IN PARTS FOR MEMORY MANGEMENT
for PART in range(DISK_PIECES):
    print()
    print('### DISK PART',PART+1)
    
    # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
    # => OUTER CHUNKS
    for j in range(6):
        a = j*CHUNK
        b = min( (j+1)*CHUNK, len(files) )
        print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
        
        # => INNER CHUNKS
        for k in range(a,b,READ_CT):
            # READ FILE
            df = [read_file(files[k])]
            for i in range(1,READ_CT): 
                if k+i<b: df.append( read_file(files[k+i]) )
            df = cudf.concat(df,ignore_index=True,axis=0)
            df = df.sort_values(['session','ts'],ascending=[True,False])
            # USE TAIL OF SESSION
            df = df.reset_index(drop=True)
            df['n'] = df.groupby('session').cumcount()
            df = df.loc[df.n<30].drop('n',axis=1)
            # CREATE PAIRS
            df = df.merge(df,on='session')
            df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]
            # MEMORY MANAGEMENT COMPUTE IN PARTS
            df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
            # ASSIGN WEIGHTS
            df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
            df['wgt'] = df.type_y.map(type_weight)
            df = df[['aid_x','aid_y','wgt']]
            df.wgt = df.wgt.astype('float32')
            df = df.groupby(['aid_x','aid_y']).wgt.sum()
            # COMBINE INNER CHUNKS
            if k==a: tmp2 = df
            else: tmp2 = tmp2.add(df, fill_value=0)
            print(k,', ',end='')
        print()
        # COMBINE OUTER CHUNKS
        if a==0: tmp = tmp2
        else: tmp = tmp.add(tmp2, fill_value=0)
        del tmp2, df
        gc.collect()
    # CONVERT MATRIX TO DICTIONARY
    tmp = tmp.reset_index()
    tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
    # SAVE TOP 40
    tmp = tmp.reset_index(drop=True)
    tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
    tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
    # SAVE PART TO DISK (convert to pandas first uses less memory)
    tmp.to_pandas().to_parquet(f'top_15_carts_orders_v{VER}_{PART}.pqt')


### DISK PART 1
Processing files 0 thru 19 in groups of 5...


  "When using a sequence of booleans for `ascending`, "


0 , 5 , 10 , 15 , 
Processing files 20 thru 39 in groups of 5...
20 , 25 , 30 , 35 , 
Processing files 40 thru 59 in groups of 5...
40 , 45 , 50 , 55 , 
Processing files 60 thru 79 in groups of 5...
60 , 65 , 70 , 75 , 
Processing files 80 thru 99 in groups of 5...
80 , 85 , 90 , 95 , 
Processing files 100 thru 119 in groups of 5...
100 , 105 , 110 , 115 , 

### DISK PART 2
Processing files 0 thru 19 in groups of 5...
0 , 5 , 10 , 15 , 
Processing files 20 thru 39 in groups of 5...
20 , 25 , 30 , 35 , 
Processing files 40 thru 59 in groups of 5...
40 , 45 , 50 , 55 , 
Processing files 60 thru 79 in groups of 5...
60 , 65 , 70 , 75 , 
Processing files 80 thru 99 in groups of 5...
80 , 85 , 90 , 95 , 
Processing files 100 thru 119 in groups of 5...
100 , 105 , 110 , 115 , 

### DISK PART 3
Processing files 0 thru 19 in groups of 5...
0 , 5 , 10 , 15 , 
Processing files 20 thru 39 in groups of 5...
20 , 25 , 30 , 35 , 
Processing files 40 thru 59 in groups of 5...
40 , 45 , 50 , 55 , 
Pro

## 2) "Buy2Buy" Co-visitation Matrix

In [4]:
%%time
# USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
DISK_PIECES = 1
SIZE = 1.86e6/DISK_PIECES

# COMPUTE IN PARTS FOR MEMORY MANGEMENT
for PART in range(DISK_PIECES):
    print()
    print('### DISK PART',PART+1)
    
    # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
    # => OUTER CHUNKS
    for j in range(6):
        a = j*CHUNK
        b = min( (j+1)*CHUNK, len(files) )
        print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
        
        # => INNER CHUNKS
        for k in range(a,b,READ_CT):
            # READ FILE
            df = [read_file(files[k])]
            for i in range(1,READ_CT): 
                if k+i<b: df.append( read_file(files[k+i]) )
            df = cudf.concat(df,ignore_index=True,axis=0)
            df = df.loc[df['type'].isin([1,2])] # ONLY WANT CARTS AND ORDERS
            df = df.sort_values(['session','ts'],ascending=[True,False])
            # USE TAIL OF SESSION
            df = df.reset_index(drop=True)
            df['n'] = df.groupby('session').cumcount()
            df = df.loc[df.n<30].drop('n',axis=1)
            # CREATE PAIRS
            df = df.merge(df,on='session')
            df = df.loc[ ((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ] # 14 DAYS
            # MEMORY MANAGEMENT COMPUTE IN PARTS
            df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
            # ASSIGN WEIGHTS
            df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
            df['wgt'] = 1
            df = df[['aid_x','aid_y','wgt']]
            df.wgt = df.wgt.astype('float32')
            df = df.groupby(['aid_x','aid_y']).wgt.sum()
            # COMBINE INNER CHUNKS
            if k==a: tmp2 = df
            else: tmp2 = tmp2.add(df, fill_value=0)
            print(k,', ',end='')
        print()
        # COMBINE OUTER CHUNKS
        if a==0: tmp = tmp2
        else: tmp = tmp.add(tmp2, fill_value=0)
        del tmp2, df
        gc.collect()
    # CONVERT MATRIX TO DICTIONARY
    tmp = tmp.reset_index()
    tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
    # SAVE TOP 40
    tmp = tmp.reset_index(drop=True)
    tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
    tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
    # SAVE PART TO DISK (convert to pandas first uses less memory)
    tmp.to_pandas().to_parquet(f'top_15_buy2buy_v{VER}_{PART}.pqt')


### DISK PART 1
Processing files 0 thru 19 in groups of 5...
0 , 5 , 

  "When using a sequence of booleans for `ascending`, "


10 , 15 , 
Processing files 20 thru 39 in groups of 5...
20 , 25 , 30 , 35 , 
Processing files 40 thru 59 in groups of 5...
40 , 45 , 50 , 55 , 
Processing files 60 thru 79 in groups of 5...
60 , 65 , 70 , 75 , 
Processing files 80 thru 99 in groups of 5...
80 , 85 , 90 , 95 , 
Processing files 100 thru 119 in groups of 5...
100 , 105 , 110 , 115 , 
CPU times: user 13.5 s, sys: 5.49 s, total: 19 s
Wall time: 19.4 s


## 3) "Clicks" Co-visitation Matrix - Time Weighted

In [5]:
%%time
# USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
DISK_PIECES = 4
SIZE = 1.86e6/DISK_PIECES

# COMPUTE IN PARTS FOR MEMORY MANGEMENT
for PART in range(DISK_PIECES):
    print()
    print('### DISK PART',PART+1)
    
    # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
    # => OUTER CHUNKS
    for j in range(6):
        a = j*CHUNK
        b = min( (j+1)*CHUNK, len(files) )
        print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
        
        # => INNER CHUNKS
        for k in range(a,b,READ_CT):
            # READ FILE
            df = [read_file(files[k])]
            for i in range(1,READ_CT): 
                if k+i<b: df.append( read_file(files[k+i]) )
            df = cudf.concat(df,ignore_index=True,axis=0)
            df = df.sort_values(['session','ts'],ascending=[True,False])
            # USE TAIL OF SESSION
            df = df.reset_index(drop=True)
            df['n'] = df.groupby('session').cumcount()
            df = df.loc[df.n<30].drop('n',axis=1)
            # CREATE PAIRS
            df = df.merge(df,on='session')
            df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]
            # MEMORY MANAGEMENT COMPUTE IN PARTS
            df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
            # ASSIGN WEIGHTS
            df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
            df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
            df = df[['aid_x','aid_y','wgt']]
            df.wgt = df.wgt.astype('float32')
            df = df.groupby(['aid_x','aid_y']).wgt.sum()
            # COMBINE INNER CHUNKS
            if k==a: tmp2 = df
            else: tmp2 = tmp2.add(df, fill_value=0)
            print(k,', ',end='')
        print()
        # COMBINE OUTER CHUNKS
        if a==0: tmp = tmp2
        else: tmp = tmp.add(tmp2, fill_value=0)
        del tmp2, df
        gc.collect()
    # CONVERT MATRIX TO DICTIONARY
    tmp = tmp.reset_index()
    tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
    # SAVE TOP 40
    tmp = tmp.reset_index(drop=True)
    tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
    tmp = tmp.loc[tmp.n<20].drop('n',axis=1)
    # SAVE PART TO DISK (convert to pandas first uses less memory)
    tmp.to_pandas().to_parquet(f'top_20_clicks_v{VER}_{PART}.pqt')


### DISK PART 1
Processing files 0 thru 19 in groups of 5...
0 , 5 , 10 , 15 , 
Processing files 20 thru 39 in groups of 5...
20 , 25 , 30 , 35 , 
Processing files 40 thru 59 in groups of 5...
40 , 45 , 50 , 55 , 
Processing files 60 thru 79 in groups of 5...
60 , 65 , 70 , 75 , 
Processing files 80 thru 99 in groups of 5...
80 , 85 , 90 , 95 , 
Processing files 100 thru 119 in groups of 5...
100 , 105 , 110 , 115 , 

### DISK PART 2
Processing files 0 thru 19 in groups of 5...
0 , 5 , 10 , 15 , 
Processing files 20 thru 39 in groups of 5...
20 , 25 , 30 , 35 , 
Processing files 40 thru 59 in groups of 5...
40 , 45 , 50 , 55 , 
Processing files 60 thru 79 in groups of 5...
60 , 65 , 70 , 75 , 
Processing files 80 thru 99 in groups of 5...
80 , 85 , 90 , 95 , 
Processing files 100 thru 119 in groups of 5...
100 , 105 , 110 , 115 , 

### DISK PART 3
Processing files 0 thru 19 in groups of 5...
0 , 5 , 10 , 15 , 
Processing files 20 thru 39 in groups of 5...
20 , 25 , 30 , 35 , 
Processi

In [6]:
# FREE MEMORY
del data_cache, tmp
_ = gc.collect()

In [7]:
%%time
# LOAD THREE CO-VISITATION MATRICES
def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()

top_20_clicks = pqt_to_dict( pd.read_parquet(f'top_20_clicks_v{VER}_0.pqt') )
for k in range(1,DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(f'top_20_clicks_v{VER}_{k}.pqt') ) )
top_20_buys = pqt_to_dict( pd.read_parquet(f'top_15_carts_orders_v{VER}_0.pqt') )
for k in range(1,DISK_PIECES): 
    top_20_buys.update( pqt_to_dict( pd.read_parquet(f'top_15_carts_orders_v{VER}_{k}.pqt') ) )
top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'top_15_buy2buy_v{VER}_0.pqt') )

CPU times: user 1min 31s, sys: 4.29 s, total: 1min 36s
Wall time: 1min 34s


# Step 2 - LGBMRanker

In [8]:
# !pip install polars

In [17]:
import polars as pl
from lightgbm.sklearn import LGBMRanker
from tqdm.notebook import tqdm

In [23]:
train = pl.read_parquet('../input/otto-train-and-test-data-for-local-validation/test.parquet')
train

session,aid,ts,type
i32,i32,i32,u8
11098528,11830,1661119200,0
11098529,1105029,1661119200,0
11098530,264500,1661119200,0
11098530,264500,1661119288,0
11098530,409236,1661119369,0
11098530,409236,1661119441,0
11098530,409236,1661120165,0
11098530,409236,1661120532,1
11098531,452188,1661119200,0
11098531,1239060,1661119227,0


In [None]:
pl.Series(train['aid'].apply(lambda x: top_20_clicks[x]))

In [19]:
# dfs = []
# for _aid in tqdm(train["aid"].values):
#     _df = cudf.DataFrame(top_20_clicks[train["aid"][0]]).T
#     _df["aid"] = [_aid]
#     dfs += [_df]

In [112]:
# trainデータをある時刻で切った後半部分
train_labels = pl.read_parquet('../input/otto-train-and-test-data-for-local-validation/test_labels.parquet')
train_labels

session,type,ground_truth
i64,str,list[i64]
11098528,"""clicks""",[1679529]
11098528,"""carts""",[1199737]
11098528,"""orders""","[990658, 950341, ... 1033148]"
11098529,"""clicks""",[1105029]
11098530,"""orders""",[409236]
11098531,"""orders""",[1365569]
11098532,"""clicks""",[1596491]
11098533,"""clicks""",[1417450]
11098533,"""carts""","[108676, 1406660, ... 777657]"
11098533,"""orders""","[935297, 652916, ... 1189919]"


In [113]:
type2id = {"clicks": 0, "carts": 1, "orders": 2}

In [114]:
# typeを数値に直すのと、ground_truthを行毎に展開
train_labels = train_labels.explode('ground_truth').with_columns([
    pl.col('ground_truth').alias('aid'),
    pl.col('type').apply(lambda x: type2id[x])
])[['session', 'type', 'aid']]
train_labels

session,type,aid
i64,i64,i64
11098528,0,1679529
11098528,1,1199737
11098528,2,990658
11098528,2,950341
11098528,2,1462506
11098528,2,1561739
11098528,2,907564
11098528,2,369774
11098528,2,440367
11098528,2,92401


In [93]:
train_labels = train_labels.with_columns([
    pl.col('session').cast(pl.datatypes.Int32),
    pl.col('type').cast(pl.datatypes.UInt8),
    pl.col('aid').cast(pl.datatypes.Int32)
])
train_labels

session,type,aid
i32,u8,i32
11098528,0,1679529
11098528,1,1199737
11098528,2,990658
11098528,2,950341
11098528,2,1462506
11098528,2,1561739
11098528,2,907564
11098528,2,369774
11098528,2,440367
11098528,2,92401


In [94]:
# 正解ラベル
train_labels = train_labels.with_column(pl.lit(1).alias('gt'))
train_labels

session,type,aid,gt
i32,u8,i32,i32
11098528,0,1679529,1
11098528,1,1199737,1
11098528,2,990658,1
11098528,2,950341,1
11098528,2,1462506,1
11098528,2,1561739,1
11098528,2,907564,1
11098528,2,369774,1
11098528,2,440367,1
11098528,2,92401,1


In [95]:
# センション内の予測対象を1、学習を0でラベル付け
train = train.join(train_labels, how='left', on=['session', 'type', 'aid']).with_column(pl.col('gt').fill_null(0))
train

session,aid,ts,type,gt
i32,i32,i32,u8,i32
11098528,11830,1661119200,0,0
11098529,1105029,1661119200,0,1
11098530,264500,1661119200,0,0
11098530,264500,1661119288,0,0
11098530,409236,1661119369,0,0
11098530,409236,1661119441,0,0
11098530,409236,1661120165,0,0
11098530,409236,1661120532,1,0
11098531,452188,1661119200,0,0
11098531,1239060,1661119227,0,0


In [96]:
def get_session_lenghts(df):
    return df.groupby('session').agg([
        pl.col('session').count().alias('session_length')
    ])['session_length'].to_numpy()

In [97]:
session_lengths_train = get_session_lenghts(train)
session_lengths_train

array([ 1,  1, 42, ...,  3,  1, 28], dtype=uint32)

## Model training

In [98]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=20,
    importance_type='gain',
)

In [99]:
feature_cols = ['aid', 'type']
target = 'gt'

In [100]:
# セッションをクエリとして学習
ranker = ranker.fit(
    train[feature_cols].to_pandas(),
    train[target].to_pandas(),
    group=session_lengths_train,
)

## Predict on test data

In [101]:
test = pl.read_parquet('../input/otto-full-optimized-memory-footprint/test.parquet')
test

session,aid,ts,type
i32,i32,i32,u8
12899779,59625,1661724000,0
12899780,1142000,1661724000,0
12899780,582732,1661724058,0
12899780,973453,1661724109,0
12899780,736515,1661724136,0
12899780,1142000,1661724155,0
12899781,141736,1661724000,0
12899781,199008,1661724022,0
12899781,57315,1661724170,0
12899781,194067,1661724246,0


In [102]:
scores = ranker.predict(test[feature_cols].to_pandas())
scores

array([0.10603822, 0.11735017, 0.10297335, ..., 0.10980192, 0.10101469,
       0.11735017])

In [103]:
test = test.with_columns(pl.Series(name='score', values=scores))
test_predictions = test.sort(['session', 'score'], reverse=True).groupby('session').agg([
    pl.col('aid').limit(20).list()
])
test_predictions

session,aid
i32,list[i32]
14571581,[1100210]
14571580,[202353]
14571579,[739876]
14571578,[519105]
14571577,[1141710]
14571576,[1196256]
14571575,[1257071]
14571574,[306024]
14571573,[1823537]
14571572,[986164]


In [104]:
session_types = []
labels = []

for session, preds in zip(test_predictions['session'].to_numpy(), test_predictions['aid'].to_numpy()):
    l = ' '.join(str(p) for p in preds)
    for session_type in ['clicks', 'carts', 'orders']:
        labels.append(l)
        session_types.append(f'{session}_{session_type}')

In [105]:
# session_types

['14571581_clicks',
 '14571581_carts',
 '14571581_orders',
 '14571580_clicks',
 '14571580_carts',
 '14571580_orders',
 '14571579_clicks',
 '14571579_carts',
 '14571579_orders',
 '14571578_clicks',
 '14571578_carts',
 '14571578_orders',
 '14571577_clicks',
 '14571577_carts',
 '14571577_orders',
 '14571576_clicks',
 '14571576_carts',
 '14571576_orders',
 '14571575_clicks',
 '14571575_carts',
 '14571575_orders',
 '14571574_clicks',
 '14571574_carts',
 '14571574_orders',
 '14571573_clicks',
 '14571573_carts',
 '14571573_orders',
 '14571572_clicks',
 '14571572_carts',
 '14571572_orders',
 '14571571_clicks',
 '14571571_carts',
 '14571571_orders',
 '14571570_clicks',
 '14571570_carts',
 '14571570_orders',
 '14571569_clicks',
 '14571569_carts',
 '14571569_orders',
 '14571568_clicks',
 '14571568_carts',
 '14571568_orders',
 '14571567_clicks',
 '14571567_carts',
 '14571567_orders',
 '14571566_clicks',
 '14571566_carts',
 '14571566_orders',
 '14571565_clicks',
 '14571565_carts',
 '14571565_orders

In [106]:
# labels

['1100210',
 '1100210',
 '1100210',
 '202353',
 '202353',
 '202353',
 '739876',
 '739876',
 '739876',
 '519105',
 '519105',
 '519105',
 '1141710',
 '1141710',
 '1141710',
 '1196256',
 '1196256',
 '1196256',
 '1257071',
 '1257071',
 '1257071',
 '306024',
 '306024',
 '306024',
 '1823537',
 '1823537',
 '1823537',
 '986164',
 '986164',
 '986164',
 '60347',
 '60347',
 '60347',
 '389613',
 '389613',
 '389613',
 '27116',
 '27116',
 '27116',
 '9981',
 '9981',
 '9981',
 '1190074',
 '1190074',
 '1190074',
 '1157982',
 '1157982',
 '1157982',
 '368809',
 '368809',
 '368809',
 '1824122',
 '1824122',
 '1824122',
 '1102089',
 '1102089',
 '1102089',
 '313383',
 '313383',
 '313383',
 '975840',
 '975840',
 '975840',
 '1644433',
 '1644433',
 '1644433',
 '1503039',
 '1503039',
 '1503039',
 '333699',
 '333699',
 '333699',
 '1156575',
 '1156575',
 '1156575',
 '27575',
 '27575',
 '27575',
 '1278483',
 '1278483',
 '1278483',
 '1722993',
 '1722993',
 '1722993',
 '556636',
 '556636',
 '556636',
 '1771744',
 '17

In [107]:
submission = pl.DataFrame({'session_type': session_types, 'labels': labels})
submission
# submission.write_csv('submission.csv')

session_type,labels
str,str
"""14571581_click...","""1100210"""
"""14571581_carts...","""1100210"""
"""14571581_order...","""1100210"""
"""14571580_click...","""202353"""
"""14571580_carts...","""202353"""
"""14571580_order...","""202353"""
"""14571579_click...","""739876"""
"""14571579_carts...","""739876"""
"""14571579_order...","""739876"""
"""14571578_click...","""519105"""
