In [43]:
import polars as pl

In [44]:
# Submission CSVs that are combined into the ensemble below.
paths = [
    'outputs/ex01_submission.csv',
    'outputs/ex04_submission.csv',
    'outputs/ex08_submission.csv',
    'outputs/ex10_submission.csv',
    'outputs/ex13_submission.csv',
    'outputs/ex16_submission.csv',
    # Currently disabled candidates:
    # 'outputs/ex18/ex18_submission.csv',
    # 'outputs/ex19/ex19_submission.csv',
    # 'outputs/ex23/ex23_submission.csv',
    # 'outputs/ex24/ex24_submission.csv',
    # 'outputs/ex25/ex25_submission.csv',
]

In [45]:
def read_sub(path, weight=1):
    '''Load one submission CSV and reshape it to one row per predicted aid.

    The `labels` column (aids separated by a single space) is split and
    exploded, renamed to `aid` and cast to `Int32` to keep memory usage down.
    A constant `vote` column holding `weight` is added and stored as `UInt8`,
    so `weight` has to be representable in that dtype.

    With the default weight of 1 every submission contributes one vote per
    predicted aid, i.e. a plain vote ensemble.

    Returns a DataFrame with the remaining input columns plus `aid` and `vote`.
    '''
    raw = pl.read_csv(path)

    # Turn the space separated label string into one row per label.
    per_aid = (
        raw
        .with_column(pl.col('labels').str.split(by=' '))
        .explode('labels')
        .rename({'labels': 'aid'})
    )

    # Narrow dtypes: Int32 for the aid, UInt8 for the vote weight.
    per_aid = per_aid.with_column(pl.col('aid').cast(pl.Int32))
    return per_aid.with_column(pl.lit(weight).cast(pl.UInt8).alias('vote'))

In [46]:
# Load every submission listed in `paths` with the default vote weight of 1.
# To weight submissions differently, pass `weight=` to read_sub here.
subs = [read_sub(path) for path in paths]
subs

[shape: (100027983, 3)
 ┌─────────────────┬─────────┬──────┐
 │ session_type    ┆ aid     ┆ vote │
 │ ---             ┆ ---     ┆ ---  │
 │ str             ┆ i32     ┆ u8   │
 ╞═════════════════╪═════════╪══════╡
 │ 12899779_clicks ┆ 59625   ┆ 1    │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
 │ 12899779_clicks ┆ 1253524 ┆ 1    │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
 │ 12899779_clicks ┆ 737445  ┆ 1    │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
 │ 12899779_clicks ┆ 438191  ┆ 1    │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
 │ ...             ┆ ...     ┆ ...  │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
 │ 14571581_orders ┆ 978060  ┆ 1    │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
 │ 14571581_orders ┆ 1497245 ┆ 1    │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
 │ 14571581_orders ┆ 1764910 ┆ 1    │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
 │ 14571581_orders ┆ 1550662 ┆ 1    │
 └─────────────────┴─────────┴──────┘,
 shape: (100142592, 3)
 ┌─────────────────┬─────────┬──────┐
 │ session_type    ┆ aid     ┆ vote │
 │ 

In [47]:

# Outer-join all submissions on (session_type, aid) so that every candidate
# keeps one vote column per submission. The loop works for any number of
# loaded submissions instead of a hand-written chain of joins.
#
# Suffixes reproduce the column names of the original chain:
# vote, vote_right, vote_right2, vote_right3, ...
if not isinstance(subs, list):
    # This cell rebinds `subs` to the joined frame, so it cannot be re-run
    # without reloading the individual submissions first.
    raise TypeError('`subs` must be the list of loaded submissions; re-run the loading cell first')

merged = subs[0]
for position, other in enumerate(subs[1:], start=1):
    suffix = '_right' if position == 1 else f'_right{position}'
    merged = merged.join(other, how='outer', on=['session_type', 'aid'], suffix=suffix)

subs = merged
subs.head()

session_type,aid,vote,vote_right,vote_right2,vote_right3,vote_right4,vote_right5
str,i32,u8,u8,u8,u8,u8,u8
"""12899779_click...",59625,1,1,1,1,1,1
"""12899779_click...",1253524,1,1,1,1,1,1
"""12899779_click...",737445,1,1,1,1,1,1
"""12899779_click...",438191,1,1,1,1,1,1
"""12899779_click...",731692,1,1,1,1,1,1


In [49]:
# Collect the per-submission vote columns produced by the joins ('vote' plus
# one 'vote_right*' column per additional submission) instead of hard-coding
# their names, so this cell keeps working when the number of submissions changes.
vote_cols = [name for name in subs.columns if name == 'vote' or name.startswith('vote_right')]
assert vote_cols, 'No vote columns found - re-run the join cell first'

# Build vote + vote_right + vote_right2 + ... as a single expression.
total_votes = pl.col(vote_cols[0])
for name in vote_cols[1:]:
    total_votes = total_votes + pl.col(name)

subs = (subs
    .fill_null(0)  # nulls (left behind by the outer joins) count as zero votes
    .with_column(total_votes.alias('vote_sum'))
    .drop(vote_cols)
    # Ascending sort followed by reverse() puts the highest vote_sum first.
    # NOTE(review): reverse() presumably also flips the relative order of rows
    # with equal vote_sum - confirm this tie-breaking is intended.
    .sort(by='vote_sum')
    .reverse()
)

subs.head()

session_type,aid,vote_sum
str,i32,u8
"""14571581_order...",1497245,6
"""14571581_order...",978060,6
"""14571581_order...",174670,6
"""14571581_order...",940217,6
"""14571581_order...",984794,6


In [50]:
%%time
# For every session_type keep the first 20 aids in the current row order of
# `subs`, then render them as one space separated string in `labels`.
preds = (
    subs
    .groupby('session_type')
    .agg([pl.col('aid').head(20).alias('labels')])
    .with_column(
        pl.col('labels').apply(lambda aids: ' '.join(map(str, aids)))
    )
)

CPU times: user 6min 1s, sys: 12.5 s, total: 6min 13s
Wall time: 5min 55s


In [51]:
preds

session_type,labels
str,str
"""13945167_order...","""7213 104092 14..."
"""13041598_carts...","""133141 1239142..."
"""14266615_click...","""969875 1604220..."
"""14045288_order...","""1299318 189805..."
"""13722125_order...","""1385888 103291..."
"""14020669_carts...","""380044 126743 ..."
"""14037277_click...","""1361444 168760..."
"""13802914_click...","""1798076 881286..."
"""13648513_carts...","""318105 476460 ..."
"""14108619_order...","""1688877 106584..."


In [52]:
# Sanity check: the submission must contain exactly this many rows.
assert preds.shape[0] == 5_015_409, 'Wrong length of submission'

In [53]:
# Persist the ensembled predictions as a CSV file.
preds.write_csv('outputs/ensemble04.csv')

In [54]:
# Shell out to `zip` to pack outputs/ensemble04.csv into outputs/ensemble04.zip.
!zip outputs/ensemble04.zip outputs/ensemble04.csv

  adding: outputs/ensemble04.csv (deflated 55%)
