In [13]:
import pandas as pd, numpy as np

## Ensemble notebook

### Validation recall and ensemble weight tuning (tuned on orders)

In [14]:
def get_recall(input_df, type_mode='orders', test_labels=None, top_k=20):
    """Compute recall@top_k of scored candidates against the validation labels.

    For every session the ``top_k`` highest-scoring ``aid`` values are kept and
    compared with that session's ground truth. The result is
    ``sum(hits) / sum(min(len(ground_truth), top_k))`` over all label rows whose
    ``type`` equals ``type_mode``. Sessions that have labels but no candidates
    contribute zero hits.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the columns ``session``, ``aid`` and ``score``; any other
        columns are ignored. The frame is not modified.
    type_mode : str, default 'orders'
        Value of the ``type`` column of the labels to evaluate.
    test_labels : pandas.DataFrame, optional
        Labels with the columns ``session``, ``type`` and ``ground_truth``
        (an iterable of aids per row). When omitted, the labels are read from
        ``../data/input/otto/otto-validation/test_labels.parquet``. Pass a
        preloaded frame to avoid re-reading the file on every call.
    top_k : int, default 20
        Number of candidates kept per session and cap on the per-row
        ground-truth count.

    Returns
    -------
    float
        The recall, or ``nan`` when there is no ground truth for ``type_mode``.
    """
    if test_labels is None:
        test_labels = pd.read_parquet('../data/input/otto/otto-validation/test_labels.parquet')
    truth = test_labels.loc[test_labels['type'] == type_mode, ['session', 'ground_truth']]

    # Only sessions that have labels can produce hits, so drop the rest before sorting.
    has_labels = input_df['session'].isin(truth['session'])
    candidates = input_df.loc[has_labels, ['session', 'aid', 'score']]

    # Multi-key sort_values is stable, so candidates with equal scores keep their
    # input order when the top_k cut is applied.
    ranked = candidates.sort_values(['session', 'score'], ascending=[True, False])
    top = ranked.groupby('session').head(top_k)

    # session -> set of predicted aids (plain Python ints via tolist()).
    predicted = {}
    for session, aid in zip(top['session'].tolist(), top['aid'].astype('int64').tolist()):
        predicted.setdefault(session, set()).add(aid)

    no_prediction = frozenset()
    hits = 0
    gt_total = 0
    for session, ground_truth in zip(truth['session'].tolist(), truth['ground_truth'].tolist()):
        hits += len(set(ground_truth) & predicted.get(session, no_prediction))
        gt_total += min(len(ground_truth), top_k)

    if gt_total == 0:
        return float('nan')
    return hits / gt_total

def set_baseline_to_zero(df):
    """Shift scores so that the lowest score within each session is zero.

    Returns a new frame with the same columns and a fresh 0..n-1 index; the
    input frame is left untouched. Used to put the scores of different models
    on a common per-session baseline before they are blended.
    """
    # Per-row minimum score of the row's own session.
    session_floor = df.groupby('session')['score'].transform('min')
    rebased = df['score'] - session_floor
    return df.assign(score=rebased).reset_index(drop=True)

In [15]:
# tetsuro order results
# Load the out-of-fold scores, keep the three columns used for blending and
# rebase each session's scores so that its minimum is zero.
path = '../logs/LB0592/score/valid_score/oof_lgbm_orders.parquet'
df_tetsuro = set_baseline_to_zero(
    pd.read_parquet(path).loc[:, ['session', 'aid', 'score']]
)

In [16]:
# Preview the first rows of the rebased tetsuro scores (session, aid, score).
df_tetsuro.head()

Unnamed: 0,session,aid,score
0,11098528,11830,12.622729
1,11098528,1732105,10.291255
2,11098528,588923,9.169872
3,11098528,876129,8.802416
4,11098528,884502,8.694285


In [23]:
# gunes
# NOTE(review): this file is a pickle. Unpickling can execute arbitrary code, so
# only load it when it comes from a trusted source.
path = './gunes/val_predictions_order.pkl'
# pd.read_pickle is the direct API for a pickled DataFrame; np.load with
# allow_pickle=True only reached the same object through NumPy's pickle fallback.
df_gunes = pd.read_pickle(path)
# Align the column names with df_tetsuro so both frames can be merged on
# ['session', 'aid'] and scored by get_recall.
df_gunes = df_gunes.rename(columns={'candidates': 'aid', 'predictions': 'score'})
# Rebase each session's scores so that its minimum is zero.
df_gunes = set_baseline_to_zero(df_gunes)

In [24]:
# Stand-alone recall of each model, as a reference for the blended scores.
recall_tetsuro = get_recall(df_tetsuro)
print('tetsuro original recall: ', recall_tetsuro)
recall_gunes = get_recall(df_gunes)
print('gunes original recall: ', recall_gunes)

tetsuro original recall:  0.66254392712486
gunes original recall:  0.6591414700784863


In [25]:
# merge
# Outer join keeps candidates proposed by only one model; their missing score
# from the other model becomes 0, i.e. the per-session baseline.
df_merged = (
    df_tetsuro
    .merge(df_gunes, how='outer', on=['session', 'aid'])
    .fillna({'score_x': 0, 'score_y': 0})
)

In [26]:
# Show the merged frame: score_x is the tetsuro score, score_y the gunes score.
df_merged

Unnamed: 0,session,aid,score_x,score_y
0,11098528,11830,12.622729,11.204971
1,11098528,1732105,10.291255,8.226718
2,11098528,588923,9.169872,7.166492
3,11098528,876129,8.802416,7.558273
4,11098528,884502,8.694285,7.278060
...,...,...,...,...
140320399,12899778,1251602,0.000000,1.284612
140320400,12899778,185021,0.000000,2.985011
140320401,12899778,1761908,0.000000,0.522776
140320402,12899778,977658,0.000000,2.198438


In [27]:
# Sweep the blend weight: w is applied to score_x (tetsuro) and 1 - w to
# score_y (gunes); the order recall is printed for every setting.
weight_list = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
for w in weight_list:
    w_other = 1-w
    print('weight=', w, w_other)
    # Temporary blended column consumed by get_recall, removed again below.
    df_merged['score'] = w * df_merged['score_x'] + w_other * df_merged['score_y']
    recall = get_recall(df_merged)
    print('order recall: ', recall)
    df_merged = df_merged.drop(columns=['score'])

weight= 0 1
order recall:  0.6591414700784863
weight= 0.2 0.8
order recall:  0.6619279100423552
weight= 0.4 0.6
order recall:  0.663086532845201
weight= 0.6 0.4
order recall:  0.6636419057589618
weight= 0.8 0.19999999999999996
order recall:  0.6635940287836376
weight= 1.0 0.0
order recall:  0.6626556400672831


In [None]:
# memo
# weight= 0 1 order recall:  0.6591414700784863
# weight= 0.2 0.8 order recall:  0.6619279100423552
# weight= 0.4 0.6 order recall:  0.663086532845201
# weight= 0.6 0.4 order recall:  0.6636419057589618
# weight= 0.8 0.19999999999999996 order recall:  0.6635940287836376
# weight= 1.0 0.0 order recall:  0.6626556400672831


## Submission

In [28]:
# TODO
# Weights taken from the validation sweep above: 0.6 / 0.4 gave the highest
# order recall (0.66364).
# NOTE(review): presumably ordered [tetsuro, gunes], matching
# score_x * w + score_y * (1 - w) — confirm where the list is consumed.
final_weight_list = [0.6, 0.4]