In [1]:
import pandas as pd

from skelo.model.elo import EloEstimator

In [2]:
df = pd.read_csv('/home/daniel/Downloads/bq-results-20221008-142606-1665239275955.csv')

In [3]:
df.head(3)

Unnamed: 0,wizepair2_uuid,assay_chembl_id,standard_type,standard_change,standard_relation_1,standard_value_1,molecule_chembl_id_1,standard_relation_2,standard_value_2,molecule_chembl_id_2,f0_,f1_,radius
0,60464d5292bbba795a2428cd27ad7833,CHEMBL752162,Ki,increase,=,664000.0,CHEMBL1235287,=,1223000.0,CHEMBL280690,1a09b8ff812b2451b761bc53994c2677,071801fba5cd6bbdfca6cc049e47e0f1,3
1,410dbc1620307b618895ffb6eae23b6f,CHEMBL900468,IC50,increase,=,184000.0,CHEMBL236687,=,282000.0,CHEMBL237994,a7819ba0c0604bc6b3f2193c43a20177,e41cb4886145842788680dc951e672e0,1
2,622e41e027086bfb3346185feba42c9f,CHEMBL1673901,Inhibition,increase,=,5.0,CHEMBL1673084,=,10.0,CHEMBL1673074,794232d3bbea41006db74ae326800268,4e171e0fcd1260b2cfc785de2c842caf,2


In [4]:
df['molecule_chembl_num_1'] = df.molecule_chembl_id_1.str.extract('(\d+)')
df['molecule_chembl_num_2'] = df.molecule_chembl_id_2.str.extract('(\d+)')

In [5]:
df['timestamp'] = df[['molecule_chembl_num_1', 'molecule_chembl_num_2']].max(axis=1)

In [6]:
def elo(df):
    
    # create a table of winner/loser in that order
    df_winners = df.apply(
        lambda x: 
        pd.Series({'winner': x.f0_, 'loser':x.f1_}) if x.standard_change=='increase' else 
        pd.Series({'winner': x.f1_, 'loser':x.f0_}) if x.standard_change=='decrease' else 
        None, axis=1)
    if isinstance(df_winners, pd.Series): return None
    df_winners = df_winners.join(df)[['wizepair2_uuid', 'timestamp', 'winner', 'loser']].dropna()
    
    # use index as timeseries
    df_winners.sort_values('timestamp', inplace=True)
    labels = len(df_winners) * [1]
    
    # fit model
    model = EloEstimator(
        key1_field="winner",
        key2_field="loser",
        timestamp_field="timestamp",
        initial_time=0
    ).fit(df_winners, labels)
    
    # return proba
    df_winners['proba'] = model.transform(df_winners, output_type='prob', strict_past_data=True)
    return df_winners[['wizepair2_uuid', 'proba']]
    

In [7]:
df_elo = df.groupby('assay_chembl_id').apply(elo)

In [8]:
df_elo.proba.quantile([0, 0.1, 0.5, 0.9, 1])

0.0    0.004088
0.1    0.447386
0.5    0.519082
0.9    0.746442
1.0    0.996977
Name: proba, dtype: float64

In [9]:
def eloo(df):
    
    loo = list(pd.DataFrame())
    for mol in pd.concat([df.molecule_chembl_id_1, df.molecule_chembl_id_2]).unique():
    
        # create a table of winner/loser in that order
        df_winners = df.apply(
            lambda x: 
            pd.Series({'winner': x.f0_, 'loser':x.f1_}) if x.standard_change=='increase' else 
            pd.Series({'winner': x.f1_, 'loser':x.f0_}) if x.standard_change=='decrease' else 
            None, axis=1)
        if isinstance(df_winners, pd.Series): continue
        df_winners = df_winners.join(df)[['wizepair2_uuid', 'molecule_chembl_id_1', 'molecule_chembl_id_2', 'winner', 'loser']].dropna()

        # use loo group as timeseries
        df_winners['timestamp'] = ((df_winners.molecule_chembl_id_1==mol) | (df_winners.molecule_chembl_id_2==mol)).astype(int)
        df_winners.sort_values('timestamp', inplace=True)
        labels = len(df_winners) * [1]

        # fit mode
        model = EloEstimator(
            key1_field="winner",
            key2_field="loser",
            timestamp_field="timestamp",
            initial_time=0
        ).fit(df_winners, labels)

        # return proba
        df_winners['proba'] = model.transform(df_winners, output_type='prob', strict_past_data=True)
        loo.append(df_winners[df_winners.timestamp==1][['wizepair2_uuid', 'proba']])

    try: return pd.concat(loo)
    except ValueError: return None

In [10]:
df_elo = df.groupby('assay_chembl_id').apply(eloo)

In [11]:
df_elo.proba.quantile([0, 0.1, 0.5, 0.9, 1])

0.0    0.002039
0.1    0.397556
0.5    0.551946
0.9    0.800660
1.0    0.997855
Name: proba, dtype: float64