In [1]:
import pandas as pd
import numpy as np

def get_dataset(size, seed=None):
    """Build a synthetic DataFrame with ``size`` randomly generated rows.

    Columns, in order:

    * ``position`` -- one of ``'left'``, ``'middle'``, ``'right'``
    * ``age``      -- integer drawn from ``[1, 50)``
    * ``team``     -- one of ``'red'``, ``'blue'``, ``'yellow'``, ``'green'``
    * ``win``      -- ``'yes'`` or ``'no'``
    * ``prob``     -- float drawn uniformly from ``[0, 1)``

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, values are drawn from a private ``RandomState`` seeded
        with this value, so repeated calls return identical frames. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        Frame with ``size`` rows and the five columns listed above.
    """
    # The legacy RandomState exposes the same choice/randint/uniform API as the
    # np.random module, so both the seeded and unseeded paths share one code path.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Dict entries are evaluated top to bottom, so the draw order (and hence the
    # data produced for a given seed) follows the column order.
    return pd.DataFrame({
        'position': rng.choice(['left', 'middle', 'right'], size),
        'age': rng.randint(1, 50, size),
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'prob': rng.uniform(0, 1, size),
    })

In [2]:
# Generate one million synthetic rows, then report column dtypes and memory use.
df = get_dataset(size=1_000_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   position  1000000 non-null  object 
 1   age       1000000 non-null  int32  
 2   team      1000000 non-null  object 
 3   win       1000000 non-null  object 
 4   prob      1000000 non-null  float64
dtypes: float64(1), int32(1), object(3)
memory usage: 34.3+ MB


In [3]:
# Baseline timings on the default dtypes: rank a value column within each group.
# Every timed statement assigns its result into df, so after this cell df also
# carries the age_rank, prob_rank and win_prob_rank columns.
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()

558 ms ± 33.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
615 ms ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
682 ms ± 6.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
def set_dtypes(df):
    """Return a copy of ``df`` with compact dtypes; the input frame is not modified.

    Conversions applied:

    * ``position`` and ``team`` -> ``category``
    * ``age``  -> ``int8``
    * ``prob`` -> ``float16``
    * ``win``  -> ``True`` for ``'yes'``, ``False`` for ``'no'``

    Any other columns are carried over unchanged, and column order is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the converted columns.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.

    Notes
    -----
    ``float16`` keeps only about three significant decimal digits, so distinct
    ``prob`` values can collapse into ties after conversion. ``win`` values
    other than ``'yes'``/``'no'`` have no mapping and become ``NaN``.
    """
    compact = {
        'position': 'category',
        'team': 'category',
        'age': 'int8',
        'prob': 'float16',
    }
    # astype returns a new frame, so the caller's data keeps its original dtypes
    # and re-running the cell on the same input is idempotent.
    converted = df.astype(compact)
    # assign replaces the existing 'win' column in place, preserving column order.
    return converted.assign(win=df['win'].map({'yes': True, 'no': False}))

# set_dtypes has only been defined at this point, not applied, so this reports
# the frame as the timing cell left it: the original dtypes plus the three
# rank columns added by the timed assignments.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   position       1000000 non-null  object 
 1   age            1000000 non-null  int32  
 2   team           1000000 non-null  object 
 3   win            1000000 non-null  object 
 4   prob           1000000 non-null  float64
 5   age_rank       1000000 non-null  float64
 6   prob_rank      1000000 non-null  float64
 7   win_prob_rank  1000000 non-null  float64
dtypes: float64(4), int32(1), object(3)
memory usage: 57.2+ MB


In [7]:
# Regenerate the data and repeat the baseline timings on the default dtypes,
# giving a fresh reference point to compare against the set_dtypes run.
df = get_dataset(1000000)
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()

541 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
587 ms ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
678 ms ± 15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
df = get_dataset(1000000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()



321 ms ± 7.76 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)




468 ms ± 16.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)




497 ms ± 24.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
