# Efficient Memory Use in Pandas

In [24]:
import pandas as pd
import numpy as np

### Create our data

In [61]:
def get_dataset(size, seed=None):
    """Build a synthetic DataFrame of randomly generated records.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        Seed for a private ``np.random.RandomState`` so the data can be
        reproduced. When ``None`` (the default) the global ``np.random``
        state is used, so existing calls and ``np.random.seed(...)`` setups
        keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        ``size`` rows with the columns ``position``, ``age``, ``team``,
        ``win`` and ``prob``, in that order.
    """
    # The np.random module and RandomState expose the same choice/randint/uniform API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Dict entries are evaluated top to bottom, so the random draws happen in
    # the same order as the columns appear; the frame is built in a single
    # constructor call instead of being grown one column at a time.
    return pd.DataFrame({
        'position': rng.choice(['left', 'middle', 'right'], size),
        'age': rng.randint(1, 50, size),  # upper bound is exclusive: ages 1..49
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'prob': rng.uniform(0, 1, size),
    })

In [62]:
# Generate the one-million-row baseline frame and print its dtypes and memory usage.
df = get_dataset(size=1_000_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   position  1000000 non-null  object 
 1   age       1000000 non-null  int64  
 2   team      1000000 non-null  object 
 3   win       1000000 non-null  object 
 4   prob      1000000 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 38.1+ MB


In [63]:
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

222 ms ± 2.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
225 ms ± 1.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
340 ms ± 7.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


###### Just by casting the `position` column to `category`, we reduce the size from 38 MB to 31 MB

###### Then, by casting the `team` column to `category` as well, we reduce the size from 31 MB to 24 MB

In [64]:
# Start again from a fresh frame and convert both low-cardinality string
# columns to the category dtype in a single astype call.
df = get_dataset(1_000_000).astype({'position': 'category', 'team': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int64   
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int64(1), object(1)
memory usage: 24.8+ MB


##### Int Downcasting Value Range

- ###### int8 can store integers from -128 to 127
- ###### int16 can store integers from -32768 to 32767
- ###### int32 can store integers from -2147483648 to 2147483647
- ###### int64 can store integers from -9223372036854775808 to 9223372036854775807

In [68]:
# Show the largest and then the smallest age, one per line, to pick a safe integer width.
print(df['age'].max(), df['age'].min(), sep='\n')

49
1


###### By casting the `age` column to int8, we reduce the size from 24 MB to 18 MB

In [69]:
# astype('int8') does not complain about values outside the int8 range (they
# wrap around), so check the range explicitly before narrowing the column.
int8_limits = np.iinfo(np.int8)
assert df['age'].between(int8_limits.min, int8_limits.max).all(), "'age' has values outside the int8 range"
df['age'] = df['age'].astype('int8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int8(1), object(1)
memory usage: 18.1+ MB


In [70]:
# Inspect the 'prob' column on its own to see its dtype and memory footprint.
df['prob'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1000000 entries, 0 to 999999
Series name: prob
Non-Null Count    Dtype  
--------------    -----  
1000000 non-null  float64
dtypes: float64(1)
memory usage: 7.6 MB


###### By casting the `prob` column to float32, we reduce the size from 18 MB to 14 MB

In [74]:
# Halve the width of the float column: float64 -> float32; other columns keep their dtype.
df = df.astype({'prob': 'float32'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float32 
dtypes: category(2), float32(1), int8(1), object(1)
memory usage: 14.3+ MB


### Casting to bool (True/False)

In [77]:
# Replace the 'yes'/'no' strings with real booleans.
win_labels = {'yes': True, 'no': False}
# Series.map turns every value missing from the mapping into NaN without any
# warning (which is also what happens if this cell is run a second time), so
# fail loudly before overwriting the column.
assert df['win'].isin(list(win_labels)).all(), "'win' contains values other than 'yes'/'no'"
df['win'] = df['win'].map(win_labels)

In [78]:
# Print the dtypes and total memory usage of df in its current state.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  bool    
 4   prob      1000000 non-null  float32 
dtypes: bool(1), category(2), float32(1), int8(1)
memory usage: 7.6 MB


In [80]:
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

104 ms ± 4.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
101 ms ± 1.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
278 ms ± 30.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
