## Exercise 05 - Pandas optimizations


In [1]:
import pandas as pd
import gc


## Load fines data


In [2]:
# Load the fines dataset from CSV and preview its first rows.
df = pd.read_csv('../data/fines.csv')
df.head()


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014


## Iterations - loop with range, iloc, and append to list


In [3]:
def calc_loop(df):
    """Compute ``Fines / Refund * Year`` for every row with a positional loop.

    Rows are fetched one at a time with ``iloc`` and each per-row result is
    appended to a plain Python list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the 'Fines', 'Refund' and 'Year' columns.

    Returns
    -------
    list
        One value per row, in row order.
    """
    metrics = []
    n_rows = len(df)
    for position in range(n_rows):
        record = df.iloc[position]
        fines, refund, year = record['Fines'], record['Refund'], record['Year']
        metrics.append(fines / refund * year)
    return metrics


In [4]:
%%timeit
# Benchmark the positional-loop version; the result is written to the 'metric' column.
df['metric'] = calc_loop(df)


26.7 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Iterations - using iterrows()


In [5]:
def calc_iterrows(df):
    """Compute ``Fines / Refund * Year`` for every row via ``DataFrame.iterrows()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the 'Fines', 'Refund' and 'Year' columns.

    Returns
    -------
    list
        One value per row, in iteration order.
    """
    # iterrows() yields (index label, row) pairs; only the row is needed here.
    return [
        record['Fines'] / record['Refund'] * record['Year']
        for _, record in df.iterrows()
    ]


In [6]:
%%timeit
# Benchmark the iterrows() version; the result is written to the 'metric' column.
df['metric'] = calc_iterrows(df)


25 ms ± 430 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Iterations - using apply() with a lambda


In [7]:
def row_metric(row):
    """Return ``Fines / Refund * Year`` for a single row.

    Parameters
    ----------
    row : mapping-like
        Anything that supports ``row['Fines']``, ``row['Refund']`` and
        ``row['Year']`` lookups.

    Returns
    -------
    The computed metric for that row.
    """
    fines_per_refund = row['Fines'] / row['Refund']
    return fines_per_refund * row['Year']


In [8]:
%%timeit
# axis=1 makes apply() call the function once per row; the lambda simply
# forwards each row to row_metric.
df['metric'] = df.apply(lambda row: row_metric(row), axis=1)


5.44 ms ± 208 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Iterations - using Series operations


In [9]:
%%timeit
# Column-wise arithmetic on whole Series, with no explicit Python loop.
df['metric'] = df['Fines'] / df['Refund'] * df['Year']


130 μs ± 3.26 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Iterations - using .values from Series


In [10]:
%%timeit
# Take each column's underlying array via .values and do the arithmetic on
# those arrays directly; only the final result is assigned back to the frame.
fines_values = df['Fines'].values
refund_values = df['Refund'].values
year_values = df['Year'].values
df['metric'] = fines_values / refund_values * year_values


58.5 μs ± 1.18 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Indexing performance


In [11]:
# Car number used as the lookup key in the indexing benchmarks.
test_car = 'O136HO197RUS'


In [12]:
%%timeit
# Lookup by boolean mask: 'CarNumber' is compared against test_car for every row.
df[df['CarNumber'] == test_car]


176 μs ± 2.42 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [13]:
# Build a separate frame that uses 'CarNumber' as its index, for label-based lookups.
df_indexed = df.set_index('CarNumber')
df_indexed.head()


Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,metric
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2.0,3200.0,Ford,Focus,1989,3182400.0
E432XX77RUS,1.0,6500.0,Toyota,Camry,1995,12967500.0
7184TT36RUS,1.0,2100.0,Ford,Focus,1984,4166400.0
X582HE161RUS,2.0,2000.0,Ford,Focus,2015,2015000.0
92918M178RUS,1.0,5700.0,Ford,Focus,2014,11479800.0


In [14]:
%%timeit
# Same lookup, this time by index label on the 'CarNumber'-indexed frame.
df_indexed.loc[test_car]


34.8 μs ± 321 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Downcasting floats and integers


In [15]:
# Baseline dtypes and memory footprint; 'deep' also measures the contents of object columns.
df.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    float64
 2   Fines      930 non-null    float64
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int64  
 6   metric     930 non-null    float64
dtypes: float64(3), int64(1), object(3)
memory usage: 182.1 KB


In [16]:
# Work on a copy so the dtype changes in the following cells do not touch df.
optimized_df = df.copy()


In [17]:
float_cols = optimized_df.select_dtypes(include=['float64']).columns
# Narrow every 64-bit float column to 32 bits, one column at a time.
for col in float_cols:
    optimized_df[col] = optimized_df[col].astype('float32')


In [18]:
int_cols = optimized_df.select_dtypes(include=['int64']).columns
# Apply pd.to_numeric column by column so each int64 column gets the smallest
# integer dtype that can hold its values.
optimized_df[int_cols] = optimized_df[int_cols].apply(pd.to_numeric, downcast='integer')


In [19]:
# Dtypes and memory footprint after downcasting the numeric columns.
optimized_df.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    float32
 2   Fines      930 non-null    float32
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int16  
 6   metric     930 non-null    float32
dtypes: float32(3), int16(1), object(3)
memory usage: 165.8 KB


## Categories for object columns


In [20]:
object_cols = optimized_df.select_dtypes(include=['object']).columns
# Convert all object columns in one call; astype('category') on a frame builds
# the categories for each column separately.
optimized_df[object_cols] = optimized_df[object_cols].astype('category')

optimized_df.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CarNumber  930 non-null    category
 1   Refund     930 non-null    float32 
 2   Fines      930 non-null    float32 
 3   Make       930 non-null    category
 4   Model      919 non-null    category
 5   Year       930 non-null    int16   
 6   metric     930 non-null    float32 
dtypes: category(3), float32(3), int16(1)
memory usage: 66.0 KB


## Memory cleanup with gc and %reset_selective


In [21]:
gc.collect()
del df
gc.collect()
%reset_selective -f df
