In [126]:
import pandas as pd
import gc

> Checking for zeros and replacing them

In [127]:
# Load the fines dataset, then show every row where either the fine amount
# or the refund count is exactly zero.
fines = pd.read_csv('../ex04/fines.csv')
has_zero = fines['Fines'].eq(0) | fines['Refund'].eq(0)
fines[has_zero]

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
925,M777EP777RUS,0,0.0,Aurus,Senat,2018


In [128]:
# Substitute the zero values (Refund 0 -> 1, Fines 0 -> 100) so the
# Fines / Refund * Year calculation further down never divides by zero.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `fines['col']`: an in-place method on a
# column selection is chained assignment, which warns in recent pandas and
# does not modify `fines` at all once Copy-on-Write is enabled.
fines['Refund'] = fines['Refund'].replace(0, 1)
fines['Fines'] = fines['Fines'].replace(0, 100)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
925,M777EP777RUS,1,100.00,Aurus,Senat,2018
926,L007OX07RUS,3,20000.00,Nissan,Juke,2010
927,G040AY300RUS,3,50000.00,MINI,Cooper,2001
928,A802YE666RUS,3,6666.00,Daewoo,Matiz,1998


Iterations
---------------

#### Loop approach

In [129]:
%%timeit
# Benchmark 1: plain Python loop over positional indices.
# Every iteration pulls one row with `iloc`, evaluates Fines / Refund * Year
# for it and appends the result to a list.
arr = []
for i in range(0, len(fines)):
    row = fines.iloc[i]
    arr.append(row['Fines'] / row['Refund'] * row['Year'])
# Store the collected values as the 'loop' column of `fines`.
fines['loop'] = arr

81 ms ± 841 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


> 82.9 ms ± 563 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

---


#### iterrows()

In [130]:
%%timeit
# Benchmark 2: same per-row calculation, but rows come from `iterrows()`;
# the index label it yields is not needed and is discarded as `_`.
arr = []
for _, row in fines.iterrows():
    arr.append(row['Fines'] / row['Refund'] * row['Year'])
# Store the collected values as the 'iterrows' column of `fines`.
fines['iterrows'] = arr

33.7 ms ± 253 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


> 34.6 ms ± 409 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

---


#### apply() and lambda function

In [131]:
%%timeit
# Benchmark 3: row-wise `apply` (axis=1) calling a lambda that evaluates
# Fines / Refund * Year for each row; the result becomes the 'apply' column.
fines['apply'] = fines.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

11.5 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


> 11.8 ms ± 77.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

---


#### Series objects

In [132]:
%%timeit
# Benchmark 4: whole-column arithmetic on the Series objects, with no
# explicit Python-level iteration over rows.
fines['Series'] = fines['Fines'] / fines['Refund'] * fines['Year']

215 µs ± 3.49 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


> 214 µs ± 3.16 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

---


#### Series.values

In [133]:
%%timeit
# Benchmark 5: the same whole-column arithmetic, performed on the arrays
# returned by `.values` rather than on the Series objects themselves.
fines['Series.values'] = fines['Fines'].values / fines['Refund'].values * fines['Year'].values

90.8 µs ± 1.01 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


> 91.4 µs ± 665 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

---


In [134]:
# Display the frame to compare the five result columns side by side
# (loop, iterrows, apply, Series, Series.values).
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,loop,iterrows,apply,Series,Series.values
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,3.182400e+06,3.182400e+06,3.182400e+06,3.182400e+06,3.182400e+06
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,1.296750e+07,1.296750e+07,1.296750e+07,1.296750e+07,1.296750e+07
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,4.166400e+06,4.166400e+06,4.166400e+06,4.166400e+06,4.166400e+06
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,2.015000e+06,2.015000e+06,2.015000e+06,2.015000e+06,2.015000e+06
4,92918M178RUS,1,5700.00,Ford,Focus,2014,1.147980e+07,1.147980e+07,1.147980e+07,1.147980e+07,1.147980e+07
...,...,...,...,...,...,...,...,...,...,...,...
925,M777EP777RUS,1,100.00,Aurus,Senat,2018,2.018000e+05,2.018000e+05,2.018000e+05,2.018000e+05,2.018000e+05
926,L007OX07RUS,3,20000.00,Nissan,Juke,2010,1.340000e+07,1.340000e+07,1.340000e+07,1.340000e+07,1.340000e+07
927,G040AY300RUS,3,50000.00,MINI,Cooper,2001,3.335000e+07,3.335000e+07,3.335000e+07,3.335000e+07,3.335000e+07
928,A802YE666RUS,3,6666.00,Daewoo,Matiz,1998,4.439556e+06,4.439556e+06,4.439556e+06,4.439556e+06,4.439556e+06


In [135]:
# All five benchmark columns were produced by the same formula, so keep only
# one of them: drop the other four and rename 'loop' to 'Calculations'.
redundant_columns = ['iterrows', 'apply', 'Series', 'Series.values']
fines.drop(redundant_columns, axis=1, inplace=True)
fines.rename({'loop': 'Calculations'}, axis='columns', inplace=True)
fines.head(1)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculations
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989,3182400.0


Indexing
-------------

#### without index

In [136]:
%%timeit
# Lookup benchmark without an index: build a boolean mask by comparing the
# CarNumber column against the plate, then select the matching rows.
fines[fines['CarNumber'] == 'O136HO197RUS']

249 µs ± 1.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


> 254 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

---


#### with index

In [137]:
# Turn the CarNumber column into the row index (modifying `fines` in place)
# so that a plate can be looked up by label, then preview the first rows.
fines.set_index(keys='CarNumber', inplace=True)
fines.head(n=3)

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculations
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989,3182400.0
E432XX77RUS,1,6500.0,Toyota,Camry,1995,12967500.0
7184TT36RUS,1,2100.0,Ford,Focus,1984,4166400.0


In [138]:
%%timeit
# Lookup benchmark with an index: select the same plate by label through
# `.loc`, relying on CarNumber having been set as the index beforehand.
fines.loc['O136HO197RUS']

117 µs ± 1.21 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


> 121 µs ± 2.35 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

---


Downcasting
--------------

In [139]:
# Baseline dtype and memory report for `fines`, taken before any
# optimization; memory_usage='deep' requests pandas' deep memory introspection.
fines.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to X017CM69RUS
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Refund        930 non-null    int64  
 1   Fines         930 non-null    float64
 2   Make          930 non-null    object 
 3   Model         919 non-null    object 
 4   Year          930 non-null    int64  
 5   Calculations  930 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 236.0 KB


In [140]:
# Work on a copy so the original `fines` frame keeps its dtypes.
optimized = fines.copy()

# Collect the float and integer column labels up front.
fcols = optimized.select_dtypes('float').columns
icols = optimized.select_dtypes('integer').columns

# Downcast each group of columns to the smallest dtype of its own kind.
for cols, kind in ((fcols, 'float'), (icols, 'integer')):
    optimized[cols] = optimized[cols].apply(pd.to_numeric, downcast=kind)

# Report dtypes and memory usage after downcasting.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to X017CM69RUS
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Refund        930 non-null    int8   
 1   Fines         930 non-null    float32
 2   Make          930 non-null    object 
 3   Model         919 non-null    object 
 4   Year          930 non-null    int16  
 5   Calculations  930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(2)
memory usage: 216.9 KB


Categories
------------

In [141]:
# Labels of the remaining object-dtype columns.
ocols = optimized.select_dtypes('object').columns

# Convert them one at a time to the 'category' dtype.
for name in ocols:
    optimized[name] = optimized[name].astype('category')

# Report dtypes and memory usage after the conversion.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to X017CM69RUS
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Refund        930 non-null    int8    
 1   Fines         930 non-null    float32 
 2   Make          930 non-null    category
 3   Model         919 non-null    category
 4   Year          930 non-null    int16   
 5   Calculations  930 non-null    float32 
dtypes: category(2), float32(2), int16(1), int8(1)
memory usage: 109.0 KB


Memory clean
-------------

In [142]:
%reset_selective fines
gc.collect()

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


1931

In [143]:
# Show that `fines` no longer exists. The NameError is the expected outcome,
# so it is caught and printed rather than left to abort "Run All".
try:
    fines
except NameError as err:
    print(f'NameError: {err}')
else:
    print("'fines' is still defined")

NameError: name 'fines' is not defined

In [144]:
# `optimized` was created as an independent copy of `fines`, so it is still
# available after `fines` was removed; display it to confirm.
optimized

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculations
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus,1989,3.182400e+06
E432XX77RUS,1,6500.000000,Toyota,Camry,1995,1.296750e+07
7184TT36RUS,1,2100.000000,Ford,Focus,1984,4.166400e+06
X582HE161RUS,2,2000.000000,Ford,Focus,2015,2.015000e+06
92918M178RUS,1,5700.000000,Ford,Focus,2014,1.147980e+07
...,...,...,...,...,...,...
M777EP777RUS,1,100.000000,Aurus,Senat,2018,2.018000e+05
L007OX07RUS,3,20000.000000,Nissan,Juke,2010,1.340000e+07
G040AY300RUS,3,50000.000000,MINI,Cooper,2001,3.335000e+07
A802YE666RUS,3,6666.000000,Daewoo,Matiz,1998,4.439556e+06
