In [2]:
# Exercise 05  —  Pandas optimizations
# ----------------------------------------------------------
import pandas as pd
import gc
from pathlib import Path

# Input location: the "data" directory one level above the working directory.
DATA_DIR = Path("..", "data")
FINES_FILE = DATA_DIR.joinpath("fines.csv")

# Read the whole CSV into memory using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=FINES_FILE)

In [3]:
# --- ITERATIONS --------------------------------------------------------------

def expr(x):
    """Return ``(Fines / Refund) * Year`` for the given object.

    ``x`` only needs to support lookup by the keys ``"Fines"``, ``"Refund"``
    and ``"Year"``: a single row (Series), a whole DataFrame, or a plain
    mapping all work. The result has whatever type the arithmetic on those
    three values produces.
    """
    return (x["Fines"] / x["Refund"]) * x["Year"]


def loop_calc(frame):
    """Apply ``expr`` to every row of ``frame`` with an explicit Python loop.

    This is deliberately the naive row-by-row baseline for the timing
    comparison, so it is kept as a positional loop rather than vectorized.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``Fines``, ``Refund`` and ``Year``.

    Returns
    -------
    list
        One value per row, in row order; an empty list for an empty frame.
    """
    out = []
    for i in range(len(frame)):
        # Positional access materializes a new row Series on every pass.
        row = frame.iloc[i]
        out.append(expr(row))
    return out

In [4]:
%%timeit
# 1) Baseline: plain Python loop over row positions (see loop_calc).

df["metric_loop"] = loop_calc(df)


18.5 ms ± 387 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [5]:
%%timeit
# 2) Row by row via DataFrame.iterrows(); the index label is discarded and
#    expr is applied to each row Series.
df["metric_iterrows"] = [expr(row) for _, row in df.iterrows()]

14.8 ms ± 268 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%%timeit
# 3) Row by row via DataFrame.apply with axis=1 (expr receives one row at a time).
df["metric_apply"] = df.apply(expr, axis=1)

3.47 ms ± 42.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
%%timeit
# 4) Vectorized: the arithmetic is applied to whole pandas Series at once.
df["metric_vector"] = df["Fines"] / df["Refund"] * df["Year"]

100 μs ± 1.3 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [8]:
%%timeit
# 5) Vectorized on the arrays returned by .values instead of on the Series.
d = df["Fines"].values / df["Refund"].values * df["Year"].values
df["metric_vals"] = d

# 5b) "strange" column: the same expression, cast straight to float32.
#     This column is selected later when the `optimized` frame is built.
# NOTE(review): this statement sits in the same %%timeit cell as 5), so the
# reported time covers both computations — confirm that is intended.
df["strange"] = (df["Fines"] / df["Refund"] * df["Year"]).astype("float32")


197 μs ± 9.62 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [9]:
%%timeit
# --- INDEXING ---------------------------------------------------------------

# 6) Lookup without an index: boolean mask over the CarNumber column.
_ = df[df["CarNumber"] == "O136HO197RUS"]



148 μs ± 3.48 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [10]:
# New frame with CarNumber as the index, used for the label-based lookup;
# set_index is called without inplace, so `df` itself is not modified.
df_indexed = df.set_index("CarNumber")

In [11]:

%%timeit
# 7) Lookup by index label with .loc on the CarNumber-indexed frame.
_ = df_indexed.loc["O136HO197RUS"]

30.2 μs ± 317 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [12]:
# --- DOWNCASTING ------------------------------------------------------------
# Independent copy restricted to the seven columns under study.
optimized = df.loc[
    :, ["CarNumber", "Refund", "Fines", "Make", "Model", "Year", "strange"]
].copy()

# Let pandas pick the smallest numeric dtype that still holds each column.
optimized = optimized.assign(
    Refund=pd.to_numeric(optimized["Refund"], downcast="integer"),
    Fines=pd.to_numeric(optimized["Fines"], downcast="float"),
    Year=pd.to_numeric(optimized["Year"], downcast="integer"),
)

# The three text columns are stored as categoricals.
optimized = optimized.astype(
    {"CarNumber": "category", "Make": "category", "Model": "category"}
)

# Report dtypes and the deep (actual) memory footprint.
optimized.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CarNumber  930 non-null    category
 1   Refund     930 non-null    int8    
 2   Fines      930 non-null    float32 
 3   Make       930 non-null    category
 4   Model      919 non-null    category
 5   Year       930 non-null    int16   
 6   strange    930 non-null    float32 
dtypes: category(3), float32(2), int16(1), int8(1)
memory usage: 63.3 KB


In [13]:
# --- CATEGORIES -------------------------------------------------------------

# Convert every column that is still object dtype into a categorical;
# columns with any other dtype are not selected and stay as they are.
optimized = optimized.astype(
    {name: "category" for name in optimized.select_dtypes(include="object").columns}
)

print("\nAfter categories:")
optimized.info(memory_usage="deep")


After categories:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CarNumber  930 non-null    category
 1   Refund     930 non-null    int8    
 2   Fines      930 non-null    float32 
 3   Make       930 non-null    category
 4   Model      919 non-null    category
 5   Year       930 non-null    int16   
 6   strange    930 non-null    float32 
dtypes: category(3), float32(2), int16(1), int8(1)
memory usage: 63.3 KB


In [14]:
%reset_selective -f df
# --- MEMORY CLEAN -----------------------------------------------------------

# удаляем df
gc.collect()

0

tests:

In [None]:
# df

NameError: name 'df' is not defined