In [23]:
# Exercise 04  —  Enrichment and transformations
# ----------------------------------------------------------
import pandas as pd
import numpy as np
from pathlib import Path

# Seed NumPy's global RNG so the random Year column below is reproducible
np.random.seed(21)

DATA_DIR   = Path("../data")
AUTO_JSON  = DATA_DIR.joinpath("auto.json")
SURNAME_JS = DATA_DIR.joinpath("surname.json")

# 1) Load auto.json; floats are displayed with two decimal places
pd.set_option("display.float_format", "{:.2f}".format)
fines = pd.read_json(AUTO_JSON)

# 2) Draw 200 extra rows by resampling existing rows with replacement
#    (so no new CarNumber/Make/Model values appear) and append them
sample = (
    fines
    .sample(n=200, replace=True, random_state=21)
    .reset_index(drop=True)
)
concat_rows = pd.concat([fines, sample], ignore_index=True)

# 3) Attach a random integer Year in [1980, 2019] to every row
concat_rows = concat_rows.assign(
    Year=np.random.randint(low=1980, high=2020, size=len(concat_rows))
)

# 4) Build the owners table (one row per distinct CarNumber) without json/re
# surname.json is read with orient="values"; column 0 of the resulting frame
# is used as the source of surnames.
# NOTE(review): if the first row of the file is a header, it is sampled like
# any other surname — confirm against the data file.
surnames_df = pd.read_json(SURNAME_JS, orient="values")
# Upper-case the values and strip every character outside A–Z
surname_vals = (
    surnames_df[0]
    .str.upper()
    .str.replace(r"[^A-Z]", "", regex=True)
)

# Compute the distinct car numbers once and size the surname sample from that
# same Series. Missing values are dropped first: nunique() ignores NaN while
# drop_duplicates() keeps one, so deriving the two lengths independently would
# make the DataFrame constructor fail on mismatched column lengths.
unique_cars = concat_rows["CarNumber"].dropna().drop_duplicates()

# Randomly draw one surname per distinct car, with replacement (names may repeat)
surname_series = surname_vals.sample(
    n=len(unique_cars),
    replace=True,
    random_state=21
).reset_index(drop=True)

owners = pd.DataFrame({
    "CarNumber": unique_cars.values,
    "SURNAME":   surname_series.values
})

# 4b) Introduce deliberate mismatches between the fines and owners tables
# Five synthetic fines rows, appended after owners was built from concat_rows
extra_rows = pd.DataFrame({
    "CarNumber": ["NEW111","NEW222","NEW333","NEW444","NEW555"],
    "Make":      ["Dodge"]*5,
    "Model":     ["Neon"]*5,
    "Refund":    [1]*5,
    "Fines":     [1000]*5,
    "Year":      [1990]*5
})
fines_ext = pd.concat([concat_rows, extra_rows], ignore_index=True)

# Drop the last 20 owners and add three synthetic owner records.
# The new rows are appended with a single concat rather than by enlarging the
# frame one `.loc[len(owners)]` assignment at a time, which only works while
# the index is exactly 0..n-1.
extra_owners = pd.DataFrame(
    [
        ["X123ABC", "KING"],
        ["Y456DEF", "JONES"],
        ["Z789GHI", "WHITE"],
    ],
    columns=owners.columns,
)
owners = pd.concat([owners.iloc[:-20], extra_owners], ignore_index=True)

# 5) Merge fines with owners on CarNumber, once per join type
inner_join, outer_join, left_join, right_join = (
    pd.merge(fines_ext, owners, on="CarNumber", how=join_kind)
    for join_kind in ("inner", "outer", "left", "right")
)

# 6) Pivot table: summed Fines per (Make, Model) row and Year column;
#    cells with no data are filled with 0
pivot = pd.pivot_table(
    fines_ext,
    index=["Make", "Model"],
    columns="Year",
    values="Fines",
    aggfunc="sum",
    fill_value=0,
)

# 7) Write both tables to CSV files in DATA_DIR, omitting the index column
fines_ext.to_csv(DATA_DIR.joinpath("fines.csv"), index=False)
owners.to_csv(DATA_DIR.joinpath("owners.csv"), index=False)


тесты:

In [24]:
# Non-null count per column after appending the 200 resampled rows
concat_rows.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
Year         925
dtype: int64

In [25]:
# Non-null count per column of the raw frame read from auto.json
fines.count()

CarNumber    725
Refund       725
Fines        725
Make         725
Model        716
dtype: int64

In [26]:
# Check that no surname in owners still contains '[', ']' or '"'
bad_chars = set("[]\"")
has_bad = any(
    not bad_chars.isdisjoint(surname) for surname in owners["SURNAME"]
)
print("SURNAME clean of []\"?  ", not has_bad)

SURNAME clean of []"?   True


In [27]:
len(owners)  # measured after dropping 20 owners and adding 3: 531 - 20 + 3 = 514

514

In [28]:
# 925 concatenated rows + 5 synthetic rows = 930
len(fines_ext)

930

In [29]:
# Only fines rows whose CarNumber has an owner; columns = fines_ext columns + SURNAME
inner_join.shape

(903, 7)

In [30]:
# Every fines row plus the owners that match no fines row
outer_join.shape

(933, 7)

In [31]:
# One row per fines_ext row (owners has one row per CarNumber); SURNAME is NaN where no owner matches
left_join.shape

(930, 7)

In [32]:
# Matched fines rows plus the owners that match no fines row
right_join.shape

(906, 7)

In [33]:
# Plain-text dump of the (Make, Model) x Year table of summed fines
print(pivot)

Year                   1980      1981      1982     1983      1984      1985  \
Make       Model                                                               
Dodge      Neon        0.00      0.00      0.00     0.00      0.00      0.00   
Ford       Focus   62394.59 395589.17 140383.76 63100.00 111294.59 189583.76   
           Mondeo      0.00      0.00      0.00     0.00      0.00      0.00   
Skoda      Octavia 12494.59      0.00   6900.00 11594.59   1200.00  10294.59   
Toyota     Camry   18500.00   8594.59      0.00  7200.00      0.00      0.00   
           Corolla     0.00      0.00   2000.00     0.00      0.00      0.00   
Volkswagen Golf    30900.00      0.00      0.00  8594.59    300.00  24000.00   
           Jetta       0.00      0.00      0.00     0.00      0.00      0.00   
           Passat      0.00   4600.00      0.00  3200.00  10000.00   5000.00   
           Touareg     0.00      0.00      0.00     0.00      0.00   5800.00   

Year                   1986      1987  