In [1]:
# === IMPORT KNIŽNÍC ===
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns per frame
# and let text output be up to 120 characters wide.
pd.options.display.max_columns = 100
pd.options.display.width = 120

In [None]:
# === DATA FILE PATHS ===
# All inputs live in a "data" directory relative to the working directory.
DATA_DIR = Path("data")

PATH_ACC = DATA_DIR.joinpath("acc_20.csv")
PATH_PERS = DATA_DIR.joinpath("pers_20.csv")
PATH_VEH = DATA_DIR.joinpath("veh_20.csv")
PATH_HOL = DATA_DIR.joinpath("US Holiday Dates (2004-2021).csv")

# Report, file by file, whether each input is present. A missing file is only
# flagged in the printed output; it does not stop the cell.
for path in (PATH_ACC, PATH_PERS, PATH_VEH, PATH_HOL):
    status = 'OK' if path.exists() else '❌ NOT FOUND, EXTRACT data.zip FIRST!'
    print(f"{path.name}: {status}")

acc_20.csv: OK
pers_20.csv: OK
veh_20.csv: OK
US Holiday Dates (2004-2021).csv: OK


In [None]:
# === LOAD CSV FILES ===
# acc and hol are decoded as UTF-8. pers and veh are decoded as latin1 and read
# with low_memory=False, so column dtypes are inferred from the whole file
# rather than chunk by chunk.
acc = pd.read_csv(PATH_ACC, encoding='utf-8')
pers = pd.read_csv(PATH_PERS, low_memory=False, encoding='latin1')
veh = pd.read_csv(PATH_VEH, low_memory=False, encoding='latin1')
hol = pd.read_csv(PATH_HOL, encoding='utf-8')

# Size check: print (rows, columns) for every loaded table.
for label, frame in (
    ("ACCIDENTS:", acc),
    ("PERSONS:", pers),
    ("VEHICLES:", veh),
    ("HOLIDAYS:", hol),
):
    print(label, frame.shape)


ACCIDENTS: (54745, 80)
PERSONS: (131962, 104)
VEHICLES: (94718, 167)
HOLIDAYS: (342, 6)


In [None]:
# === DATA CLEANING ===

# --- Uniqueness of crashes in ACC ---
case_ids = acc['CASENUM']
print("Počet riadkov:", len(acc))
# dropna=False counts a missing CASENUM as a value of its own. The default
# nunique() drops NaN, which would make the count disagree with the
# duplicate check below.
print("Počet unikátnych CASENUM:", case_ids.nunique(dropna=False))

# is_unique handles NaN the same way duplicated() does, so whenever the
# warning fires the diagnostic table actually contains the offending rows.
if case_ids.is_unique:
    print("✅ Každý riadok v ACC zodpovedá jednej nehode.")
else:
    print("⚠️ Pozor: duplikované nehody!")
    display(acc[case_ids.duplicated(keep=False)].head())

# A missing key is a different problem from a duplicated key, so it gets its
# own message; nothing is printed when every row has a CASENUM.
n_missing_ids = int(case_ids.isna().sum())
if n_missing_ids:
    print("⚠️ Pozor: chýbajúce CASENUM:", n_missing_ids)


# --- Integrity check (CASENUM links between tables) ---
# Every CASENUM referenced from PERS or VEH should also exist in ACC.
acc_ids = set(acc['CASENUM'])
pers_ids = set(pers['CASENUM'])
veh_ids = set(veh['CASENUM'])

# Keys present in the child table but absent from ACC.
missing_in_acc_from_pers = pers_ids - acc_ids
missing_in_acc_from_veh = veh_ids - acc_ids

print("PERS bez ACC:", len(missing_in_acc_from_pers))
print("VEH bez ACC:", len(missing_in_acc_from_veh))

# --- NaN check on the key analysis columns of ACC ---
# NOTE(review): isna() only detects literal NaN. If the dataset encodes
# "unknown" as a sentinel code or label, those rows will not show up here;
# confirm against the data dictionary.
cols_important = ['YEAR', 'MONTH', 'DAY_WEEK', 'HOUR', 'MAXSEV_IMNAME', 'WEATHR_IMNAME']
missing_summary = acc[cols_important].isna().sum()
print("NaN hodnoty v dôležitých stĺpcoch:")
# Show only the columns that actually contain NaN (an empty Series if none do).
print(missing_summary[missing_summary > 0])


Počet riadkov: 54745
Počet unikátnych CASENUM: 54745
✅ Každý riadok v ACC zodpovedá jednej nehode.
PERS bez ACC: 0
VEH bez ACC: 0
