In [17]:
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import zipfile

# Ukol 1: nacteni dat ze ZIP souboru
def load_data(filename: str) -> pd.DataFrame:
    """Load the accident records of every region from a ZIP of ZIP archives.

    ``filename`` is opened as a ZIP archive and every member of it is opened
    as a nested ZIP archive. From each nested archive one ``<code>.csv`` file
    per region is read (cp1250 encoded, ``;`` separated, decimal comma, no
    header row) and tagged with a ``region`` column that holds the region's
    three-letter abbreviation.

    Args:
        filename: Path to the outer ZIP archive.

    Returns:
        All per-file frames concatenated row-wise. The index is not reset, so
        index labels repeat across the source files.

    Raises:
        KeyError: If a nested archive lacks the CSV file of a listed region.
        ValueError: If the outer archive has no members, because there is
            nothing to concatenate.
    """
    # Do not change these constants; they help with loading the files.
    headers = ["p1", "p36", "p37", "p2a", "weekday(p2a)", "p2b", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13a",
                "p13b", "p13c", "p14", "p15", "p16", "p17", "p18", "p19", "p20", "p21", "p22", "p23", "p24", "p27", "p28",
                "p34", "p35", "p39", "p44", "p45a", "p47", "p48a", "p49", "p50a", "p50b", "p51", "p52", "p53", "p55a",
                "p57", "p58", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "n", "o", "p", "q", "r", "s", "t", "p5a"]

    # Region abbreviation -> stem of that region's CSV file name inside each nested archive.
    regions = {
        "PHA": "00",
        "STC": "01",
        "JHC": "02",
        "PLK": "03",
        "ULK": "04",
        "HKK": "05",
        "JHM": "06",
        "MSK": "07",
        "OLK": "14",
        "ZLK": "15",
        "VYS": "16",
        "PAK": "17",
        "LBK": "18",
        "KVK": "19",
    }

    frames = []
    with zipfile.ZipFile(filename, 'r') as outer_zip:
        for member in outer_zip.infolist():
            # Every handle is opened in a `with` block so that it is closed
            # even when reading one of the CSV files fails.
            with outer_zip.open(member) as inner_stream:
                with zipfile.ZipFile(inner_stream) as inner_zip:
                    for reg_name, reg_code in regions.items():
                        with inner_zip.open(f'{reg_code}.csv') as csv_file:
                            frame = pd.read_csv(csv_file, encoding="cp1250", sep=";", quotechar='"',
                                                decimal=",", low_memory=False, names=headers)
                        frame["region"] = reg_name
                        frames.append(frame)

    return pd.concat(frames, axis=0)

In [19]:
# Ukol 2: zpracovani dat
def parse_data(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """Return a de-duplicated, more compactly typed copy of the raw records.

    Steps, in order:
      1. Keep only the first row for every value of ``p1``.
      2. Convert every column to ``category`` except ``p1``, ``p2a``,
         ``p13a``, ``p13b``, ``p13c``, ``p14``, ``p34``, ``p37``, ``p53``,
         ``region``, ``d`` and ``e``.
      3. Convert ``d`` and ``e`` to numbers; unparsable values become NaN.
      4. Add a ``date`` column (``datetime64[ns]``) parsed from ``p2a``.

    Args:
        df: Raw frame; it is not modified.
        verbose: When true, print the deep memory usage of the input and of
            the result in MB (bytes / 1_000_000).

    Returns:
        The new, processed frame.
    """
    keep_dtype = ("p1", "p2a", "p13a", "p13b", "p13c", "p14", "p34", "p37", "p53", "region", "d", "e")
    numeric_cols = ("d", "e")
    categorical_cols = [col for col in df.columns if col not in keep_dtype]

    # drop_duplicates() already returns a new frame, so no up-front copy of
    # the (large) input is needed. astype() with a mapping then builds the
    # typed frame in one step instead of assigning columns into the filtered
    # result.
    parsed = (
        df.drop_duplicates(subset="p1", keep="first")
        .astype({col: "category" for col in categorical_cols})
    )

    for col in numeric_cols:
        parsed[col] = pd.to_numeric(parsed[col], errors="coerce")

    parsed["date"] = pd.to_datetime(parsed["p2a"]).astype('datetime64[ns]')

    if verbose:
        print(f'orig_size={df.memory_usage(deep=True).sum() / 1_000_000} MB')
        print(f'new_size={parsed.memory_usage(deep=True).sum() / 1_000_000} MB')

    return parsed

In [20]:
# Load the raw accident records from the nested ZIP archive; the bare `df`
# on the last line makes the frame the cell's displayed output.
df = load_data("data/data.zip")
df

Unnamed: 0,p1,p36,p37,p2a,weekday(p2a),p2b,p6,p7,p8,p9,...,l,n,o,p,q,r,s,t,p5a,region
0,2100160001,4,,2016-01-01,5,55,1,1,0,2,...,,711403.0,,Souhlasnýsesměremúseku,Pomalý,554782.0,451622.0,GN_V0.1UIR-ADR_410,1,PHA
1,2100160002,4,,2016-01-01,5,130,1,3,0,2,...,,,,,,,,,1,PHA
2,2100160003,5,,2016-01-01,5,100,1,2,0,2,...,,,,,,,,,1,PHA
3,2100160004,6,,2016-01-01,5,120,9,0,0,2,...,,,,,,,,,1,PHA
4,2100160005,6,,2016-01-01,5,2560,2,0,0,2,...,,,,,,,,,1,PHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2461,190906210729,2,606.0,2021-12-27,1,1040,9,0,0,2,...,606,1760003.0,,Opačnýkesměruúseku,Pomalý,560294.0,,GN_V0.1UIR-ADR_410,2,KVK
2462,190906210730,6,,2021-12-28,2,905,2,0,0,2,...,,798429.0,,Souhlasnýsesměremúseku,Pomalý,560499.0,552101.0,GN_V0.1UIR-ADR_410,1,KVK
2463,190906210731,6,,2021-12-30,4,350,3,0,3,2,...,,734801.0,,Opačnýkesměruúseku,Pomalý,560286.0,553824.0,GN_V0.1UIR-ADR_410,1,KVK
2464,190906210732,6,,2021-12-30,4,1035,2,0,0,2,...,,1459025.0,,Opačnýkesměruúseku,Pomalý,560286.0,553476.0,GN_V0.1UIR-ADR_410,1,KVK


In [30]:
# Clean the raw frame; verbose=True also prints the memory usage of the
# input and of the result. The bare `df2` displays the processed frame.
df2 = parse_data(df, verbose=True)
df2

orig_size=814.85093 MB
new_size=326.698752 MB


Unnamed: 0,p1,p36,p37,p2a,weekday(p2a),p2b,p6,p7,p8,p9,...,n,o,p,q,r,s,t,p5a,region,date
0,2100160001,4,,2016-01-01,5,55,1,1,0,2,...,711403.0,,Souhlasnýsesměremúseku,Pomalý,554782.0,451622.0,GN_V0.1UIR-ADR_410,1,PHA,2016-01-01
1,2100160002,4,,2016-01-01,5,130,1,3,0,2,...,,,,,,,,1,PHA,2016-01-01
2,2100160003,5,,2016-01-01,5,100,1,2,0,2,...,,,,,,,,1,PHA,2016-01-01
3,2100160004,6,,2016-01-01,5,120,9,0,0,2,...,,,,,,,,1,PHA,2016-01-01
4,2100160005,6,,2016-01-01,5,2560,2,0,0,2,...,,,,,,,,1,PHA,2016-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2461,190906210729,2,606.0,2021-12-27,1,1040,9,0,0,2,...,1760003.0,,Opačnýkesměruúseku,Pomalý,560294.0,,GN_V0.1UIR-ADR_410,2,KVK,2021-12-27
2462,190906210730,6,,2021-12-28,2,905,2,0,0,2,...,798429.0,,Souhlasnýsesměremúseku,Pomalý,560499.0,552101.0,GN_V0.1UIR-ADR_410,1,KVK,2021-12-28
2463,190906210731,6,,2021-12-30,4,350,3,0,3,2,...,734801.0,,Opačnýkesměruúseku,Pomalý,560286.0,553824.0,GN_V0.1UIR-ADR_410,1,KVK,2021-12-30
2464,190906210732,6,,2021-12-30,4,1035,2,0,0,2,...,1459025.0,,Opačnýkesměruúseku,Pomalý,560286.0,553476.0,GN_V0.1UIR-ADR_410,1,KVK,2021-12-30


In [51]:
# Ukol 3: počty nehod v jednotlivých regionech podle viditelnosti
def plot_visibility(df: pd.DataFrame, fig_location: str = None,
                    show_figure: bool = False):
    """Label the accidents of four regions with a visibility class and print them.

    Work in progress: no figure is produced yet, so ``fig_location`` and
    ``show_figure`` are accepted but currently unused.

    Rows whose ``region`` is ``PHA``, ``JHM``, ``ZLK`` or ``OLK`` are kept and
    receive a new ``visibility`` column derived from ``p19``:
    1 -> ``"day-ok"``; 2 or 3 -> ``"day-bad"``; 4 or 5 -> ``"night-ok"``;
    any other value (including a missing one) -> ``"night-bad"``.

    Args:
        df: Frame with at least the ``region`` and ``p19`` columns; it is not
            modified.
        fig_location: Currently unused.
        show_figure: Currently unused.

    Returns:
        None. The labelled selection is printed to standard output.
    """
    # Region abbreviations must match the ones stored in the "region" column
    # by load_data ("OLK", not "OLM").
    selected_regions = ["PHA", "JHM", "ZLK", "OLK"]

    # NOTE(review): codes 4 and 5 both map to "night-ok" and every remaining
    # code falls through to "night-bad"; the mapping is kept as it was --
    # confirm it against the p19 code list of the data set.
    visibility_by_code = {
        1: "day-ok",
        2: "day-bad",
        3: "day-bad",
        4: "night-ok",
        5: "night-ok",
    }

    # Filter first and copy only the selection, so the input stays untouched
    # without duplicating the whole frame.
    selected = df.loc[df["region"].isin(selected_regions)].copy()

    # A plain lookup over the p19 values replaces the row-wise apply(axis=1)
    # and also works when the selection is empty.
    selected["visibility"] = [
        visibility_by_code.get(code, "night-bad") for code in selected["p19"]
    ]

    # TODO: aggregate and plot; for now the labelled rows are only printed.
    print(selected)
   
   
   

In [52]:
# Run task 3 on the parsed frame; "01_visibility.png" is passed as fig_location.
plot_visibility(df2, "01_visibility.png")

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc4fffefbe0>


In [4]:
# Ukol 4: druh srážky jedoucích vozidel
def plot_direction(df: pd.DataFrame, fig_location: str = None,
                   show_figure: bool = False):
    """Placeholder for task 4 (collision type of moving vehicles).

    Not implemented yet: every argument is ignored and nothing is plotted.

    Returns:
        None.
    """
    return None

In [5]:
# Ukol 5: Následky v čase
def plot_consequences(df: pd.DataFrame, fig_location: str = None,
                    show_figure: bool = False):
    """Placeholder for task 5 (consequences over time).

    Not implemented yet: every argument is ignored and nothing is plotted.

    Returns:
        None.
    """
    return None