# В данной работе будет исследоваться:
1. Зависимость количества ДТП от ям/неровностей на дороге
2. ...


## Installing dependencies, importing some of them and setting global variables

In [None]:
# Install or upgrade the third-party packages used by this notebook
# (pip flags: -q = quiet output, -U = upgrade already installed packages).
%pip install -qU setuptools pandas numpy plotly rich tqdm geopandas geojson bs4 requests ipywidgets jupyterlab_widgets pydantic

In [None]:
import pickle
from rich import print
from tqdm.notebook import tqdm
from pathlib import Path
# Uncomment the two lines below when running in Google Colab.
# NOTE(review): presumably required there for the tqdm/ipywidgets progress
# bars to render - confirm.
# from google.colab import output
# output.enable_custom_widget_manager()

# Page that is fetched and scanned for links to `.geojson` files in the
# "Downloading data" section.
data_accidents_source_url = 'https://dtp-stat.ru/opendata'

## Downloading data
If you already have the data in the `./data` folder, skip this section.

### Download HTML table and save links to list
The source link was defined in the imports section:
```python
data_accidents_source_url = 'https://dtp-stat.ru/opendata'
```

In [None]:
from bs4 import BeautifulSoup
import urllib.request

# Fetch the open-data page and parse it. The response is used as a context
# manager so the underlying connection is closed once parsing is done.
with urllib.request.urlopen(data_accidents_source_url) as html_page:
    soup = BeautifulSoup(html_page, "html.parser")

# Collect every link that points at a .geojson file.
# `href=True` restricts the search to <a> tags that actually carry an href;
# for the others `link.get('href')` is None and the `in` test would raise
# TypeError.
download_links = []
for link in soup.find_all('a', href=True):
    link_href = link['href']
    if '.geojson' in link_href:
        download_links.append(link_href)
print(download_links)

### Download every file and store it as `./data/*.geojson`
NOTE: `geojson_files` will be reassigned in the next cell

In [None]:
import requests
from pathlib import Path
from urllib.parse import urlsplit

# Directory that receives the downloaded files; created on first run.
data_dir = Path('data')
data_dir.mkdir(exist_ok=True)

# Paths (as strings) of the files written by this cell.
geojson_files = []

for file_link in tqdm(download_links):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(file_link, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an error page as a .geojson file.
    response.raise_for_status()

    # Name the local file after the last path segment of the URL, so the cell
    # does not depend on one particular URL prefix.
    target = data_dir / Path(urlsplit(file_link).path).name
    target.write_bytes(response.content)
    geojson_files.append(target.as_posix())


## Loading data from files and converting it to `CarAccident` objects

### Load data from the files and store it in the `geojson_objects` list

In [None]:
import json
import geojson

# For those who already have the data downloaded (so the download cell can be
# skipped): rebuild the file list from whatever is in ./data.
# sorted() makes the processing order deterministic; glob() order is arbitrary.
geojson_files = sorted(str(file) for file in Path.cwd().glob('data/*.geojson'))
if not geojson_files:
    # Without this guard the sample print below fails with a bare IndexError.
    raise FileNotFoundError(
        "No *.geojson files found in ./data - run the 'Downloading data' section first"
    )

# One FeatureCollection per file, in the same order as `geojson_files`.
geojson_objects = []

for file in tqdm(geojson_files):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    geojson_objects.append(geojson.FeatureCollection(data['features']))

# Show the properties of one feature as a sample of the record layout.
print(geojson_objects[0].features[0]['properties'])

### Create data classes for better data analysis
NOTE that the data classes inherit from `pydantic.BaseModel`

In [None]:
from pydantic import BaseModel

class Participant(BaseModel):
    """A person involved in a car accident.

    Used as the element type of ``Vehicle.participants`` and
    ``CarAccident.participants``.

    Every field is nullable and defaults to ``None``. Without the explicit
    default, pydantic v2 treats an ``X | None`` field as required, so a
    record that lacks one of the keys would fail validation.
    """

    role: str | None = None
    gender: str | None = None
    # Untyped list: the element type is not validated.
    violations: list | None = None
    health_status: str | None = None
    years_of_driving_experience: int | None = None

class Vehicle(BaseModel):
    """A vehicle involved in a car accident.

    Used as the element type of ``CarAccident.vehicles``.

    Every field is nullable and defaults to ``None``. Without the explicit
    default, pydantic v2 treats an ``X | None`` field as required, so a
    record that lacks one of the keys would fail validation.
    """

    year: int | None = None
    brand: str | None = None
    color: str | None = None
    category: str | None = None
    # People attached to this vehicle in the source record.
    participants: list[Participant] | None = None

class CarAccident(BaseModel):
    """A single car accident record.

    Instances are built from the ``properties`` mapping of a GeoJSON feature
    (``CarAccident(**feature['properties'])``), so the field names mirror the
    keys of that mapping.

    Every field is nullable and defaults to ``None``. Without the explicit
    default, pydantic v2 treats an ``X | None`` field as required, so a
    record that lacks one of the keys would fail validation.

    Fields annotated with a bare ``list`` / ``dict`` accept any element
    types; their contents are not validated.
    """

    id: int | None = None
    tags: list | None = None
    light: str | None = None
    point: dict | None = None
    nearby: list | None = None
    region: str | None = None
    scheme: str | None = None
    address: str | None = None
    weather: list | None = None
    category: str | None = None
    # Kept as the raw string from the source; not parsed into a datetime object.
    datetime: str | None = None
    severity: str | None = None
    vehicles: list[Vehicle] | None = None
    dead_count: int | None = None
    # Accident-level participants, separate from those nested in `vehicles`.
    participants: list[Participant] | None = None
    injured_count: int | None = None
    parent_region: str | None = None
    road_conditions: list | None = None
    participants_count: int | None = None
    participant_categories: list | None = None

In [None]:
# Turn the `properties` mapping of every GeoJSON feature into a CarAccident.
# The progress bar advances once per loaded feature collection.
car_accidents = []
for collection in tqdm(geojson_objects):
    car_accidents.extend(
        CarAccident(**feature['properties'])
        for feature in collection['features']
    )

# Show one parsed record as a sanity check.
print(car_accidents[0])

In [None]:
# Collect the distinct values seen for every field, one set per field.

# Accident-level fields.
tags_set = set()
light_set = set()
nearby_set = set()
region_set = set()
scheme_set = set()
address_set = set()
weather_set = set()
category_set = set()
severity_set = set()
dead_count_set = set()
injured_count_set = set()
parent_region_set = set()
road_conditions_set = set()
participants_count_set = set()
participant_categories_set = set()

# Participant-level fields.
participant_role_set = set()
participant_gender_set = set()
participant_violations_set = set()
participant_health_status_set = set()
participant_years_of_driving_experience_set = set()

# Vehicle-level fields.
vehicle_year_set = set()
vehicle_brand_set = set()
vehicle_color_set = set()
vehicle_category_set = set()

for accident in tqdm(car_accidents):
    # Scalar fields: record the value itself (None included).
    light_set.add(accident.light)
    region_set.add(accident.region)
    scheme_set.add(accident.scheme)
    address_set.add(accident.address)
    category_set.add(accident.category)
    severity_set.add(accident.severity)
    dead_count_set.add(accident.dead_count)
    injured_count_set.add(accident.injured_count)
    parent_region_set.add(accident.parent_region)
    participants_count_set.add(accident.participants_count)

    # List fields: record each element. The fields are nullable, and
    # set.update(None) raises TypeError, hence the `or ()` fallback.
    tags_set.update(accident.tags or ())
    nearby_set.update(accident.nearby or ())
    weather_set.update(accident.weather or ())
    road_conditions_set.update(accident.road_conditions or ())
    participant_categories_set.update(accident.participant_categories or ())

    # NOTE(review): only participants nested inside vehicles are scanned;
    # `accident.participants` is not - confirm whether that is intended.
    for vehicle in accident.vehicles or ():
        # The year is stored as a string, so a missing year becomes 'None'.
        vehicle_year_set.add(str(vehicle.year))
        vehicle_brand_set.add(vehicle.brand)
        vehicle_color_set.add(vehicle.color)
        vehicle_category_set.add(vehicle.category)

        for participant in vehicle.participants or ():
            participant_role_set.add(participant.role)
            participant_gender_set.add(participant.gender)
            participant_health_status_set.add(participant.health_status)
            participant_violations_set.update(participant.violations or ())
            # This set was declared and printed but never filled before.
            participant_years_of_driving_experience_set.add(
                participant.years_of_driving_experience
            )

In [None]:
# Print every collected set as `<variable name> = <values>`, in the order
# accident-level, participant-level, vehicle-level.
collected_sets = {
    'tags_set': tags_set,
    'light_set': light_set,
    'nearby_set': nearby_set,
    'region_set': region_set,
    'scheme_set': scheme_set,
    'address_set': address_set,
    'weather_set': weather_set,
    'category_set': category_set,
    'severity_set': severity_set,
    'dead_count_set': dead_count_set,
    'injured_count_set': injured_count_set,
    'parent_region_set': parent_region_set,
    'road_conditions_set': road_conditions_set,
    'participants_count_set': participants_count_set,
    'participant_categories_set': participant_categories_set,
    'participant_role_set': participant_role_set,
    'participant_gender_set': participant_gender_set,
    'participant_violations_set': participant_violations_set,
    'participant_health_status_set': participant_health_status_set,
    'participant_years_of_driving_experience_set': participant_years_of_driving_experience_set,
    'vehicle_year_set': vehicle_year_set,
    'vehicle_brand_set': vehicle_brand_set,
    'vehicle_color_set': vehicle_color_set,
    'vehicle_category_set': vehicle_category_set,
}

for set_name, set_values in collected_sets.items():
    print(f'{set_name} = {set_values}')