# В данной работе будет исследоваться:
1. Зависимость количества ДТП от ям/неровностей на дороге
2. ...


## Installing dependencies, importing some of them and setting global variables

In [None]:
%pip install -U pip setuptools pandas numpy plotly rich tqdm geopandas pandas geojson bs4 requests ipywidgets==7.7.1 jupyterlab_widgets pydantic jellyfish shapely chardet dash jupyter-dash
!git clone https://github.com/simp37/Russia_geoJSON

In [None]:
import pickle
from rich import print
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd

import json
import geojson 

import difflib
import jellyfish

import requests
from pathlib import Path

from shapely.geometry import shape, Point
import chardet
# from google.colab import output
# output.enable_custom_widget_manager()

data_accidents_source_url = 'https://dtp-stat.ru/opendata'

## Downloading data
If you already have the data in the `./data` folder, skip this section.

### Download HTML table and save links to list
The source link was defined in the import section:
```python
data_accidents_source_url = 'https://dtp-stat.ru/opendata'
```

In [None]:
from bs4 import BeautifulSoup
import urllib.request

# Fetch the open-data index page and parse it; the HTTP response is closed as
# soon as the markup has been read by the parser.
with urllib.request.urlopen(data_accidents_source_url) as html_page:
    soup = BeautifulSoup(html_page, "html.parser")

# Collect the target of every anchor that points at a `.geojson` file.
download_links = []
for link in soup.find_all('a'):
    link_href = link.get('href')
    # Anchors without an `href` attribute yield None; skip them instead of
    # raising TypeError on the substring test.
    if link_href and '.geojson' in link_href:
        download_links.append(link_href)
print(download_links)

In [None]:
# Dataset slugs: each download link with the media prefix and the `.geojson`
# suffix stripped.
regions = [link.replace('https://cms.dtp-stat.ru/media/opendata/', '').replace('.geojson', '') for link in download_links]
#print(regions)


# Names of the region boundary files found in the cloned `Russia_geoJSON` folder.
russia_regions = [str(region.name).replace('.geojson', '') for region in Path('Russia_geoJSON/').glob('*.geojson')]
# Lower-case every boundary name once instead of once per (slug, name) pair.
_russia_regions_lowered = [(russia_region, russia_region.lower()) for russia_region in russia_regions]

# Map every dataset slug to the boundary file whose name is closest to it by
# Levenshtein distance.
eq_regions = {}
for region in regions:
    _region = region.replace('-oblast', '').replace('respublika-', '')
    if _region in ['krym', 'sevastopol']:
        continue
    # Only candidates closer than 100 edits are accepted; on ties the first
    # candidate encountered wins (strict `<`).
    score = 100
    for russia_region, russia_region_lowered in _russia_regions_lowered:
        distance = jellyfish.levenshtein_distance(_region, russia_region_lowered)
        if distance < score:
            score = distance
            eq_regions[region] = russia_region

# Manual overrides for slugs that are assigned by hand rather than by the
# fuzzy match above.
eq_regions['krasnoiarskii-krai'] =  'Krasnoyarskiiy-kray'
eq_regions['evreiskaia-avtonomnaia-oblast'] = 'Yevreyskaya'
eq_regions['permskii-krai'] = 'Perm'

print(eq_regions)

# NOTE(review): presumably the (wrong) pairs produced by the automatic match,
# kept as the reason for the manual overrides above -- confirm.
# 'krasnoiarskii-krai': 'Krasnodarskiy-kray',
# 'evreiskaia-avtonomnaia-oblast': 'Saratovskaya',
# 'permskii-krai': 'Primorskiy_kray',

### SKIP THIS IF YOU ALREADY HAVE ALL THE DATA
Download every file and store it as a `./data/*.geojson` file

NOTE: `geojson_files` will be overwritten in the next cell

In [None]:
# Download every linked dataset into `data/` and remember the local paths.
geojson_files = []
Path('data').mkdir(exist_ok=True)

for file_link in tqdm(download_links):
    try:
        # The timeout keeps one stalled connection from hanging the whole loop;
        # the status check keeps HTTP error pages from being saved as GeoJSON.
        response = requests.get(file_link, timeout=60)
        response.raise_for_status()
    except requests.RequestException as error:
        # Best effort: report the failed link and carry on with the rest.
        print(f'Skipping {file_link}: {error}')
        continue
    target_path = file_link.replace('https://cms.dtp-stat.ru/media/opendata/', 'data/')
    with open(target_path, 'wb') as file:
        file.write(response.content)
    geojson_files.append(target_path)


## Loading data from files and converting it to `CarAccident` objects

### Load data from files and store it in the `geojson_objects` list

In [None]:
# for those who already have the data downloaded, so you can skip the previous cell
geojson_files = [str(file) for file in Path.cwd().glob('data/*.geojson')]
Path('data/fixed').mkdir(parents=True, exist_ok=True)
geojson_objects = []


def _is_inside(region_shape, feature):
    """Tell whether the feature's point lies inside ``region_shape``.

    Features whose geometry is missing or cannot be turned into a point are
    reported as outside, so they are dropped from the cleaned dataset.
    """
    try:
        return bool(region_shape.contains(Point(feature['geometry']['coordinates'])))
    except Exception:
        # Deliberately broad: any malformed feature is discarded rather than
        # aborting the whole region.
        return False


for file in tqdm(geojson_files):
    # Datasets without a matching boundary file are skipped.
    region_name = eq_regions.get(Path(file).name.replace('.geojson', ''))
    if region_name is None:
        continue

    shape_path = f'Russia_geoJSON/{region_name}.geojson'
    # TODO: change all shape-files encoding to utf-8 and remove chardet
    # Sniff the encoding from the first bytes; the handle is closed right away.
    with open(shape_path, 'rb') as raw_shape:
        shape_encoding = chardet.detect(raw_shape.read(1500))['encoding']
    with open(shape_path, 'r', encoding=shape_encoding) as poly_file:
        shape_data = shape(json.load(poly_file)['geometry'])

    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Keep only the features located inside the region polygon. Building a new
    # list is linear, unlike repeatedly calling `list.remove`.
    data['features'] = [
        feature for feature in tqdm(data['features']) if _is_inside(shape_data, feature)
    ]

    # Write UTF-8 explicitly: the text is not ASCII-escaped and the cleaned
    # files are read back with `encoding='utf-8'`.
    with open(f'data/fixed/{region_name}.geojson', 'w', encoding='utf-8') as dump_data:
        json.dump(data, dump_data, ensure_ascii=False)
    geojson_objects.append(geojson.FeatureCollection(data['features']))


In [None]:

# Shortcut for those who already have the cleaned files on disk: load them
# from `data/fixed` instead of running the previous cell.
geojson_files = list(map(str, Path.cwd().glob('data/fixed/*.geojson')))
geojson_objects = []

for fixed_path in tqdm(geojson_files):
    with open(fixed_path, 'r', encoding='utf-8') as fixed_file:
        collection = json.load(fixed_file)
    geojson_objects.append(geojson.FeatureCollection(collection['features']))

# Show the properties of the first feature as a quick sanity check.
print(geojson_objects[0].features[0]['properties'])



### Create data classes for better data analysis
NOTE that the data classes inherit from `pydantic.BaseModel`

In [None]:
from pydantic import BaseModel

class Participant(BaseModel):
    """A person involved in an accident.

    Every field is optional. The explicit ``None`` defaults keep it that way
    under pydantic v2, where ``X | None`` without a default is a required
    field.
    """

    role: str | None = None
    gender: str | None = None
    violations: list | None = None
    health_status: str | None = None
    years_of_driving_experience: int | None = None

class Vehicle(BaseModel):
    """A vehicle involved in an accident, together with its participants.

    Every field is optional. The explicit ``None`` defaults keep it that way
    under pydantic v2, where ``X | None`` without a default is a required
    field.
    """

    year: int | None = None
    brand: str | None = None
    color: str | None = None
    category: str | None = None
    participants: list[Participant] | None = None

class CarAccident(BaseModel):
    """One accident record, built from a GeoJSON feature's ``properties``.

    Every field is optional. The explicit ``None`` defaults keep it that way
    under pydantic v2, where ``X | None`` without a default is a required
    field.
    """

    id: int | None = None
    tags: list | None = None
    light: str | None = None
    point: dict | None = None
    nearby: list | None = None
    region: str | None = None
    scheme: str | None = None
    address: str | None = None
    weather: list | None = None
    category: str | None = None
    datetime: str | None = None
    severity: str | None = None
    vehicles: list[Vehicle] | None = None
    dead_count: int | None = None
    participants: list[Participant] | None = None
    injured_count: int | None = None
    parent_region: str | None = None
    road_conditions: list | None = None
    participants_count: int | None = None
    participant_categories: list | None = None

In [None]:
# Turn the `properties` mapping of every loaded feature into a CarAccident.
car_accidents = []
for collection in tqdm(geojson_objects):
    car_accidents.extend(
        CarAccident(**entry['properties']) for entry in collection['features']
    )

print(car_accidents[0])

In [None]:
# Flatten the accident models into a table and persist it as CSV.
csv_dir = Path('data/csv')
csv_dir.mkdir(exist_ok=True)

accident_rows = [car_accident.dict() for car_accident in car_accidents]
accidents_df = pd.DataFrame(accident_rows)
accidents_df.to_csv('data/csv/data.csv', index=False)

print(accidents_df.head())

In [5]:
# Load the accidents table from the CSV file at `data/csv/data.csv`.
# NOTE(review): presumably this lets the analysis start here without rebuilding
# the table; nested columns such as `point` appear to come back as text (the
# next cell parses them from strings) -- confirm.
accidents_df = pd.read_csv('data/csv/data.csv')

In [6]:
def lat_or_long(point, dim):
    """Extract one coordinate from an accident's ``point`` value.

    Args:
        point: The ``point`` cell: a mapping, or the text form of one (a
            Python ``dict`` repr or a JSON object).
        dim: Key to read from the mapping, e.g. ``'lat'`` or ``'long'``.

    Returns:
        The value stored under ``dim``, or ``None`` when ``point`` is missing
        or cannot be parsed into a mapping.

    Raises:
        KeyError: If the parsed mapping has no ``dim`` key.
    """
    import ast  # local import keeps this cell self-contained

    if isinstance(point, dict):
        return point[dim]

    text = str(point)
    try:
        # Handles Python reprs, including `None` values and embedded quotes,
        # which the quote-swapping JSON approach cannot parse.
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        try:
            # Fallback for JSON-only literals (true / false / null).
            parsed = json.loads(text.replace("'", '"'))
        except ValueError:
            # Missing cells (e.g. NaN read back from CSV) have no coordinates.
            return None

    if not isinstance(parsed, dict):
        return None
    return parsed[dim]

# Derive one numeric column per coordinate from the `point` column.
for coordinate in ('lat', 'long'):
    accidents_df[coordinate] = accidents_df['point'].apply(
        lambda cell, key=coordinate: lat_or_long(cell, key)
    )

print(accidents_df.head())

In [None]:
import ast

import plotly.express as px


def _road_conditions_weight(conditions):
    """Return the heat-map weight for one ``road_conditions`` cell.

    The cell may be a list of condition names, the text form of such a list
    (as read back from CSV) or a single condition name. Surface defects weigh
    5, an uneven surface weighs 2 and everything else (including missing
    values) weighs 0.1.
    """
    if isinstance(conditions, str):
        try:
            conditions = ast.literal_eval(conditions)
        except (ValueError, SyntaxError):
            # Not a literal: treat the text as one plain condition name.
            conditions = [conditions]
        if isinstance(conditions, str):
            conditions = [conditions]
    if not isinstance(conditions, (list, tuple, set)):
        return 0.1
    if 'Дефекты покрытия' in conditions:
        return 5
    if 'Неровное покрытие' in conditions:
        return 2
    return 0.1


# A plain `==` against the condition name never matches a list (or its text
# form), which would give every accident the minimal weight.
accidents_df['magn'] = accidents_df['road_conditions'].apply(_road_conditions_weight)

fig = px.density_mapbox(accidents_df, lat='lat', lon='long', z='magn', radius=1,
                        center=dict(lat=accidents_df['lat'][0], lon=accidents_df['long'][0]), zoom=3,
                        mapbox_style="open-street-map",
                        height=1200, width=1200,
                        hover_data=['region', 'scheme', 'address', 'weather', 'category', 'datetime', 'severity', 'dead_count', 'injured_count', 'parent_region', 'road_conditions', 'participants_count', 'participant_categories'])


fig.show()
# import dash
# import dash_core_components as dcc
# import dash_html_components as html

# app = dash.Dash()
# app.layout = html.Div([
#     dcc.Graph(figure=fig)
# ])

# try:
#     app.run_server(debug=False, use_reloader=False)  # Turn off reloader if inside Jupyter
# except:
#     app.close()


In [None]:
# Collect the distinct values seen in every field of the CarAccident objects
# (and of their vehicles and vehicle participants), one set per field.
tags_set = set()
light_set = set()
nearby_set = set()
region_set = set()
scheme_set = set()
address_set = set()
weather_set = set()
category_set = set()
severity_set = set()
dead_count_set = set()
injured_count_set = set()
parent_region_set = set()
road_conditions_set = set()
participants_count_set = set()
participant_categories_set = set()

participant_role_set = set()
participant_gender_set = set()
participant_violations_set = set()
participant_health_status_set = set()

vehicle_year_set = set()
vehicle_brand_set = set()
vehicle_color_set = set()  # xD
vehicle_category_set = set()

for accident in tqdm(car_accidents):
    # List-valued fields are optional on the models, so a missing list is
    # treated as empty instead of raising TypeError.
    tags_set.update(accident.tags or ())
    nearby_set.update(accident.nearby or ())
    weather_set.update(accident.weather or ())
    road_conditions_set.update(accident.road_conditions or ())
    participant_categories_set.update(accident.participant_categories or ())

    # Scalar fields are recorded as they are, including None.
    light_set.add(accident.light)
    region_set.add(accident.region)
    scheme_set.add(accident.scheme)
    address_set.add(accident.address)
    category_set.add(accident.category)
    severity_set.add(accident.severity)
    dead_count_set.add(accident.dead_count)
    injured_count_set.add(accident.injured_count)
    parent_region_set.add(accident.parent_region)
    participants_count_set.add(accident.participants_count)

    # Walk vehicles and their participants once instead of once per set.
    for vehicle in accident.vehicles or ():
        vehicle_year_set.add(str(vehicle.year))
        vehicle_brand_set.add(vehicle.brand)
        vehicle_color_set.add(vehicle.color)
        vehicle_category_set.add(vehicle.category)
        for participant in vehicle.participants or ():
            participant_role_set.add(participant.role)
            participant_gender_set.add(participant.gender)
            participant_health_status_set.add(participant.health_status)
            participant_violations_set.update(participant.violations or ())

In [None]:
# print(f'Tags: {tags_set}')
# print(f'Weather: {weather_set}')
# print(f'Participant categories: {participant_categories_set}')
# print(f'Road conditions: {road_conditions_set}')
# print(f'Participant roles: {participant_violations_set}')
# print(f'Vehicle colors: {vehicle_color_set}')

# Sets written to `sets.txt`, in output order. Each name appears exactly once
# (`tags_set` used to be written twice). The region, address and parent-region
# sets are not part of the dump.
sets_to_dump = {
    'tags_set': tags_set,
    'light_set': light_set,
    'nearby_set': nearby_set,
    'scheme_set': scheme_set,
    'weather_set': weather_set,
    'category_set': category_set,
    'severity_set': severity_set,
    'dead_count_set': dead_count_set,
    'injured_count_set': injured_count_set,
    'road_conditions_set': road_conditions_set,
    'participants_count_set': participants_count_set,
    'participant_categories_set': participant_categories_set,
    'participant_role_set': participant_role_set,
    'participant_gender_set': participant_gender_set,
    'participant_violations_set': participant_violations_set,
    'participant_health_status_set': participant_health_status_set,
    'vehicle_year_set': vehicle_year_set,
    'vehicle_brand_set': vehicle_brand_set,
    'vehicle_color_set': vehicle_color_set,
    'vehicle_category_set': vehicle_category_set,
}

with open('sets.txt', 'w+', encoding='utf-8') as f:
    for set_name, values in sets_to_dump.items():
        f.write(f'{set_name} = {values}\n\n')