In [3]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import re
import functools
from geopy.distance import geodesic
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import numpy as np

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

### 1. Подготовка датасета для обучения модели

In [15]:
df = pd.read_csv('data/raw/train.csv', usecols=[0, 1, 2, 9, 10], parse_dates=['datetime'])
df = df.sort_values('datetime', ascending=True)
df.reset_index(drop=True, inplace=True)
df.head(2)

Unnamed: 0,datetime,road_id,road_km,data_source,target
0,2012-01-01 12:00:00,14,1276,gochs,2
1,2012-01-01 22:00:00,9,278,gochs,2


Unnamed: 0,datetime,road_id,road_km,data_source,target
0,2012-01-01 12:00:00,14,1276,gochs,2
1,2012-01-01 22:00:00,9,278,gochs,2


In [16]:
dfc = df[['datetime', 'road_km', 'target', 'road_id']]

#### Обагащение выборки переменными из датасетов `traffic.csv`, `repair.csv`

In [10]:
traffic = pd.read_csv('data/raw/traffic.csv', usecols=[0, 1, 2, 4, 5, 9, 10, 11], parse_dates=['datetime'])
repair = pd.read_csv('data/raw/repair.csv', parse_dates=['datetime'])

In [11]:
def hour_rounder(t):
    return (t.replace(second=0, microsecond=0, minute=0, hour=t.hour)
               +timedelta(hours=t.minute//30))

In [12]:
# округление переменной datetime до часа
traffic['datetime'] = traffic['datetime'].map(lambda x: hour_rounder(x))

In [18]:
# объединение датасета с данными о ДТП с данными о трафике
tmp = pd.merge(dfc, traffic, how='outer', on=['datetime', 'road_km', 'road_id'])
condition1 = (tmp['target'].isnull()) & (tmp['data_id'].isnull())
condition2 = ~(tmp['target'].isnull()) & (tmp['data_id'].isnull())
tmp = tmp.drop(tmp[condition1].index)
tmp = tmp.drop(tmp[condition2].index)
tmp = tmp.fillna(0)
tmp.head(2)

Unnamed: 0,datetime,road_km,target,road_id,data_id,station_id,volume,occupancy,speed
22986,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,448.0,3.74,84.1
22987,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,140.0,4.01,77.1


In [22]:
# добавление данных о ремонте дороги на указанном километре
tmp['year'] = tmp['datetime'].map(lambda x: x.year)
repair['year'] = repair['datetime'].map(lambda x: x.year)
index_repair = tmp[tmp.set_index(['road_km','year', 'road_id']).index.isin(repair.set_index(['road_km', 'year', 'road_id']).index)].index
tmp.loc[index_repair, 'repair'] = 1
tmp = tmp.fillna(0)
tmp.head(2)

Unnamed: 0,datetime,road_km,target,road_id,data_id,station_id,volume,occupancy,speed,year,repair
22986,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,448.0,3.74,84.1,2016,0.0
22987,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,140.0,4.01,77.1,2016,0.0


In [23]:
# выбор переменных
columns = ['datetime',
           'road_km',
           'target',
           'volume',
           'occupancy',
           'speed',
           'repair']
tmp = tmp[columns]
tmp.reset_index(drop=True, inplace=True)
tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7343154 entries, 0 to 7343153
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   datetime   datetime64[ns]
 1   road_km    int64         
 2   target     float64       
 3   volume     float64       
 4   occupancy  float64       
 5   speed      float64       
 6   repair     float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 392.2 MB


### 2. Формирование модели. Обучение. Тестирование.

In [24]:
print("Нулевых значений переменной target: ", tmp[tmp['target'] == 0].shape[0])
print("Ненулевых значений переменной target: ", tmp[tmp['target'] != 0].shape[0])

Нулевых значений переменной target:  7338953
Ненулевых значений переменной target:  4201


In [25]:
# снизим объем выборки, рандомно удалив 98% строк с нулевыми значениями для target
tmp = tmp.drop(tmp[tmp['target'] == 0].sample(frac=.98).index)

In [26]:
tmp_train, tmp_test = train_test_split(tmp, test_size=0.2)
print("Размер выборки для обучения: ", tmp_train.shape)
print("Размер выборки для тестирования: ", tmp_test.shape)

Размер выборки для обучения:  (120784, 7)
Размер выборки для тестирования:  (30196, 7)


Выделим 20% всех данных на тестирование, остальные оставим на обучение

In [27]:
%%time
columns = ['volume',
           'occupancy',
           'speed',
           'repair']
tmp_train_x = tmp_train[columns]
tmp_train_y = tmp_train['target']

tmp_test_x = tmp_test[columns]
tmp_test_y = tmp_test['target']

# clf = svm.SVC(gamma='auto', probability=True)
# clf.fit(tmp_train_x, tmp_train_y)
# predicted_target = clf.predict(tmp_test_x)

CPU times: user 77.6 ms, sys: 28.8 ms, total: 106 ms
Wall time: 109 ms


In [28]:
from catboost import CatBoostClassifier

In [29]:
clf = CatBoostClassifier()
clf.fit(tmp_train_x, tmp_train_y)
predicted_target = clf.predict(tmp_test_x)

Learning rate set to 0.101043
0:	learn: 0.9226664	total: 145ms	remaining: 2m 25s
1:	learn: 0.7921989	total: 317ms	remaining: 2m 37s
2:	learn: 0.6906175	total: 474ms	remaining: 2m 37s
3:	learn: 0.6090765	total: 613ms	remaining: 2m 32s
4:	learn: 0.5420947	total: 727ms	remaining: 2m 24s
5:	learn: 0.4863665	total: 782ms	remaining: 2m 9s
6:	learn: 0.4393521	total: 856ms	remaining: 2m 1s
7:	learn: 0.3993778	total: 971ms	remaining: 2m
8:	learn: 0.3652180	total: 1.03s	remaining: 1m 54s
9:	learn: 0.3357576	total: 1.15s	remaining: 1m 53s
10:	learn: 0.3102568	total: 1.25s	remaining: 1m 51s
11:	learn: 0.2880944	total: 1.34s	remaining: 1m 50s
12:	learn: 0.2688392	total: 1.44s	remaining: 1m 49s
13:	learn: 0.2520200	total: 1.56s	remaining: 1m 50s
14:	learn: 0.2373092	total: 1.64s	remaining: 1m 47s
15:	learn: 0.2244196	total: 1.74s	remaining: 1m 47s
16:	learn: 0.2130879	total: 1.85s	remaining: 1m 46s
17:	learn: 0.2031448	total: 1.96s	remaining: 1m 46s
18:	learn: 0.1944754	total: 2.08s	remaining: 1m 47

In [21]:
# оценим модель на тестово сэмпле
f1_score(tmp_test_y, predicted_target, average='macro')

0.32498807820696235

In [30]:
# оценим модель на тестово сэмпле
f1_score(tmp_test_y, predicted_target, average='macro')

0.329559717027649

### 3. Проверка на основной тестовой выборке (предсказание)

In [None]:
# загрузка выборки
test = pd.read_csv('data/raw/test.csv', parse_dates=['datetime'])
# выбор трассы М-8
# test = test[test["road_id"]==9]

Error: Session cannot generate requests

#### Обагащение выборки переменными из датасетов `traffic.csv`, `repair.csv`

In [33]:
# объединение выборки с данными о трафике
test = pd.merge(test, traffic, how='outer', on=['datetime', 'road_km', 'road_id'])
condition1 = (test['target'].isnull()) & (test['data_id'].isnull())
test = test.drop(test[condition1].index)
test.head(2)

In [None]:
# добавление данных о ремонте дороги на указанном километре
test['year'] = test['datetime'].map(lambda x: x.year)
repair['year'] = repair['datetime'].map(lambda x: x.year)

In [None]:
index_repair = test[test.set_index(['road_km','year', 'road_id']).index.isin(repair.set_index(['road_km','year', 'road_id']).index)].index
test.loc[index_repair, 'repair'] = 1
test['repair'] = test['repair'].fillna(0)
test.head()

Unnamed: 0,datetime,road_id,road_km,target,data_id,station_id,volume,occupancy,speed,year,repair
38,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,87.0,1.0,114.671875,2020,0.0
39,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,207.0,1.0,115.708435,2020,0.0
40,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,43.0,20.0,46.453125,2020,0.0
41,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,195.0,1.0,97.796875,2020,0.0
42,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,89.0,1.0,108.21344,2020,0.0


In [None]:
print("Размер выборки для тестирования: ", test.shape)

Размер выборки для тестирования:  (687181, 11)


In [None]:
# выбор переменных
columns = ['volume',
           'occupancy',
           'speed',
           'repair']

#### Предсказание и оценка модели

In [None]:
%%time
# предсказание значений переменной target
test['target'] =  clf.predict(test[columns])

Wall time: 4min 16s


In [None]:
prediction = test[['datetime','road_id','road_km','target']]
prediction.to_csv('prediction.csv', index=False)