In [2]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import re
import functools
from geopy.distance import geodesic
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import numpy as np

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

### 1. Подготовка датасета для обучения модели

In [4]:
df = pd.read_csv('data/raw/train.csv', usecols=[0, 1, 2, 9, 10], parse_dates=['datetime'])
df = df.sort_values('datetime', ascending=True)
df.reset_index(drop=True, inplace=True)
df.head(2)

Unnamed: 0,datetime,road_id,road_km,data_source,target
0,2012-01-01 12:00:00,14,1276,gochs,2
1,2012-01-01 22:00:00,9,278,gochs,2


In [5]:
dfc = df[['datetime', 'road_km', 'target', 'road_id']]

#### Обагащение выборки переменными из датасетов `traffic.csv`, `repair.csv`

In [6]:
traffic = pd.read_csv('data/raw/traffic.csv', usecols=[0, 1, 2, 4, 5, 9, 10, 11], parse_dates=['datetime'])
repair = pd.read_csv('data/raw/repair.csv', parse_dates=['datetime'])

In [7]:
def hour_rounder(t):
    return (t.replace(second=0, microsecond=0, minute=0, hour=t.hour)
               +timedelta(hours=t.minute//30))

In [8]:
# округление переменной datetime до часа
traffic['datetime'] = traffic['datetime'].map(lambda x: hour_rounder(x))

In [9]:
# объединение датасета с данными о ДТП с данными о трафике
tmp = pd.merge(dfc, traffic, how='outer', on=['datetime', 'road_km', 'road_id'])
condition1 = (tmp['target'].isnull()) & (tmp['data_id'].isnull())
condition2 = ~(tmp['target'].isnull()) & (tmp['data_id'].isnull())
tmp = tmp.drop(tmp[condition1].index)
tmp = tmp.drop(tmp[condition2].index)
tmp = tmp.fillna(0)
tmp.head(2)

Unnamed: 0,datetime,road_km,target,road_id,data_id,station_id,volume,occupancy,speed
22986,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,448.0,3.74,84.1
22987,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,140.0,4.01,77.1


In [11]:
# добавление данных о ремонте дороги на указанном километре
tmp['year'] = tmp['datetime'].map(lambda x: x.year)
repair['year'] = repair['datetime'].map(lambda x: x.year)
index_repair = tmp[tmp.set_index(['road_km','year', 'road_id']).index.isin(repair.set_index(['road_km', 'year', 'road_id']).index)].index
tmp.loc[index_repair, 'repair'] = 1
tmp = tmp.fillna(0)
tmp.head(2)

Unnamed: 0,datetime,road_km,target,road_id,data_id,station_id,volume,occupancy,speed,year,repair
22986,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,448.0,3.74,84.1,2016,0.0
22987,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,140.0,4.01,77.1,2016,0.0


In [17]:
tmp['road_id'].unique()

array([ 9, 14])

In [18]:
# выбор переменных
columns = ['datetime',
           'road_km',
           'target',
           'volume',
           'occupancy',
           'speed',
           'repair']
tmpc = tmp[columns]
tmpc.reset_index(drop=True, inplace=True)
tmpc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7343154 entries, 0 to 7343153
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   datetime   datetime64[ns]
 1   road_km    int64         
 2   target     float64       
 3   volume     float64       
 4   occupancy  float64       
 5   speed      float64       
 6   repair     float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 392.2 MB


### 2. Формирование модели. Обучение. Тестирование.

In [19]:
print("Нулевых значений переменной target: ", tmpc[tmpc['target'] == 0].shape[0])
print("Ненулевых значений переменной target: ", tmpc[tmpc['target'] != 0].shape[0])

Нулевых значений переменной target:  7338953
Ненулевых значений переменной target:  4201


In [20]:
# снизим объем выборки, рандомно удалив 98% строк с нулевыми значениями для target
tmpc = tmpc.drop(tmpc[tmpc['target'] == 0].sample(frac=.98).index)

In [22]:
tmp_train, tmp_test = train_test_split(tmpc, test_size=0.2)
print("Размер выборки для обучения: ", tmp_train.shape)
print("Размер выборки для тестирования: ", tmp_test.shape)

Размер выборки для обучения:  (120784, 7)
Размер выборки для тестирования:  (30196, 7)


Выделим 20% всех данных на тестирование, остальные оставим на обучение

In [23]:
%%time
columns = ['volume',
           'occupancy',
           'speed',
           'repair']
tmp_train_x = tmp_train[columns]
tmp_train_y = tmp_train['target']

tmp_test_x = tmp_test[columns]
tmp_test_y = tmp_test['target']

# clf = svm.SVC(gamma='auto', probability=True)
# clf.fit(tmp_train_x, tmp_train_y)
# predicted_target = clf.predict(tmp_test_x)

CPU times: user 8 ms, sys: 1e+03 ns, total: 8.01 ms
Wall time: 7.79 ms


In [24]:
from catboost import CatBoostClassifier

In [25]:
clf = CatBoostClassifier()
clf.fit(tmp_train_x, tmp_train_y)
predicted_target = clf.predict(tmp_test_x)

Learning rate set to 0.101043
0:	learn: 0.9225371	total: 60.2ms	remaining: 1m
1:	learn: 0.7919812	total: 75ms	remaining: 37.4s
2:	learn: 0.6903246	total: 87.3ms	remaining: 29s
3:	learn: 0.6087205	total: 103ms	remaining: 25.7s
4:	learn: 0.5417171	total: 114ms	remaining: 22.6s
5:	learn: 0.4859453	total: 121ms	remaining: 20.1s
6:	learn: 0.4388823	total: 133ms	remaining: 18.9s
7:	learn: 0.3988674	total: 146ms	remaining: 18.2s
8:	learn: 0.3646761	total: 155ms	remaining: 17.1s
9:	learn: 0.3352542	total: 162ms	remaining: 16.1s
10:	learn: 0.3097890	total: 172ms	remaining: 15.5s
11:	learn: 0.2875723	total: 184ms	remaining: 15.2s
12:	learn: 0.2682122	total: 198ms	remaining: 15s
13:	learn: 0.2514804	total: 206ms	remaining: 14.5s
14:	learn: 0.2367677	total: 221ms	remaining: 14.5s
15:	learn: 0.2238614	total: 235ms	remaining: 14.4s
16:	learn: 0.2126076	total: 244ms	remaining: 14.1s
17:	learn: 0.2026269	total: 258ms	remaining: 14.1s
18:	learn: 0.1938605	total: 271ms	remaining: 14s
19:	learn: 0.186189

In [59]:
pd.Series(predicted_target.flatten()).astype(int).value_counts()

0    30190
1        6
dtype: int64

In [26]:
# оценим модель на тестово сэмпле
f1_score(tmp_test_y, predicted_target, average='macro')

0.33033575825277517

In [30]:
# оценим модель на тестово сэмпле
f1_score(tmp_test_y, predicted_target, average='macro')

0.329559717027649

### 3. Проверка на основной тестовой выборке (предсказание)

In [27]:
# загрузка выборки
test = pd.read_csv('data/raw/test.csv', parse_dates=['datetime'])
# выбор трассы М-8
# test = test[test["road_id"]==9]

In [38]:
test1 = pd.read_csv('data/raw/test.csv', parse_dates=['datetime'])

In [39]:
test1.shape

(12437057, 4)

#### Обагащение выборки переменными из датасетов `traffic.csv`, `repair.csv`

In [28]:
# объединение выборки с данными о трафике
test = pd.merge(test, traffic, how='outer', on=['datetime', 'road_km', 'road_id'])
condition1 = (test['target'].isnull()) & (test['data_id'].isnull())
test = test.drop(test[condition1].index)
test.head(2)

Unnamed: 0,datetime,road_id,road_km,target,data_id,station_id,volume,occupancy,speed
38,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,87.0,1.0,114.671875
39,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,207.0,1.0,115.708435


In [29]:
# добавление данных о ремонте дороги на указанном километре
test['year'] = test['datetime'].map(lambda x: x.year)
repair['year'] = repair['datetime'].map(lambda x: x.year)

In [30]:
index_repair = test[test.set_index(['road_km','year', 'road_id']).index.isin(repair.set_index(['road_km','year', 'road_id']).index)].index
test.loc[index_repair, 'repair'] = 1
test['repair'] = test['repair'].fillna(0)
test.head()

Unnamed: 0,datetime,road_id,road_km,target,data_id,station_id,volume,occupancy,speed,year,repair
38,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,87.0,1.0,114.671875,2020,0.0
39,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,207.0,1.0,115.708435,2020,0.0
40,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,43.0,20.0,46.453125,2020,0.0
41,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,195.0,1.0,97.796875,2020,0.0
42,2020-01-01 01:00:00,9,38,,34604682.0,53224.0,89.0,1.0,108.21344,2020,0.0


In [32]:
print("Размер выборки для тестирования: ", test.shape)

Размер выборки для тестирования:  (7343154, 11)


In [33]:
# выбор переменных
columns = ['volume',
           'occupancy',
           'speed',
           'repair']

#### Предсказание и оценка модели

In [63]:
%%time
# предсказание значений переменной target
test['target'] =  clf.predict(test[columns]).astype(int)

CPU times: user 48.4 s, sys: 470 ms, total: 48.9 s
Wall time: 2.85 s


In [37]:
from time import time
prediction = test[['datetime','road_id','road_km','target']][test['target'] != 0]
prediction.to_csv(f'data/predictions/sol_{int(time())}.csv', index=False)

In [64]:
test['target'].value_counts()

0    7341942
1       1212
Name: target, dtype: int64