In [5]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import re
import functools
from geopy.distance import geodesic
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import numpy as np

In [32]:
from datetime import timedelta
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

def hour_rounder(t):
    return (t.replace(second=0, microsecond=0, minute=0, hour=t.hour)
               + timedelta(hours=t.minute//30))

def get_data(path, target = True):

    traffic = pd.read_csv('data/raw/traffic.csv', usecols=[0, 1, 2, 4, 5, 9, 10, 11], parse_dates=['datetime'])
    traffic['datetime'] = traffic['datetime'].map(lambda x: hour_rounder(x))
    
    repair = pd.read_csv('data/raw/repair.csv', parse_dates=['datetime'])
    repair['year'] = repair['datetime'].map(lambda x: x.year)

    if target:
        df = pd.read_csv(path, usecols=[0, 1, 2, 9, 10], parse_dates=['datetime'])
        df = df.sort_values('datetime', ascending=True)
        df.reset_index(drop=True, inplace=True)

        dfc = df[['datetime', 'road_km', 'target', 'road_id']]
    else:
        dfc = pd.read_csv(path, parse_dates=['datetime'])

    tmp = pd.merge(dfc, traffic, how='outer', on=['datetime', 'road_km', 'road_id'])

    if target:
        condition1 = (tmp['target'].isnull()) & (tmp['data_id'].isnull())
        condition2 = ~(tmp['target'].isnull()) & (tmp['data_id'].isnull())
        tmp = tmp.drop(tmp[condition2].index)
    else:
        condition1 = (tmp['target'].isnull()) & (tmp['data_id'].isnull())
    
    tmp = tmp.drop(tmp[condition1].index)

    tmp['year'] = tmp['datetime'].map(lambda x: x.year)
    index_repair = tmp[tmp.set_index(['road_km','year', 'road_id']).index.isin(repair.set_index(['road_km', 'year', 'road_id']).index)].index
    tmp.loc[index_repair, 'repair'] = 1
    tmp = tmp.fillna(0)

    columns = ['datetime',
                'road_km',
                'volume',
                'occupancy',
                'speed',
                'repair',
                'target']

    if target:
        tmp = tmp[columns]
        tmp.reset_index(drop=True, inplace=True)

    return tmp

### 2. Формирование модели. Обучение. Тестирование.

In [8]:
tmpc = get_data('data/raw/train1.csv')

             datetime  road_km  target  road_id
0 2012-01-01 12:00:00     1276       2       14
1 2012-01-01 22:00:00      278       2        9
2 2012-01-01 23:00:00       72       2        5
3 2012-01-02 07:00:00     1177       2        5
4 2012-01-02 08:00:00     1344       2        5
             datetime  road_km  target  road_id
0 2012-01-01 12:00:00     1276       2       14
1 2012-01-01 22:00:00      278       2        9
2 2012-01-01 23:00:00       72       2        5
3 2012-01-02 07:00:00     1177       2        5
4 2012-01-02 08:00:00     1344       2        5




In [9]:
print("Нулевых значений переменной target: ", tmpc[tmpc['target'] == 0].shape[0])
print("Ненулевых значений переменной target: ", tmpc[tmpc['target'] != 0].shape[0])

Нулевых значений переменной target:  7338398
Ненулевых значений переменной target:  4756


In [10]:
tmpc = tmpc.drop(tmpc[tmpc['target'] == 0].sample(frac=.98).index)

In [11]:
tmp_train, tmp_test = train_test_split(tmpc, test_size=0.2)
print("Размер выборки для обучения: ", tmp_train.shape)
print("Размер выборки для тестирования: ", tmp_test.shape)

Размер выборки для обучения:  (121219, 7)
Размер выборки для тестирования:  (30305, 7)


In [12]:
%%time
columns = ['volume',
           'occupancy',
           'speed',
           'repair']
tmp_train_x = tmp_train[columns]
tmp_train_y = tmp_train['target']

tmp_test_x = tmp_test[columns]
tmp_test_y = tmp_test['target']

CPU times: user 6.96 ms, sys: 0 ns, total: 6.96 ms
Wall time: 6.93 ms


In [13]:
from catboost import CatBoostClassifier

In [14]:
clf = CatBoostClassifier(class_weights=[1, 5, 10])
clf.fit(tmp_train_x, tmp_train_y)
predicted_target = clf.predict(tmp_test_x)

Learning rate set to 0.101061
0:	learn: 0.9669043	total: 66.7ms	remaining: 1m 6s
1:	learn: 0.8679991	total: 80.7ms	remaining: 40.3s
2:	learn: 0.7907465	total: 94ms	remaining: 31.3s
3:	learn: 0.7292481	total: 107ms	remaining: 26.6s
4:	learn: 0.6792563	total: 120ms	remaining: 23.9s
5:	learn: 0.6383913	total: 129ms	remaining: 21.3s
6:	learn: 0.6041563	total: 142ms	remaining: 20.1s
7:	learn: 0.5752169	total: 156ms	remaining: 19.4s
8:	learn: 0.5507982	total: 169ms	remaining: 18.7s
9:	learn: 0.5301846	total: 183ms	remaining: 18.1s
10:	learn: 0.5126132	total: 196ms	remaining: 17.7s
11:	learn: 0.4977955	total: 214ms	remaining: 17.6s
12:	learn: 0.4851944	total: 230ms	remaining: 17.4s
13:	learn: 0.4745701	total: 243ms	remaining: 17.1s
14:	learn: 0.4652669	total: 256ms	remaining: 16.8s
15:	learn: 0.4576307	total: 267ms	remaining: 16.4s
16:	learn: 0.4511100	total: 280ms	remaining: 16.2s
17:	learn: 0.4454305	total: 296ms	remaining: 16.1s
18:	learn: 0.4406232	total: 309ms	remaining: 16s
19:	learn: 0

In [42]:
pd.concat({
    'predicted': pd.Series(predicted_target.flatten()).astype(int).value_counts(), 
    'truth': tmp_test_y.value_counts()
}, axis=1)

Unnamed: 0,predicted,truth
0.0,30204,29394
1.0,81,742
2.0,20,169


In [39]:
f1_score(tmp_test_y, predicted_target, average='macro')

0.37085750981782706

#### Предсказание и оценка модели

In [33]:
test = get_data('data/raw/test1.csv', target=False)

In [34]:
columns = ['volume',
           'occupancy',
           'speed',
           'repair']

In [35]:
test['target'] = clf.predict(test[columns]).astype(int)

In [36]:
test['target'].value_counts()

0    7324358
1      15070
2       3726
Name: target, dtype: int64

In [37]:
from time import time
prediction = test[['datetime','road_id','road_km','target']][test['target'] != 0]
prediction.to_csv(f'data/predictions/sol_{int(time())}.csv', index=False)

In [43]:
test['target'].value_counts()

0    7324358
1      15070
2       3726
Name: target, dtype: int64