In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from catboost import Pool

from data import get_data
from train import train_loop

In [2]:
# Columns handed to the data loader and to CatBoost pools as categorical features.
cats = ['repair', 'night', 'month', 'weekday']

# Columns removed from the frames before modelling; they are still used later
# when the non-zero predictions are written to disk.
drop_cols = ['road_id', 'road_km', 'datetime']

In [3]:
# Load the training CSV through the project helper `data.get_data`.
# `cats` is forwarded to the helper; presumably it casts those columns to a
# categorical-friendly dtype — TODO confirm in data.py.
train = get_data('data/raw/train1.csv', cats=cats)

In [4]:
# Preview the first rows of the loaded training frame.
train.head()

Unnamed: 0,datetime,road_km,target,road_id,data_id,station_id,volume,occupancy,speed,year,repair,night,month,weekday
0,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,448.0,3.74,84.1,2016,0,0,1,1
1,2016-01-05 09:00:00,790,2.0,9,9554029.0,41104.0,140.0,4.01,77.1,2016,0,0,1,1
2,2016-01-06 14:00:00,1151,1.0,9,9566108.0,41103.0,296.0,4.16,61.7,2016,0,0,1,2
3,2016-01-06 14:00:00,1151,1.0,9,9566108.0,41103.0,736.0,12.46,65.4,2016,0,0,1,2
4,2016-01-06 14:00:00,1151,1.0,9,9566108.0,41103.0,432.0,7.67,56.9,2016,0,0,1,2


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier, Pool
import pandas as pd

def train_loop(data, cats, class_weights=[1, 1, 1], random_state=None,
               zero_drop_frac=.98, test_size=0.2):
    """Fit a CatBoost classifier on a down-sampled frame and report macro F1.

    NOTE: defining this function in the notebook shadows the ``train_loop``
    that the first cell imports from ``train``.

    The function randomly drops ``zero_drop_frac`` of the rows whose
    ``target`` equals 0, splits what is left into a training part and a
    hold-out part, fits the classifier on the training part, predicts the
    hold-out part, and prints the macro-averaged F1 score together with the
    per-class counts of predicted vs. true labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``target`` column; every other column is used as
        a feature.
    cats : list
        Passed to ``catboost.Pool`` as ``cat_features``.
    class_weights : list, optional
        Forwarded unchanged to ``CatBoostClassifier``. The default list is
        shared between calls but is never mutated by this function.
    random_state : int or None, optional
        Seed used for the down-sampling, the train/hold-out split and the
        classifier. ``None`` (default) keeps the original unseeded behaviour;
        pass an integer to make the printed score reproducible.
    zero_drop_frac : float, optional
        Fraction of ``target == 0`` rows to discard before splitting.
    test_size : float, optional
        Share of the remaining rows held out for evaluation.

    Returns
    -------
    tuple
        ``(clf, score)`` — the fitted classifier and its macro F1 on the
        hold-out part.
    """
    # Randomly discard most of the target == 0 rows before splitting.
    zero_rows = data[data['target'] == 0]
    dropped_index = zero_rows.sample(
        frac=zero_drop_frac, random_state=random_state
    ).index
    data = data.drop(dropped_index)

    train, test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    print("Размер выборки для обучения: ", train.shape)
    print("Размер выборки для тестирования: ", test.shape)

    train_x = train.drop('target', axis=1)
    train_y = train['target']

    test_x = test.drop('target', axis=1)
    test_y = test['target']

    train_pool = Pool(train_x, train_y, cat_features=cats)
    test_pool = Pool(test_x, test_y, cat_features=cats)

    clf = CatBoostClassifier(
        class_weights=class_weights,
        logging_level='Silent',
        random_seed=random_state,
    )
    clf.fit(train_pool)

    prediction = clf.predict(test_pool)

    score = f1_score(test_y, prediction, average='macro')

    print(f"F1-score: {score}")

    # Side-by-side class counts; predictions are flattened to 1-D first so
    # they can be wrapped in a Series.
    print(pd.concat({
        'predicted': pd.Series(prediction.flatten()).astype(int).value_counts(),
        'truth': test_y.value_counts()
    }, axis=1))

    return clf, score

In [7]:
# Train on the feature columns only (helper columns in `drop_cols` removed).
# This calls the `train_loop` defined in the notebook, which shadows the one
# imported from `train` in the first cell.
clf, score = train_loop(train.drop(columns=drop_cols), cats=cats)

Размер выборки для обучения:  (121219, 11)
Размер выборки для тестирования:  (30305, 11)
F1-score: 0.36799489809435015
     predicted  truth
0.0      30222  29355
1.0         82    771
2.0          1    179


In [8]:
# Load the test CSV with the same helper. `target=False` presumably tells the
# helper that this file has no label column — TODO confirm in data.py.
test = get_data('data/raw/test1.csv', target=False, cats=cats)

In [9]:
# Build the inference pool from the feature columns only.
# 'target' is dropped as well (silently skipped when absent): this cell writes
# its predictions into test['target'], so without this a re-run would feed
# the previous predictions back to the model as an extra feature.
test_pool = Pool(
    test.drop(columns=drop_cols).drop(columns='target', errors='ignore'),
    cat_features=cats,
)
# ravel() hands pandas a 1-D array, mirroring the flatten() applied to the
# classifier output inside train_loop.
test['target'] = clf.predict(test_pool).astype(int).ravel()

In [10]:
# Number of test rows assigned to each predicted class.
test['target'].value_counts()

0    7335607
1       7469
2         78
Name: target, dtype: int64

In [10]:
import os
from time import time

# Keep only the rows predicted as a non-zero class, with the columns that
# identify them plus the predicted label.
prediction = test.loc[
    test['target'] != 0,
    ['datetime', 'road_id', 'road_km', 'target'],
]

# to_csv does not create missing directories; make sure the output folder
# exists so the write cannot fail after the expensive prediction step.
os.makedirs('data/predictions', exist_ok=True)
# The Unix timestamp in the file name keeps earlier result files from being
# overwritten.
prediction.to_csv(f'data/predictions/sol_{int(time())}.csv', index=False)

In [11]:
# Preview the test frame, which now carries the predicted `target` column.
test.head()

Unnamed: 0,datetime,road_id,road_km,target,data_id,station_id,volume,occupancy,speed,year,repair,night,month,weekday
38,2020-01-01 01:00:00,9,38,0,34604682.0,53224.0,87.0,1.0,114.671875,2020,0.0,1,1,2
39,2020-01-01 01:00:00,9,38,0,34604682.0,53224.0,207.0,1.0,115.708435,2020,0.0,1,1,2
40,2020-01-01 01:00:00,9,38,0,34604682.0,53224.0,43.0,20.0,46.453125,2020,0.0,1,1,2
41,2020-01-01 01:00:00,9,38,0,34604682.0,53224.0,195.0,1.0,97.796875,2020,0.0,1,1,2
42,2020-01-01 01:00:00,9,38,0,34604682.0,53224.0,89.0,1.0,108.21344,2020,0.0,1,1,2
