## Libraries

In [1]:
%reset -f

In [2]:
# Import Libraries

# General Libraries
import datetime as dt
import os
import joblib

import holidays
import numpy as np
import pandas as pd

pd.set_option("display.float_format", lambda x: "%.2f" % x)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
np.random.seed(10)

import matplotlib as mpl
import matplotlib.pyplot as plt

# Data Visualization
import seaborn as sns

sns.set()
plt.style.use("seaborn-darkgrid")
sns.set(style="darkgrid")
%matplotlib inline

import pickle
from itertools import product
from math import sqrt

from geopy.distance import geodesic
from scipy.spatial.distance import pdist, squareform
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Metrics
from sklearn.metrics import f1_score, mean_squared_error, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV

# Prepossing Libraries
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from workalendar.europe import Russia

# Models
from xgboost import XGBRegressor
import lightgbm as lgb

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

DATA_PATH = "../data/3_track/"
MODEL_PATH = "../models/"
SUBMIT_PATH = "../submit/"

In [3]:
def dfOutlook(df):
    print("----------Top-5- Record----------")
    display(df.head(5))
    print("-----------Information-----------")
    print(df.info())
    print("-----------Data Types-----------")
    print(df.dtypes)
    print("----------Missing value-----------")
    print(df.isnull().sum())
    print("----------Null value-----------")
    print(df.isna().sum())
    print("----------Shape of Data----------")
    print(df.shape)
    print("----------Number of dupliacte rows----------")
    print(len(df[df.duplicated()]))


def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype in "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float16)
    df[int_cols] = df[int_cols].astype(np.int16)
    return df


def convertDate(
    df,
    cols,
    prefix,
    form="%Y-%m-%d %H:%M:%S",
    suffix=[
        "year",
        "month",
        "day",
        "hour",
        "minute",
        "second",
        "dotw",
        "workDay",
        "week",
    ],
):
    cal = Russia()

    tmp = {
        x: dt.datetime.strptime(x, form) for x in pd.unique(df[cols].values.ravel("K"))
    }
    cash = {
        x: {
            "date": tmp[x],
            "year": tmp[x].year,
            "month": tmp[x].month,
            "day": tmp[x].day,
            "hour": tmp[x].hour,
            "minute": tmp[x].minute,
            "second": tmp[x].second,
            "dotw": tmp[x].isoweekday(),
            "workDay": cal.is_working_day(tmp[x]) * 1,
            "week": tmp[x].isocalendar()[1],
        }
        for x in tmp.keys()
    }

    for newCol, oldCol in zip(prefix, cols):
        for suf in suffix:
            df[newCol + "_" + suf] = df[oldCol].apply(lambda x: cash[x][suf])

    return df


def df_crossjoin(df1, df2, **kwargs):
    df1["_tmpkey"] = 1
    df2["_tmpkey"] = 1

    res = pd.merge(df1, df2, on="_tmpkey", **kwargs).drop("_tmpkey", axis=1)
    #     res.index = pd.MultiIndex.from_product((df1.index, df2.index))

    df1.drop("_tmpkey", axis=1, inplace=True)
    df2.drop("_tmpkey", axis=1, inplace=True)

    return res


def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    return df


def evaluate_macroF1_lgb(truth, predictions):
    # this follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483
    pred_labels = predictions.reshape(len(np.unique(truth)), -1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average="macro")
    return ("macroF1", f1, True)


def lgb_f1_score(y_hat, data):
    
    y_true = data.get_label()
#     print(y_true.shape)
    
#     print(y_hat.shape)
#     print(y_hat[range(9)])
    y_hat = y_hat.reshape((4, y_true.shape[0])).argmax(axis=0)
#     y_hat = np.round(y_hat)
    print(y_true.shape)
    print(y_hat.shape)
    
    return "f1", f1_score(y_true, y_hat, average="macro"), True

def lgb_f1_grad_hess(y_hat, data):
    
    y_true = data.get_label()

    y_hat = np.reshape((y_true.shape[0],4)).transpose().argmax(axis=1)
    
    return "f1", f1_score(y_true, y_hat, average="macro"), True

def learning_rate_power_0997(current_iter):
    base_learning_rate = 0.1
    min_learning_rate = 0.02
    lr = base_learning_rate * np.power(0.99, current_iter)
    return max(lr, min_learning_rate)

# Data cleaning

## Load Data

In [13]:
# Load Data

# train data
train = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))

# test data
test = pd.read_csv(os.path.join(DATA_PATH, "test_new.csv"))

# supplement data
traffic = pd.read_csv(os.path.join(DATA_PATH, "traffic.csv"))
repair = pd.read_csv(os.path.join(DATA_PATH, "repair.csv"))
crash_parts = pd.read_csv(
    os.path.join(DATA_PATH, "crash_parts.csv"), usecols=range(1, 12)
)
atmos = pd.read_csv(os.path.join(DATA_PATH, "atmos.csv"), usecols=range(1, 15))
meteo = pd.read_csv(
    os.path.join(DATA_PATH, "meteo.csv"), usecols=range(1, 38), low_memory=False
)
tele2_data = pd.read_csv(os.path.join(DATA_PATH, "tele2_data.csv"))
geo_data = pd.read_csv(os.path.join(DATA_PATH, "geo_data.csv"))

## Convert DateTime, Extract KM from str features

In [None]:
train = convertDate(
    train,
    ["datetime"],
    ["dt"],
    suffix=["year", "month", "day", "hour"],
)
train.drop(columns=["datetime"], inplace=True)

traffic = convertDate(traffic, ["datetime"], ["dt"], suffix=["date"])
traffic.dt_date = traffic.dt_date + dt.timedelta(hours=1)
traffic = convertDate(
    traffic, ["datetime"], ["dt"], suffix=["year", "month", "day", "hour"]
)
traffic["dt_year"] = traffic.dt_date.dt.year
traffic["dt_month"] = traffic.dt_date.dt.month
traffic["dt_day"] = traffic.dt_date.dt.day
traffic["dt_hour"] = traffic.dt_date.dt.hour
traffic.drop(columns=["datetime", "dt_date"], inplace=True)
tmp = pd.DataFrame(traffic.name.unique())[0].str.extract("(.*?(\d+)(?:.*?(\d+))*.*)")
tmp[[1, 2]] = tmp[[1, 2]].astype(int)
tmp["name_km"] = tmp[1] + tmp[2] / 1000
tmp.drop(columns=[1, 2], inplace=True)
tmp.columns = ["name", "name_km"]

repair = convertDate(repair, ["datetime"], ["dt"], form="%Y-%m-%d", suffix=["year"])
tmp = repair.repair_period.str.extract("(\d+)(?:.*?(\d+))*")
tmp = tmp.fillna(method="ffill", axis=0)
tmp.columns = ["period_st", "period_end"]
tmp = tmp.astype(int)
repair = repair.join(tmp)
repair.drop(columns=["datetime", "repair_period"], inplace=True)

kms = []
tmp = pd.DataFrame(repair.repair_description.unique())[0].str.extract(
    "((?:.*?(?:(\d+)\+(\d+)))(?:.*?(?:(\d+)\+(\d+)))(?:.*?(?:(\d+)\+(\d+)))(?:.*?(?:(\d+)\+(\d+).*?)).*)"
)
tmp.dropna(inplace=True)
kms.append(tmp)

tmp = pd.DataFrame(repair.repair_description.unique())[0].str.extract(
    "((?:.*?(?:(\d+)\+(\d+)))(?:.*?(?:(\d+)\+(\d+)))(?:.*?(?:(\d+)\+(\d+))).*)"
)
tmp.dropna(inplace=True)
kms.append(tmp)

tmp = pd.DataFrame(repair.repair_description.unique())[0].str.extract(
    "((?:.*?(?:(\d+)\+(\d+)))(?:.*?(?:(\d+)\+(\d+))).*)"
)
tmp.dropna(inplace=True)
kms.append(tmp)

tmp = pd.DataFrame(repair.repair_description.unique())[0].str.extract(
    "((?:.*?(?:(\d+)\+(\d+))).*)"
)
tmp.dropna(inplace=True)
kms.append(tmp)

kms = pd.concat(kms)
kms.drop_duplicates(subset=[0], keep="first", inplace=True)
kms.reset_index(drop=True, inplace=True)

kms.fillna("-1", inplace=True)
kms[list(range(1, 9))] = kms[list(range(1, 9))].astype(int)

for i in range(4):
    kms["kmgr_" + str(i)] = kms[i * 2 + 1] + kms[i * 2 + 2] / 1000

kms.drop(columns=range(1, 9), inplace=True)
repair = repair.merge(kms, left_on="repair_description", right_on=0)
repair.drop(columns=[0])

crash_parts = convertDate(
    crash_parts, ["datetime"], ["dt", "plan"], form="%Y-%m-%d", suffix=["year"]
)
crash_parts = convertDate(
    crash_parts,
    ["planactiv_year"],
    ["plan"],
    form="%Y-%m-%d",
    suffix=["date"],
)
crash_parts.plan_date = crash_parts.plan_date + dt.timedelta(days=1)
crash_parts["plan_year"] = crash_parts.plan_date.dt.year
crash_parts.drop(columns=["datetime", "planactiv_year", "plan_date"], inplace=True)
# Нужны tfidf из crash_parts.planactiv_descr

atmos = convertDate(
    atmos,
    ["start_date", "end_date"],
    ["start", "end"],
    form="%Y-%m-%d",
    suffix=["year", "month", "day"],
)
atmos.start_ts = atmos.start_ts.fillna("0:00")
atmos.end_ts = atmos.end_ts.fillna("0:00")
atmos = convertDate(
    atmos,
    ["start_ts", "end_ts"],
    ["start", "end"],
    form="%H:%M",
    suffix=["hour", "minute"],
)
atmos.drop(columns=["start_date", "start_ts", "end_date", "end_ts"], inplace=True)
# Нужно ordered label на phenomenon / intensity / *_q

meteo = convertDate(
    meteo, ["measure_dt"], ["dt"], suffix=["year", "month", "day", "hour"]
)
meteo.drop(columns=["measure_dt"], inplace=True)
# Нужно ordered label на phenomenon / intensity / *_q
# ?Нужно? преобразовать visib в км

tele2_data = convertDate(
    tele2_data, ["datetime"], ["dt"], suffix=["year", "month", "day", "hour"]
)
tele2_data.drop(columns=["datetime"], inplace=True)




## Fix coordinates on road

In [None]:
# Многие участи дорог содержат некорректные координаты: nan / <25 / >100 и некорректные км

road_cols = ["road_id", "road_km", "lon", "lat"]
road = train[road_cols].copy()
road.drop_duplicates(inplace=True)
while road[(road.lat > 100) | (road.lon > 100)].shape[0] > 0:
    road.loc[road.lon > 100, "lon"] = road.lon[road.lon > 100] / 10
    road.loc[road.lat > 100, "lat"] = road.lat[road.lat > 100] / 10
road.sort_values(["road_id", "road_km", "lon"], inplace=True)

road.dropna(inplace=True)
road = road[(road.lat > 25) & (road.lon > 25)]
road = road[road.road_id != 5]
km_map = test.groupby("road_id").agg({"road_km": ("min", "max")})
km9 = km_map.loc[9, ("road_km", "max")] + 1
km14 = km_map.loc[14, ("road_km", "max")] + 1

road9 = zip(np.ones(km9, dtype=int) * 9, range(km_map.loc[9, ("road_km", "max")] + 1))
road14 = zip(
    np.ones(km14, dtype=int) * 14, range(km_map.loc[14, ("road_km", "max")] + 1)
)

# road_grid
road_grid = pd.DataFrame(list(road9) + list(road14))
road_grid.columns = ["road_id", "road_km"]

#get median coords for known kms
median_latlon = (
    road.groupby(["road_id", "road_km"])
    .agg({"lat": "median", "lon": "median"})
    .reset_index()
)

# interpolate unknown kms that are present in test set
road_grid = road_grid.merge(median_latlon, on=["road_id", "road_km"], how="left")
road_grid = road_grid.interpolate()

## Sort and pkl for future use

In [None]:
train.sort_values(
    ["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True
)

traffic.sort_values(
    ["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True
)
repair.sort_values(["road_id", "road_km", "period_st", "period_end"], inplace=True)
crash_parts.sort_values(["road_id", "road_km", "dt_year", "plan_year"], inplace=True)
# atmos
# meteo
tele2_data.sort_values(
    ["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True
)

test.sort_values(
    ["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True
)

In [64]:
train = downcast_dtypes(train)
train.to_pickle(os.path.join(DATA_PATH, "train.pkl"))

traffic = downcast_dtypes(traffic)
traffic.to_pickle(os.path.join(DATA_PATH, "traffic.pkl"))

repair.to_pickle(os.path.join(DATA_PATH, "repair.pkl"))
crash_parts.to_pickle(os.path.join(DATA_PATH, "crash_parts.pkl"))
atmos.to_pickle(os.path.join(DATA_PATH, "atmos.pkl"))
meteo.to_pickle(os.path.join(DATA_PATH, "meteo.pkl"))
tele2_data.to_pickle(os.path.join(DATA_PATH, "tele2_data.pkl"))
road_grid.to_pickle(os.path.join(DATA_PATH, "road_grid.pkl"))

test.to_pickle(os.path.join(DATA_PATH, "test.pkl"))

# Train grid

## Lets try to fit everything in memory

In [4]:
tmp = pd.DataFrame(
    pd.date_range(start="2012-01-01 01:00", end="2020-10-25 23:00", freq="h"), # changed after test_new
    columns=["datetime"],
)

tmp.datetime = tmp.datetime.astype(str)

time_grid = convertDate(
    tmp,
    ["datetime"],
    ["dt"],
    suffix=["year", "month", "day", "hour", "dotw", "workDay", "week"],
)
time_grid.drop(columns='datetime', inplace=True)

In [5]:
road_grid = pd.read_pickle(os.path.join(DATA_PATH, "road_grid.pkl"))
grid = df_crossjoin(time_grid, road_grid) # fits with swap
grid = downcast_dtypes(grid)
grid.to_pickle(os.path.join(DATA_PATH, "grid.pkl"))

## Merge with train data

In [9]:
train = pd.read_pickle(os.path.join(DATA_PATH, "train.pkl"))
# traffic = pd.read_pickle(os.path.join(DATA_PATH, "traffic.pkl"))
# repair = pd.read_pickle(os.path.join(DATA_PATH, "repair.pkl"))
# crash_parts = pd.read_pickle(os.path.join(DATA_PATH, "crash_parts.pkl"))
# atmos = pd.read_pickle(os.path.join(DATA_PATH, "atmos.pkl"))
# meteo = pd.read_pickle(os.path.join(DATA_PATH, "meteo.pkl"))
# tele2_data = pd.read_pickle(os.path.join(DATA_PATH, "tele2_data.pkl"))
# test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

grid = pd.read_pickle(os.path.join(DATA_PATH, "grid.pkl"))

In [10]:
train.drop(columns=['lon', 'lat', 'road_name'], inplace=True)

#Label Encoding
labelencoder = LabelEncoder()
train['data_source'] = labelencoder.fit_transform(train['data_source'])

In [11]:
grid = pd.merge(
    grid,
    train,
    how='left',
    on=[
        "dt_year",
        "dt_month",
        "dt_day",
        "dt_hour",
        "road_id",
        "road_km",
    ],
)

In [12]:
grid = downcast_dtypes(grid)
grid.to_pickle(os.path.join(DATA_PATH, "grid.pkl"))

## Merge with trafic data

In [16]:
# train = pd.read_pickle(os.path.join(DATA_PATH, "train.pkl"))
traffic = pd.read_pickle(os.path.join(DATA_PATH, "traffic.pkl"))
# repair = pd.read_pickle(os.path.join(DATA_PATH, "repair.pkl"))
# crash_parts = pd.read_pickle(os.path.join(DATA_PATH, "crash_parts.pkl"))
# atmos = pd.read_pickle(os.path.join(DATA_PATH, "atmos.pkl"))
# meteo = pd.read_pickle(os.path.join(DATA_PATH, "meteo.pkl"))
# tele2_data = pd.read_pickle(os.path.join(DATA_PATH, "tele2_data.pkl"))
# test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

grid = pd.read_pickle(os.path.join(DATA_PATH, "grid.pkl"))

In [17]:
traffic.drop(columns=['longitude', 'latitude', 'data_id', 'station_id', 'name'], inplace=True)

#Label Encoding
labelencoder = LabelEncoder()
traffic.direction = labelencoder.fit_transform(traffic.direction)

In [18]:
grid = pd.merge(
    grid,
    traffic,
    how='left',
    on=[
        "dt_year",
        "dt_month",
        "dt_day",
        "dt_hour",
        "road_id",
        "road_km",
    ], 
    copy=False
)

In [19]:
grid = downcast_dtypes(grid)
grid.to_pickle(os.path.join(DATA_PATH, "grid.pkl"))

## fillna and downcast

In [25]:
# train = pd.read_pickle(os.path.join(DATA_PATH, "train.pkl"))
# traffic = pd.read_pickle(os.path.join(DATA_PATH, "traffic.pkl"))
# repair = pd.read_pickle(os.path.join(DATA_PATH, "repair.pkl"))
# crash_parts = pd.read_pickle(os.path.join(DATA_PATH, "crash_parts.pkl"))
# atmos = pd.read_pickle(os.path.join(DATA_PATH, "atmos.pkl"))
# meteo = pd.read_pickle(os.path.join(DATA_PATH, "meteo.pkl"))
# tele2_data = pd.read_pickle(os.path.join(DATA_PATH, "tele2_data.pkl"))
# test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

grid = pd.read_pickle(os.path.join(DATA_PATH, "grid.pkl"))

In [26]:
grid.target = grid.target.fillna(0).astype(np.int16)
grid.man_injured_num = grid.man_injured_num.fillna(0).astype(np.int16) # other value for na?
grid.man_dead_num = grid.man_dead_num.fillna(0).astype(np.int16) # other value for na?

# grid.data_source = grid.data_source.fillna(-1).astype(np.int16) # other value for na?
grid.drop(columns=['data_source'], inplace=True)

grid.lane_count.interpolate(limit_area='inside', inplace=True)
grid.lane_count = grid.lane_count.fillna(method='bfill').fillna(method='ffill').astype(np.int16)

grid.direction = grid.direction.fillna(0.5)#.astype(np.int16) # other value for na?

grid.car_damaged_num = grid.car_damaged_num.fillna(0).astype(np.int16) # other value for na?

grid.volume.interpolate(limit_area='inside', inplace=True)
grid.volume = grid.volume.fillna(method='bfill').fillna(method='ffill').astype(np.int16)

grid.occupancy.interpolate(limit_area='inside', inplace=True)
grid.occupancy = grid.occupancy.fillna(method='bfill').fillna(method='ffill').astype(np.int16)

grid.speed.interpolate(limit_area='inside', inplace=True)
grid.speed = grid.speed.fillna(method='bfill').fillna(method='ffill').astype(np.int16)

grid.lane.interpolate(limit_area='inside', inplace=True)
grid.lane = grid.lane.fillna(method='bfill').fillna(method='ffill').astype(np.int16)

grid.to_pickle(os.path.join(DATA_PATH, "grid_.pkl"))

## Lag features

In [4]:
# train = pd.read_pickle(os.path.join(DATA_PATH, "train.pkl"))
# traffic = pd.read_pickle(os.path.join(DATA_PATH, "traffic.pkl"))
# repair = pd.read_pickle(os.path.join(DATA_PATH, "repair.pkl"))
# crash_parts = pd.read_pickle(os.path.join(DATA_PATH, "crash_parts.pkl"))
# atmos = pd.read_pickle(os.path.join(DATA_PATH, "atmos.pkl"))
# meteo = pd.read_pickle(os.path.join(DATA_PATH, "meteo.pkl"))
# tele2_data = pd.read_pickle(os.path.join(DATA_PATH, "tele2_data.pkl"))
# test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

grid = pd.read_pickle(os.path.join(DATA_PATH, "grid_.pkl"))

In [20]:
grid.columns

Index(['dt_year', 'dt_month', 'dt_day', 'dt_hour', 'dt_dotw', 'dt_workDay',
       'dt_week', 'road_id', 'road_km', 'lat', 'lon', 'man_injured_num',
       'man_dead_num', 'car_damaged_num', 'target', 'lane', 'lane_count',
       'direction', 'volume', 'occupancy', 'speed'],
      dtype='object')

In [5]:
grid.sort_values(["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True)

In [6]:
grid.to_pickle(os.path.join(DATA_PATH, "grid_pre_lag.pkl"))

In [10]:
grid = pd.read_pickle(os.path.join(DATA_PATH, "grid_pre_lag.pkl"))

In [11]:
encode_columns = [
    "man_injured_num",
    "man_dead_num",
    "car_damaged_num",
    "target",
    "lane",
    "direction",
    "volume",
    "occupancy",
    "speed",
]

In [12]:
for r in [9, 14]:
    grid.loc[grid.road_id == r, encode_columns] = grid.loc[
        grid.road_id == r, encode_columns
    ].shift(1)

In [14]:
grid = downcast_dtypes(grid)

In [15]:
grid.to_pickle(os.path.join(DATA_PATH, "grid_lag.pkl"))

In [16]:
grid.dropna(inplace=True)

In [17]:
grid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 222461297 entries, 2819 to 222461298
Data columns (total 21 columns):
 #   Column           Dtype  
---  ------           -----  
 0   dt_year          int16  
 1   dt_month         int16  
 2   dt_day           int16  
 3   dt_hour          int16  
 4   dt_dotw          int16  
 5   dt_workDay       int16  
 6   dt_week          int16  
 7   road_id          int16  
 8   road_km          int16  
 9   lat              float32
 10  lon              float32
 11  man_injured_num  float32
 12  man_dead_num     float32
 13  car_damaged_num  float32
 14  target           float32
 15  lane             float32
 16  lane_count       int16  
 17  direction        float32
 18  volume           float32
 19  occupancy        float32
 20  speed            float32
dtypes: float32(11), int16(10)
memory usage: 14.9 GB


In [18]:
grid.man_injured_num = grid.man_injured_num.astype(np.int16)
grid.man_dead_num = grid.man_dead_num.astype(np.int16)
grid.car_damaged_num = grid.car_damaged_num.astype(np.int16)
grid.volume = grid.volume.astype(np.int16)
grid.occupancy = grid.occupancy.astype(np.int16)
grid.speed = grid.speed.astype(np.int16)
grid.lane = grid.lane.astype(np.int16)

In [19]:
grid.to_pickle(os.path.join(DATA_PATH, "grid_lag.pkl"))

# Baseline

## Count weights

In [82]:
# train = pd.read_pickle(os.path.join(DATA_PATH, "train.pkl"))
# traffic = pd.read_pickle(os.path.join(DATA_PATH, "traffic.pkl"))
# repair = pd.read_pickle(os.path.join(DATA_PATH, "repair.pkl"))
# crash_parts = pd.read_pickle(os.path.join(DATA_PATH, "crash_parts.pkl"))
# atmos = pd.read_pickle(os.path.join(DATA_PATH, "atmos.pkl"))
# meteo = pd.read_pickle(os.path.join(DATA_PATH, "meteo.pkl"))
# tele2_data = pd.read_pickle(os.path.join(DATA_PATH, "tele2_data.pkl"))
# test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

grid = pd.read_pickle(os.path.join(DATA_PATH, "grid_.pkl"))

In [83]:
grid = grid[(grid.dt_year >= 2020)]

In [88]:
gb = grid.groupby(by=['dt_year', 'target'])

In [89]:
gb.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,dt_month,dt_day,dt_hour,dt_dotw,dt_workDay,dt_week,road_id,road_km,lat,lon,man_injured_num,man_dead_num,car_damaged_num,lane,lane_count,direction,volume,occupancy,speed
dt_year,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020,0,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931,20862931
2020,1,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430,2430
2020,2,503,503,503,503,503,503,503,503,503,503,503,503,503,503,503,503,503,503,503


## LGB

### prepare

In [4]:
# train = pd.read_pickle(os.path.join(DATA_PATH, "train.pkl"))
# traffic = pd.read_pickle(os.path.join(DATA_PATH, "traffic.pkl"))
# repair = pd.read_pickle(os.path.join(DATA_PATH, "repair.pkl"))
# crash_parts = pd.read_pickle(os.path.join(DATA_PATH, "crash_parts.pkl"))
# atmos = pd.read_pickle(os.path.join(DATA_PATH, "atmos.pkl"))
# meteo = pd.read_pickle(os.path.join(DATA_PATH, "meteo.pkl"))
# tele2_data = pd.read_pickle(os.path.join(DATA_PATH, "tele2_data.pkl"))
# test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

grid = pd.read_pickle(os.path.join(DATA_PATH, "grid_lag.pkl"))

In [5]:
grid = grid[(grid.dt_year >= 2015) & (grid.dt_year < 2020)]

In [6]:
y_train = grid.target
x_train = grid.drop(columns=['target'])
%reset_selective -f grid

y_train.to_pickle(os.path.join(DATA_PATH, "y_train.pkl"))
%reset_selective -f y_train
x_train.to_pickle(os.path.join(DATA_PATH, "x_train.pkl"))
%reset_selective -f x_train

### train

In [4]:
y_train = pd.read_pickle(os.path.join(DATA_PATH, "y_train.pkl"))
x_train = pd.read_pickle(os.path.join(DATA_PATH, "x_train.pkl"))
lgb_train = lgb.Dataset(x_train, y_train)

y_val = pd.read_pickle(os.path.join(DATA_PATH, "y_test.pkl"))
x_val = pd.read_pickle(os.path.join(DATA_PATH, "x_test.pkl"))
lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)

In [5]:
# params = {
#     "boosting_type": "gbdt",
#     "objective": "binary",
#     "metric": "binary_logloss",
#     "num_leaves": 31,
#     "learning_rate": 0.05,
#     "feature_fraction": 0.9,
#     "bagging_fraction": 0.8,
#     "bagging_freq": 5,
#     "verbose": 0,
# }

fit_params = {
    #     "max_depth": -1,
    "objective": "multiclass",
    "metric": "None",
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "random_state": 42,
    "n_jobs": 8,
    "n_estimators": 10,
    "num_class": 4
    #     "class_weight": "balanced",
}

In [None]:
gbm = lgb.train(
    fit_params,
    lgb_train,
    num_boost_round=10,
#     fobj=lgb_f1_score,
    feval=lgb_f1_score,
    valid_sets=lgb_eval
)

In [None]:
joblib.dump(gbm, os.path.join(MODEL_PATH, 'gbm_train_v2.pkl'))

In [25]:
print('Feature names:', gbm.feature_name())

Feature names: ['dt_year', 'dt_month', 'dt_day', 'dt_hour', 'dt_dotw', 'dt_workDay', 'dt_week', 'road_id', 'road_km', 'lat', 'lon', 'man_injured_num', 'man_dead_num', 'car_damaged_num', 'lane', 'lane_count', 'direction', 'volume', 'occupancy', 'speed']


In [26]:
print('Feature importances:', list(gbm.feature_importance()))

Feature importances: [59, 68, 100, 151, 50, 16, 116, 3, 116, 119, 65, 17, 5, 34, 4, 9, 7, 110, 45, 73]


In [32]:
y_val_pred = gbm.predict(x_val)

In [33]:
f1_score(y_val, pd.DataFrame(y_val_pred).idxmax(axis=1), average='macro')

0.6666628400218885

In [29]:
f1_score(y_val, pd.DataFrame(y_val_pred).idxmax(axis=1), average='macro')

0.44991296120408475

In [9]:
clf = lgb.LGBMClassifier(
    max_depth=-1,
    #     learning_rate=0.05,
    objective="multiclass",
    random_state=42,
    silent=False,
    metric="None",
    n_jobs=4,
    n_estimators=10,
    class_weight="balanced",
)

In [11]:
clf.fit(
    x_train,
    y_train,
    # early_stopping_rounds=300,
    eval_metric=evaluate_macroF1_lgb,
    #     eval_set=[(X_test, y_test)]
    # eval_names=["train", "early_stop"],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1448
[LightGBM] [Info] Number of data points in the train set: 127447278, number of used features: 20
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


LGBMClassifier(class_weight='balanced', metric='None', n_estimators=10,
               n_jobs=4, objective='multiclass', random_state=42, silent=False,
               verbose=1)

In [12]:
joblib.dump(clf, os.path.join(DATA_PATH, 'lgb_lag_early_stop.pkl'))

['../data/3_track/lgb_lag.pkl']

In [14]:
clf = joblib.load(os.path.join(DATA_PATH, 'lgb_lag.pkl'))

In [48]:
from sklearn.metrics import f1_score

In [115]:
# train = pd.read_pickle(os.path.join(DATA_PATH, "train.pkl"))
# traffic = pd.read_pickle(os.path.join(DATA_PATH, "traffic.pkl"))
# repair = pd.read_pickle(os.path.join(DATA_PATH, "repair.pkl"))
# crash_parts = pd.read_pickle(os.path.join(DATA_PATH, "crash_parts.pkl"))
# atmos = pd.read_pickle(os.path.join(DATA_PATH, "atmos.pkl"))
# meteo = pd.read_pickle(os.path.join(DATA_PATH, "meteo.pkl"))
# tele2_data = pd.read_pickle(os.path.join(DATA_PATH, "tele2_data.pkl"))
# test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

grid = pd.read_pickle(os.path.join(DATA_PATH, "grid_lag.pkl"))

In [116]:
grid = grid[(grid.dt_year >= 2020)]

In [117]:
y_val = grid.target
x_val = grid.drop(columns=['target'])
%reset_selective -f grid

y_val.to_pickle(os.path.join(DATA_PATH, "y_val.pkl"))
%reset_selective -f y_val
x_val.to_pickle(os.path.join(DATA_PATH, "x_val.pkl"))
%reset_selective -f x_val

In [118]:
y_val = pd.read_pickle(os.path.join(DATA_PATH, "y_val.pkl"))
x_val = pd.read_pickle(os.path.join(DATA_PATH, "x_val.pkl"))

In [119]:
y_val_pred = clf.predict(x_val)

In [121]:
# с лагами + 10 predictors
f1_score(y_val, y_val_pred, average='macro')

0.7487980292905455

In [56]:
f1_score(y_val, np.zeros_like(y_val), average='macro')

0.33330989627929336

### Test 1

In [34]:
# test data
test = pd.read_csv(os.path.join(DATA_PATH, "test_new.csv"))
test = convertDate(
    test, ["datetime"], ["dt"], suffix=["year", "month", "day", "hour"]
)
test.drop(columns=["datetime"], inplace=True)
test.sort_values(
    ["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True
)
test.drop(columns=['target'], inplace=True)
test.to_pickle(os.path.join(DATA_PATH, "test.pkl"))


In [5]:
test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

In [None]:
grid = pd.read_pickle(os.path.join(DATA_PATH, "grid_lag.pkl"))

In [None]:
grid.head()

In [35]:
test = test.merge(
    grid,
    how="left",
    on=["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"],
)

In [None]:
test.to_pickle(os.path.join(DATA_PATH, "grid_test.pkl"))

In [40]:
%reset_selective -f grid

In [5]:
test = pd.read_pickle(os.path.join(DATA_PATH, "grid_test.pkl"))

In [6]:
y_val = test.target
x_val = test.drop(columns=['target'])
%reset_selective -f test

y_val.to_pickle(os.path.join(DATA_PATH, "y_test.pkl"))
%reset_selective -f y_val
x_val.to_pickle(os.path.join(DATA_PATH, "x_test.pkl"))
%reset_selective -f x_val

In [7]:
y_val = pd.read_pickle(os.path.join(DATA_PATH, "y_test.pkl"))
x_val = pd.read_pickle(os.path.join(DATA_PATH, "x_test.pkl"))

In [8]:
gbm = joblib.load(os.path.join(DATA_PATH, 'gbm_train.pkl'))

In [32]:
y_val_pred = gbm.predict(x_val)

In [15]:
pd.DataFrame(y_val_pred).idxmax(axis=1)

array([0, 1, 2])

In [10]:
f1_score(y_val, pd.DataFrame(y_val_pred).idxmax(axis=1), average='macro')

0.6666628400218885

In [20]:
test = pd.read_csv(os.path.join(DATA_PATH, "test_new.csv"))
test = convertDate(
    test, ["datetime"], ["dt"], suffix=["year", "month", "day", "hour"]
)

In [22]:
x_val['target'] = pd.DataFrame(y_val_pred).idxmax(axis=1)

In [21]:
test.drop(columns='target', inplace=True)

In [26]:
test = test.merge(
    x_val[["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour", "target"]],
    how="left",
    on=["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"],
)

In [28]:
test.drop(columns=["dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True)

In [31]:
test.to_csv(os.path.join(SUBMIT_PATH, "test.csv"))

In [35]:
from check_submit import is_valid_csv

In [None]:
is_valid_csv()

### Test

In [30]:
# test data
test = pd.read_csv(os.path.join(DATA_PATH, "test_new.csv"))
test = convertDate(
    test, ["datetime"], ["dt"], suffix=["year", "month", "day", "hour"]
)
test.drop(columns=["datetime"], inplace=True)
test.sort_values(
    ["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True
)
test.drop(columns=['target'], inplace=True)
test.to_pickle(os.path.join(DATA_PATH, "test.pkl"))


In [31]:
test.head()

Unnamed: 0,road_id,road_km,dt_year,dt_month,dt_day,dt_hour
0,9,0,2020,1,1,1
2776,9,0,2020,1,1,2
5552,9,0,2020,1,1,3
8328,9,0,2020,1,1,4
11104,9,0,2020,1,1,5


In [32]:
test = pd.read_pickle(os.path.join(DATA_PATH, "test.pkl"))

In [33]:
grid = pd.read_pickle(os.path.join(DATA_PATH, "grid_lag.pkl"))

In [34]:
grid.head()

Unnamed: 0,dt_year,dt_month,dt_day,dt_hour,dt_dotw,dt_workDay,dt_week,road_id,road_km,lat,lon,man_injured_num,man_dead_num,car_damaged_num,target,lane,lane_count,direction,volume,occupancy,speed
2819,2012,1,1,2,7,0,52,9,0,57.55,39.79,0,0,0,0.0,1,1,0.5,39,1,73
5638,2012,1,1,3,7,0,52,9,0,57.55,39.79,0,0,0,0.0,1,1,0.5,39,1,73
8457,2012,1,1,4,7,0,52,9,0,57.55,39.79,0,0,0,0.0,1,1,0.5,39,1,73
11276,2012,1,1,5,7,0,52,9,0,57.55,39.79,0,0,0,0.0,1,1,0.5,39,1,73
14095,2012,1,1,6,7,0,52,9,0,57.55,39.79,0,0,0,0.0,1,1,0.5,39,1,73


In [35]:
test = test.merge(
    grid,
    how="left",
    on=["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"],
)

In [36]:
test.to_pickle(os.path.join(DATA_PATH, "grid_test.pkl"))

In [40]:
%reset_selective -f grid

In [41]:
test = pd.read_pickle(os.path.join(DATA_PATH, "grid_test.pkl"))

In [42]:
y_val = test.target
x_val = test.drop(columns=['target'])
%reset_selective -f test

y_val.to_pickle(os.path.join(DATA_PATH, "y_test.pkl"))
%reset_selective -f y_val
x_val.to_pickle(os.path.join(DATA_PATH, "x_test.pkl"))
%reset_selective -f x_val

In [43]:
y_val = pd.read_pickle(os.path.join(DATA_PATH, "y_test.pkl"))
x_val = pd.read_pickle(os.path.join(DATA_PATH, "x_test.pkl"))

In [44]:
clf = joblib.load(os.path.join(DATA_PATH, 'lgb_lag.pkl'))

In [45]:
y_val_pred = clf.predict(x_val)

In [46]:
f1_score(y_val, y_val_pred, average='macro')

0.49396271966192157

In [97]:
x_val = x_val[["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"]]

In [98]:
x_val['target'] = y_val

In [99]:
x_val.target = x_val.target.astype(np.int16)

In [100]:
x_val.dropna(inplace=True)

In [101]:
test = pd.read_csv(os.path.join(DATA_PATH, "test_new.csv"))
test = convertDate(
    test, ["datetime"], ["dt"], suffix=["year", "month", "day", "hour"]
)

In [102]:
test.drop(columns='target', inplace=True)

In [103]:
test = x_val.merge(
    test,
    how="right",
    on=["road_id", "road_km", "dt_year", "dt_month", "dt_day", "dt_hour"],
)

In [104]:
test.drop(columns=["dt_year", "dt_month", "dt_day", "dt_hour"], inplace=True)

In [105]:
test = test[['datetime', 'road_id', 'road_km', 'target']]

In [106]:
test = test[test.target != 0]

In [107]:
test.head()

Unnamed: 0,datetime,road_id,road_km,target
2701,2020-01-01 01:00:00,14,1406,1
821919,2020-07-13 00:00:00,14,19,1
1302182,2020-10-19 00:00:00,14,27,1


In [92]:
test

Unnamed: 0,datetime,road_id,road_km,target


In [91]:
test.to_csv(os.path.join(SUBMIT_PATH, "test.csv"))