In [None]:
cd "/content/drive/My Drive/Colab Notebooks/DataWorkshop/DSmasterclass_Oct2021/notebooks"

/content/drive/My Drive/Colab Notebooks/DataWorkshop/DSmasterclass_Oct2021/notebooks


In [None]:
!pip install catboost
!pip install eli5



In [None]:
import pandas as pd
import numpy as np
np.random.seed(0)
from sklearn.model_selection import cross_val_score, KFold
import lightgbm as lgbm

import catboost as cb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
import eli5
from tqdm import tqdm
import re



## Wczytywanie danych

In [None]:
# Load the training set and the test set from the shared input folder.
df_train = pd.read_csv('../input/tram.train.csv')
df_test = pd.read_csv('../input/tram.test.csv') # test set without answers (no target values)

## Połączenie danych

Łączymy `df_train` i `df_test` i zapisujemy wynik do `df`.

In [None]:
# Stack train and test rows into one frame so features are built once for both.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of df_train and df_test overlap, leaving duplicate index labels.
df = pd.concat([df_train, df_test], ignore_index=True)
df.shape

(308152, 11)

In [None]:
# Display the combined frame (pandas shows a truncated head/tail preview).
df

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num
0,0,0.0,2018-07-23 06:00:47,612,Borsucza,22,Walcownia,2018-07-23 06:00:00,6.352185e+18,6351558574044899587,7.0
1,1,0.0,2018-07-23 06:00:48,572,Smolki,11,Czerwone Maki P+R,2018-07-23 06:00:00,6.352185e+18,6351558574044670211,10.0
2,2,0.0,2018-07-23 06:00:49,322,Filharmonia,8,Bronowice Małe,2018-07-23 06:01:00,6.352185e+18,6351558574044592386,15.0
3,3,0.0,2018-07-23 06:00:51,363,Hala Targowa,1,Salwator,2018-07-23 06:01:00,6.352185e+18,6351558574044379394,24.0
4,4,0.0,2018-07-23 06:00:52,78,Batorego,24,Bronowice Małe,2018-07-23 06:00:00,6.352185e+18,6351558574044948738,19.0
...,...,...,...,...,...,...,...,...,...,...,...
132161,308141,,,89,Bronowice,14,Bronowice Małe,2018-07-31 23:55:00,6.352185e+18,6351558574044741905,33.0
132162,308142,,,2690,Kampus UJ,18,Czerwone Maki P+R,2018-07-31 23:57:00,6.352185e+18,6351558574044791061,24.0
132163,308143,,,133,Wesele,14,Bronowice Małe,2018-07-31 23:56:00,6.352185e+18,6351558574044741905,34.0
132164,308144,,,630,Bieżanowska,24,Kurdwanów P+R,2018-07-31 23:56:00,6.352185e+18,6351558574044950804,23.0


##  🕵️‍♂️ Sprawdźmy dane

In [None]:
# Look at one randomly chosen training row.
df_train.sample()

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num
109539,198055,0,2018-07-27 13:14:18,136,Bronowice Wiadukt,24,Bronowice Małe,2018-07-27 13:14:00,6.352185e+18,6351558574046752009,27.0


## Tworzymy cechy  (`feature engineering`)

In [None]:
# Parse the planned departure time and derive the hour of day from it.
df["planned_time"] = pd.to_datetime(df["planned_time"])
df["planned_time_hour"] = df["planned_time"].dt.hour

# Integer-encode the text columns: factorize() assigns one code per distinct label.
df["stop_name_cat"] = df["stop_name"].factorize()[0]
df["direction_cat"] = df["direction"].factorize()[0]

# Additional features
df["planned_time_minute"] = df["planned_time"].dt.minute
df["weekday"] = df["planned_time"].dt.dayofweek  # day of the week (Monday=0 ... Sunday=6)

# Weekend flag: 1 when weekday > 4, else 0.
# A vectorized comparison replaces the row-wise apply(lambda ...) with the same result.
df["weekend"] = (df["weekday"] > 4).astype(int)

# rush_hours flag: 1 when the planned hour is between 5 and 10 inclusive, else 0.
df["rush_hours"] = ((df["planned_time_hour"] >= 5) & (df["planned_time_hour"] <= 10)).astype(int)

### Przystanki

In [None]:
def df_group_delay(df_train, groupby_feats):
    """Compute per-group statistics of the ``delay`` column.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame that contains a ``delay`` column and every column named in
        ``groupby_feats``.
    groupby_feats : str or sequence of str
        Column name(s) to group by. A single string, a list or a tuple is
        accepted.

    Returns
    -------
    pandas.DataFrame
        One row per group. The grouping columns come first, followed by
        ``mean_``, ``median_``, ``count_``, ``std_``, ``count_zeros_`` and
        ``prob_zeros_`` columns named ``<stat>_<feats>_delay``, where
        ``<feats>`` is the grouping column names joined with ``_``.
    """
    # Normalize the grouping spec to a list so it can be concatenated below.
    if isinstance(groupby_feats, str):
        groupby_feats = [groupby_feats]
    groupby_feats = list(groupby_feats)

    # Shared middle part of every generated column name.
    suffix = "_".join(groupby_feats)

    agg_params = {
        "mean_{}_delay".format(suffix): ("delay", "mean"),
        "median_{}_delay".format(suffix): ("delay", "median"),
        "count_{}_delay".format(suffix): ("delay", "count"),
        "std_{}_delay".format(suffix): ("delay", "std"),
        # Number of rows in the group whose delay equals exactly 0 (vectorized).
        "count_zeros_{}_delay".format(suffix): ("delay", lambda vals: (vals == 0).sum()),
        # Share of rows in the group whose delay equals exactly 0 (vectorized).
        "prob_zeros_{}_delay".format(suffix): ("delay", lambda vals: (vals == 0).mean()),
    }

    grouped = df_train[groupby_feats + ["delay"]].groupby(groupby_feats)
    return grouped.agg(**agg_params).reset_index()

Teraz użyjmy `df_group_delay` i to będzie to, co osiągnęliśmy ostatnio.

In [None]:
# Delay statistics per stop, computed from the training rows.
df_tmp = df_group_delay(df_train, ["stop_name"])
# Merge only once. The guard has to use the exact generated column name
# ("mean_stop_name_delay"); with any other name it never matches, and re-running
# the cell would merge again and produce duplicated _x/_y columns.
if "mean_stop_name_delay" not in df.columns:
    df = pd.merge(df, df_tmp, on="stop_name", how="left")

Teraz dodajmy jeszcze kierunek jazdy (czyli `direction`). Ten sam przystanek, ale możesz jechać w różne strony, a więc będą to różne przypadki.

In [None]:
# Delay statistics per (stop, direction) pair, computed from the training rows.
df_tmp = df_group_delay(df_train, ["stop_name", "direction"])
# Merge only once. The guard has to use the exact generated column name
# ("mean_stop_name_direction_delay"); with any other name it never matches, and
# re-running the cell would merge again and produce duplicated _x/_y columns.
if "mean_stop_name_direction_delay" not in df.columns:
    df = pd.merge(df, df_tmp, on=["stop_name", "direction"], how="left")

## Wybieramy cechy

Dodatkowo możemy niektóre cechy zignorować, bo jak sprawdzisz je dokładniej, to okaże się, że nie wnoszą wiele.

In [None]:
# Columns that must not be used as model inputs: the row id, the target itself
# and two identifier columns.
black_list = ["id", "delay", "vehicle_id", "trip_id"]

# Keep every numeric column of df that is not black-listed, in column order.
feats = [
    column
    for column in df.select_dtypes(include="number").columns
    if column not in black_list
]
feats

['stop',
 'number',
 'seq_num',
 'planned_time_hour',
 'stop_name_cat',
 'direction_cat',
 'planned_time_minute',
 'weekday',
 'weekend',
 'rush_hours',
 'mean_stop_name_delay',
 'median_stop_name_delay',
 'count_stop_name_delay',
 'std_stop_name_delay',
 'count_zeros_stop_name_delay',
 'prob_zeros_stop_name_delay',
 'mean_stop_name_direction_delay',
 'median_stop_name_direction_delay',
 'count_stop_name_direction_delay',
 'std_stop_name_direction_delay',
 'count_zeros_stop_name_direction_delay',
 'prob_zeros_stop_name_direction_delay']

## Przygotujemy `X` i `y`

In [None]:
# Rows with a known delay are the training set; rows without one are the rows
# to predict.
df_train = df[df["delay"].notna()].copy()
df_test = df[df["delay"].isna()].copy()

# Missing feature values are replaced with the sentinel -1 before the frames
# are converted to plain arrays.
X_train = df_train[feats].fillna(-1).to_numpy()
X_test = df_test[feats].fillna(-1).to_numpy()
y_train = df_train["delay"].to_numpy()

## Trenujemy i prognozujemy 

In [None]:
# Cross-validate the CatBoost configuration with 3 folds, scored by (negated) MAE.
# verbose=1000 prints one progress line per 1000 iterations instead of one per
# iteration, so the cell output is no longer flooded and truncated.
# NOTE(review): the group-delay statistics were computed from the whole training
# set before this split, so every validation fold has seen its own targets
# through those features — treat the CV score with caution.
model = cb.CatBoostRegressor(
    iterations=50000,
    max_depth=10,
    learning_rate=0.05,
    loss_function="MAE",
    task_type="GPU",
    devices='0',
    random_seed=0,  # explicit seed so repeated runs are comparable
    verbose=1000,
)
scores = cross_val_score(model, X_train, y_train, cv=3, scoring="neg_mean_absolute_error")
np.mean(scores), np.std(scores)

# Alternative configurations that were tried:
#model = xgb.XGBRegressor(max_depth=10, n_estimators=550, learning_rate=0.01, iterations = 10000, random_state=0, tree_method='gpu_hist', gpu_id=0)
#model = cb.CatBoostRegressor(loss_function="MAE", task_type="GPU", devices='0:1')
#model = cb.CatBoostRegressor(iterations=15000, learning_rate=0.03, loss_function="MAE")
#
# Experiment log (mean score, std):
#(-49.83719482511818, 0.8904963033404918) # XGBoost
# CatBoost with MAE: approx. 45 and 1.0
#(-49.14667958304572, 0.696173685716733) # CatBoost with RMSE
#(-46.90319682413995, 1.830070445891651) # CatBoost with MAE and the weekday feature, but without weekend
#(-48.53960690077103, 0.936604318545953) # XGBoost, starter 5
#(-48.4702523287905, 1.895779162618849) # CatBoost with MAE and the weekday and rush_hours features (45.04419 on Kaggle)
#(-44.64200758573867, 1.1749593071241966) # CatBoost with MAE and only the features from starter 5

[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
45000:	learn: 26.2822803	total: 5m 57s	remaining: 39.8s
45001:	learn: 26.2822590	total: 5m 58s	remaining: 39.8s
45002:	learn: 26.2822526	total: 5m 58s	remaining: 39.8s
45003:	learn: 26.2820864	total: 5m 58s	remaining: 39.7s
45004:	learn: 26.2819841	total: 5m 58s	remaining: 39.7s
45005:	learn: 26.2818520	total: 5m 58s	remaining: 39.7s
45006:	learn: 26.2815153	total: 5m 58s	remaining: 39.7s
45007:	learn: 26.2814769	total: 5m 58s	remaining: 39.7s
45008:	learn: 26.2812894	total: 5m 58s	remaining: 39.7s
45009:	learn: 26.2812575	total: 5m 58s	remaining: 39.7s
45010:	learn: 26.2800109	total: 5m 58s	remaining: 39.7s
45011:	learn: 26.2800024	total: 5m 58s	remaining: 39.7s
45012:	learn: 26.2799491	total: 5m 58s	remaining: 39.7s
45013:	learn: 26.2798660	total: 5m 58s	remaining: 39.7s
45014:	learn: 26.2798340	total: 5m 58s	remaining: 39.7s
45015:	learn: 26.2798106	total: 5m 58s	remaining: 39.6s
45016:	learn: 26.2794356	

(-57.570758576760845, 6.432189657514994)

Teraz wytrenujmy model na całym zbiorze treningowym i użyjmy go do prognozowania na zbiorze testowym.

In [None]:
# Fit the final model on all training rows and predict the test rows.
# verbose=1000 prints one progress line per 1000 iterations instead of one per
# iteration, so the cell output is no longer flooded and truncated.
# NOTE(review): this uses 100000 iterations while the cross-validation cell
# uses 50000, so the CV score does not describe exactly this configuration.
model = cb.CatBoostRegressor(
    iterations=100000,
    max_depth=10,
    learning_rate=0.05,
    loss_function="MAE",
    task_type="GPU",
    devices='0',
    random_seed=0,  # explicit seed so repeated runs are comparable
    verbose=1000,
)
#model = xgb.XGBRegressor(max_depth=5, n_estimators=50, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Negative predictions are floored at 0 before being stored as the delay.
y_pred[ y_pred < 0 ] = 0
df_test["delay"] = y_pred

[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
95000:	learn: 29.9655456	total: 13m 27s	remaining: 42.5s
95001:	learn: 29.9655427	total: 13m 27s	remaining: 42.5s
95002:	learn: 29.9654973	total: 13m 27s	remaining: 42.5s
95003:	learn: 29.9654774	total: 13m 27s	remaining: 42.5s
95004:	learn: 29.9654603	total: 13m 27s	remaining: 42.5s
95005:	learn: 29.9654490	total: 13m 27s	remaining: 42.5s
95006:	learn: 29.9654319	total: 13m 27s	remaining: 42.5s
95007:	learn: 29.9653126	total: 13m 27s	remaining: 42.4s
95008:	learn: 29.9652927	total: 13m 27s	remaining: 42.4s
95009:	learn: 29.9651592	total: 13m 27s	remaining: 42.4s
95010:	learn: 29.9651080	total: 13m 27s	remaining: 42.4s
95011:	learn: 29.9650796	total: 13m 27s	remaining: 42.4s
95012:	learn: 29.9650711	total: 13m 27s	remaining: 42.4s
95013:	learn: 29.9650682	total: 13m 27s	remaining: 42.4s
95014:	learn: 29.9650029	total: 13m 27s	remaining: 42.4s
95015:	learn: 29.9650001	total: 13m 27s	remaining: 42.4s
95016:	le

In [None]:
# Make sure the output directory exists (-p: no error if it is already there).
!mkdir -p ../output

## Ważność cech

In [None]:
# Show the feature weights of the fitted model, labelled with the names in `feats`.
eli5.show_weights(model, feature_names=feats)

Weight,Feature
0.2009,planned_time_hour
0.1046,planned_time_minute
0.0925,weekday
0.0767,number
0.0516,seq_num
0.0495,mean_stop_name_direction_delay
0.0449,direction_cat
0.0436,std_stop_name_direction_delay
0.0373,count_stop_name_direction_delay
0.0372,count_zeros_stop_name_direction_delay


##  Zapisujemy wynik  do .csv

Zapisz i wyślij to do Kaggle✔️. 

In [None]:
# Write the submission file: only the id and the predicted delay, without the index.
df_test.to_csv('../output/simple_catboost3.csv', columns=["id", "delay"], index=False)

In [None]:
# Display the list of feature names used by the submitted model.
feats

['stop',
 'number',
 'seq_num',
 'planned_time_hour',
 'stop_name_cat',
 'direction_cat',
 'planned_time_minute',
 'weekday',
 'weekend',
 'rush_hours',
 'mean_stop_name_delay',
 'median_stop_name_delay',
 'count_stop_name_delay',
 'std_stop_name_delay',
 'count_zeros_stop_name_delay',
 'prob_zeros_stop_name_delay',
 'mean_stop_name_direction_delay',
 'median_stop_name_direction_delay',
 'count_stop_name_direction_delay',
 'std_stop_name_direction_delay',
 'count_zeros_stop_name_direction_delay',
 'prob_zeros_stop_name_direction_delay']

In [None]:
# Peek at 10 randomly chosen values of the raw `datetime` column.
df['datetime'].sample(10)

131343    2018-07-27 23:37:46
51488     2018-07-25 06:50:54
197886                    NaN
78212     2018-07-25 16:59:47
265512                    NaN
187854                    NaN
275252                    NaN
280783                    NaN
167504    2018-07-30 19:10:28
100205    2018-07-27 08:50:43
Name: datetime, dtype: object