In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils.PreprocessingUtils as utils
from utils.GeneralUtils import get_historical_weather

In [2]:
data = pd.read_parquet('../data/raw_data_delays.parquet')
gps = pd.read_csv('../data/station_gps.csv')

In [3]:
data = utils.prepare_raw_data(data)
data = pd.merge(data, gps, on='Stacja', how='left')

### station count
<ul>
    <li>full route station count</li>
    <li>station count on current station</li>
</ul>

In [4]:
data['station_count_on_curr_station'] = data.groupby(['pk', 'Relacja']).cumcount()
data['full_route_station_count'] = data.groupby(['pk', 'Relacja'])['Relacja'].transform('count')

### distances
<ul>
    <li>full route distance</li>
    <li>distance distance until current station</li>
    <li>distance from start station</li>
    <li>distance to final station</li>
    <li>distance distance from the nearest big city station</li>
</ul>

In [5]:
data = utils.count_distances(data, gps)

### date features and holidays

In [6]:
cols = ['arrival_on_time','departure_on_time']
for col in cols:
    data = utils.fix_dates(data, col)
    
data = utils.apply_date_features(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp[col_name] = new_dates


### longer stop duration

In [7]:
data['stop_duration'] = (data['departure_on_time'] - data['arrival_on_time']).dt.total_seconds()/60
for i in range(1, 7):
    data[f'stop_duration_lag{i}'] = data.groupby(['pk','Relacja'])['stop_duration'].transform(lambda x: x.shift(i))
    data[f'stop_duration_lag{i}'] = data[f'stop_duration_lag{i}'].fillna(-1)

In [9]:
# data.to_parquet('../data/prepared_data.parquet')

In [2]:
data = pd.read_parquet('../data/prepared_data.parquet')

In [3]:
data['ML_TARGET'] = np.select(
    [
        data['Opóźnienie przyjazdu']<=5,
        (data['Opóźnienie przyjazdu']>5) & (data['Opóźnienie przyjazdu']<=20),
        (data['Opóźnienie przyjazdu']>20) & (data['Opóźnienie przyjazdu']<=60),
        data['Opóźnienie przyjazdu']>60
     ],
    [
        0,
        1,
        2,
        3 
    ]
)

In [4]:
near_city_names_dummies = pd.get_dummies(data['near_city_station_name'], prefix='near_city', dtype=float)
data = pd.concat([data.drop('near_city_station_name', axis=1), near_city_names_dummies], axis=1)

### modeling

In [8]:
y = data['ML_TARGET']
X = data.iloc[:,9:].drop('ML_TARGET', axis=1)

In [29]:
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
model = CatBoostClassifier(random_state=123, max_depth=10)
model.fit(X_train, y_train)

Learning rate set to 0.112008
0:	learn: 1.1740528	total: 2.59s	remaining: 43m 11s
1:	learn: 1.0305080	total: 4.32s	remaining: 35m 56s
2:	learn: 0.9265809	total: 5.94s	remaining: 32m 53s
3:	learn: 0.8471693	total: 7.71s	remaining: 32m
4:	learn: 0.7836545	total: 9.04s	remaining: 29m 59s
5:	learn: 0.7322920	total: 10.1s	remaining: 27m 59s
6:	learn: 0.6896603	total: 11.2s	remaining: 26m 27s
7:	learn: 0.6542049	total: 12.3s	remaining: 25m 23s
8:	learn: 0.6246839	total: 13.4s	remaining: 24m 30s
9:	learn: 0.5998032	total: 14.4s	remaining: 23m 48s
10:	learn: 0.5790403	total: 15.5s	remaining: 23m 11s
11:	learn: 0.5606539	total: 16.6s	remaining: 22m 43s
12:	learn: 0.5449211	total: 17.6s	remaining: 22m 15s
13:	learn: 0.5315532	total: 18.6s	remaining: 21m 46s
14:	learn: 0.5194860	total: 19.6s	remaining: 21m 24s
15:	learn: 0.5097106	total: 20.5s	remaining: 21m 1s
16:	learn: 0.5008574	total: 21.5s	remaining: 20m 43s
17:	learn: 0.4928348	total: 22.5s	remaining: 20m 28s
18:	learn: 0.4861271	total: 23.

<catboost.core.CatBoostClassifier at 0x1b537b73370>

In [12]:
y_pred = model.predict(X_test)

In [20]:
y_pred_proba = model.predict_proba(X_test)

array([[6.53007620e-01, 3.37427714e-01, 9.43006358e-03, 1.34602220e-04],
       [9.24055425e-01, 7.23389445e-02, 3.58032895e-03, 2.53013911e-05],
       [3.66635618e-01, 5.78540136e-01, 5.32133238e-02, 1.61092303e-03],
       ...,
       [9.22697666e-01, 7.41128378e-02, 3.12303960e-03, 6.64564618e-05],
       [9.69816373e-01, 2.90746800e-02, 1.10386734e-03, 5.08011517e-06],
       [9.24781698e-01, 7.39243872e-02, 1.26977663e-03, 2.41376743e-05]])

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94    193326
           1       0.70      0.23      0.35     26277
           2       0.69      0.28      0.40      6840
           3       0.82      0.32      0.46      1199

    accuracy                           0.88    227642
   macro avg       0.77      0.46      0.54    227642
weighted avg       0.86      0.88      0.85    227642


In [22]:
roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

0.9327520129876058

In [23]:
pd.DataFrame({
    'feature': X.columns,
    'importance': model.get_feature_importance()
})

Unnamed: 0,feature,importance
0,station_count_on_curr_station,2.237888
1,full_route_station_count,12.651376
2,distance_to_near_city_station,3.609216
3,distance_to_prev_station,1.576187
4,distance_from_start,3.958029
5,distance_to_final,3.030528
6,full_route_distance,8.606147
7,days_until_christmas,7.343056
8,weekday_Friday,1.30278
9,weekday_Monday,1.532559


In [24]:
from sklearn.preprocessing import LabelBinarizer

In [25]:
lb = LabelBinarizer()
y_test_lb = y_test.fit_transform(y_test)
y_test = y_test.fit_transform(y_test)

In [30]:
y_test0 = np.where(y_test==0, 0, 1)
y_test1 = np.where(y_test==1, 1, 0)
y_test2 = np.where(y_test==2, 1, 0)
y_test3 = np.where(y_test==3, 1, 0)

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[0])

ValueError: multiclass format is not supported

In [31]:
model.save_model('models/first_model')

### weather features

In [11]:
data

Unnamed: 0,pk,Relacja,Stacja,Opóźnienie przyjazdu,Opóźnienie odjazdu,arrival_on_time,departure_on_time,lat,lon,station_count_on_curr_station,full_route_station_count,distance_to_near_city_station,near_city_station_name,distance_to_prev_station,distance_from_start,distance_to_final,full_route_distance,stop_duration
0,1,Przeworsk - Rzeszów Główny,Przeworsk,0,0,2022-12-01 06:55:00,2022-12-01 06:55:00,50.067161,22.503232,0,10,35.612095,Rzeszów Główny,0.000000,0.000000,36.484418,36.484418,0.0
1,1,Przeworsk - Rzeszów Główny,Grzęska,0,0,2022-12-01 06:58:30,2022-12-01 06:59:00,50.080037,22.452564,1,10,32.152599,Rzeszów Główny,3.889077,3.889077,31.639766,36.484418,0.5
2,1,Przeworsk - Rzeszów Główny,Rogóżno koło Łańcuta,0,0,2022-12-01 07:03:00,2022-12-01 07:03:30,50.083573,22.362513,2,10,25.854248,Rzeszów Główny,6.437439,10.326516,28.265814,36.484418,0.5
3,1,Przeworsk - Rzeszów Główny,Kosina,0,0,2022-12-01 07:06:30,2022-12-01 07:07:00,50.083570,22.322846,3,10,23.070959,Rzeszów Główny,2.830260,13.156776,22.842960,36.484418,0.5
4,1,Przeworsk - Rzeszów Główny,Głuchów,0,0,2022-12-01 07:09:30,2022-12-01 07:10:00,50.083469,22.279077,4,10,20.013113,Rzeszów Główny,3.122964,16.279739,20.043487,36.484418,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138205,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Kraków Bieżanów,0,0,2023-04-23 08:52:12,2023-04-23 08:52:42,50.021694,20.029257,10,15,76.751356,Katowice,2.235773,19.677470,6.634256,24.366635,0.5
1138206,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Kraków Bieżanów Drożdżownia,0,0,2023-04-23 08:55:00,2023-04-23 08:55:30,50.010696,20.035373,11,15,77.593166,Katowice,1.298655,20.976125,4.328076,24.366635,0.5
1138207,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Wieliczka Bogucice,0,0,2023-04-23 08:57:24,2023-04-23 08:57:54,49.998474,20.036696,12,15,78.180632,Katowice,1.362311,22.338436,3.617016,24.366635,0.5
1138208,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Wieliczka Park,0,0,2023-04-23 09:00:06,2023-04-23 09:00:36,49.989105,20.049334,13,15,79.411243,Katowice,1.378941,23.717377,1.902019,24.366635,0.5


In [15]:
x = get_historical_weather(50.067161, 22.503232, '2022-12-01')

In [16]:
x

{'error': {'code': 2008, 'message': 'API key has been disabled.'}}