In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import utils.PreprocessingUtils as utils
from haversine import haversine, Unit

#### load data

In [2]:
data = pd.read_parquet('../data/raw_data_delays.parquet')
stations_gps = gpd.read_file('../data/spatial_data/stations_gps.shp', encoding='utf-8')

#### prepare raw data and join stations gps

In [3]:
data = utils.prepare_raw_data(data)
data = pd.merge(data, stations_gps, on='Stacja', how='left')

In [4]:
data = utils.geometry_point_to_lat_lon(data)

### station count
<ul>
    <li>full route station count</li>
    <li>station count on current station</li>
</ul>

In [5]:
data['station_count_on_curr_station'] = data.groupby(['pk', 'Relacja']).cumcount()
# data['full_route_station_count'] = data.groupby(['pk', 'Relacja'])['Relacja'].transform('count')

#### get routes info

In [6]:
routes_data = pd.read_parquet('../data/routes_data_all.parquet').iloc[:,:8]

In [7]:
dfs_out = []
pks = data['pk'].unique()

for pk in pks:
    temp_df = data[data['pk']==pk].copy().reset_index(drop=True)

    key1 = temp_df.groupby('Relacja')['Stacja'].agg(utils.unique_list_preserve_order).index.item()
    key2 = ', '.join(temp_df.groupby('Relacja')['Stacja'].agg(utils.unique_list_preserve_order)[0])
    key = f'{key1}_{key2}'
    temp_df['key'] = key

    temp_df['prev_stations'] = temp_df['Stacja'].shift(1)
    temp_df['next_stations'] = temp_df['Stacja'].shift(-1)

    temp_df_merged = temp_df.merge(routes_data, how='left', on=['key','Relacja', 'Stacja', 'lat', 'lon', 'prev_stations', 'next_stations'])
    dfs_out.append(temp_df_merged)

df_out = pd.concat(dfs_out, ignore_index=True)

In [8]:
# checkpoint = df_out.copy()

In [13]:
keys_to_remove = df_out[df_out['distances'].isna()]['key'].unique()
df_out = df_out[~df_out['key'].isin(keys_to_remove)].reset_index(drop=True)

### distances
<ul>
    <li>distance distance until current station</li>
    <li>distance from start station</li>
    <li>distance to final station</li>
    <li>distance distance from the nearest big city station</li>
</ul>

In [14]:
df_out['cumsum_distances'] = df_out.groupby('pk')['distances'].cumsum()

In [15]:
dfs_out = []
pks = df_out['pk'].unique()

for pk in pks:
    temp_df = df_out[df_out['pk']==pk].copy().reset_index(drop=True)
    temp_df['distance_to_finish'] = np.array(temp_df['cumsum_distances'].iloc[::-1])
    dfs_out.append(temp_df)
    
df_out = pd.concat(dfs_out, ignore_index=True)

In [16]:
big_city_names = pd.read_csv('../data/big_cities_dict.csv', sep=';').iloc[:,0].values
big_city_stations = stations_gps[stations_gps['Stacja'].isin(big_city_names)].reset_index(drop=True)
big_city_stations = utils.geometry_point_to_lat_lon(big_city_stations)

### spatial data

In [13]:
data = utils.join_spatial_data(data, voivodeship_borders, county_borders, borough_borders)

### date features and holidays

In [6]:
cols = ['arrival_on_time','departure_on_time']
for col in cols:
    data = utils.fix_dates(data, col)
    
data = utils.apply_date_features(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp[col_name] = new_dates


### longer stop duration

In [7]:
data['stop_duration'] = (data['departure_on_time'] - data['arrival_on_time']).dt.total_seconds()/60
for i in range(1, 7):
    data[f'stop_duration_lag{i}'] = data.groupby(['pk','Relacja'])['stop_duration'].transform(lambda x: x.shift(i))
    data[f'stop_duration_lag{i}'] = data[f'stop_duration_lag{i}'].fillna(-1)

### apply weather data

In [10]:
weather = pd.read_parquet('../data/weather_data.parquet')

# weather.drop(columns=['stations','source','tzoffset','datetimeEpoch'], inplace=True)
# weather['datetime_merge'] = pd.to_datetime(weather['date'].astype(str) + ' ' + weather['datetime'].astype(str))

In [9]:
data['datetime_merge'] = data['arrival_on_time'].dt.floor('H')

  data['datetime_merge'] = data['arrival_on_time'].dt.floor('H')


In [10]:
data = pd.merge(
    data,
    weather,
    how='left',
    on=['lat', 'lon', 'datetime_merge']
).drop(columns=['date','datetime_merge', 'datetime'])

### fixing weather cols

#### snow, windgust, visibility, solarradiation, solarenergy, uvindex

In [11]:
cols_to_fix = ['snow','windgust','visibility','solarradiation','solarenergy','uvindex']
for col in cols_to_fix:
    data[col] = data[col].fillna(-1)

#### preciptype

In [12]:
data['preciptype'] = data['preciptype'].fillna('None')
data['preciptype'] = data['preciptype'].apply(lambda x: '_'.join(map(str, x)) if isinstance(x, np.ndarray) else x)
preciptype_dummies = pd.get_dummies(data['preciptype'], prefix='preciptype', dtype=float)
data = pd.concat([data.drop('preciptype', axis=1), preciptype_dummies], axis=1)

#### conditions

In [13]:
data['conditions'] = data['conditions'].str.replace(', ', '_').str.replace(',', '_').str.replace(' ', '_')
condition_dummies = pd.get_dummies(data['conditions'], dtype=float, prefix="conditions")
data = pd.concat([data.drop('conditions', axis=1), condition_dummies], axis=1)

#### icon

In [14]:
icon_dummies = pd.get_dummies(data['icon'], dtype=float, prefix="icon")
data = pd.concat([data.drop('icon', axis=1), icon_dummies], axis=1)

In [16]:
# data.to_parquet('../prepared_data_with_weather.parquet')

In [12]:
# data.to_parquet('../data/data_with_weather.parquet')

In [39]:
# data = pd.read_parquet('../data/data_with_weather.parquet')

In [9]:
# data.to_parquet('../data/prepared_data.parquet')

In [3]:
# data = pd.read_parquet('../data/prepared_data.parquet')

In [18]:
data['ML_TARGET'] = np.select(
    [
        data['Opóźnienie przyjazdu']<=5,
        (data['Opóźnienie przyjazdu']>5) & (data['Opóźnienie przyjazdu']<=20),
        (data['Opóźnienie przyjazdu']>20) & (data['Opóźnienie przyjazdu']<=60),
        data['Opóźnienie przyjazdu']>60
     ],
    [
        0,
        1,
        2,
        3 
    ]
)

In [19]:
near_city_names_dummies = pd.get_dummies(data['near_city_station_name'], prefix='near_city', dtype=float)
data = pd.concat([data.drop('near_city_station_name', axis=1), near_city_names_dummies], axis=1)

### modeling

In [20]:
y = data['ML_TARGET']
X = data.iloc[:,9:].drop('ML_TARGET', axis=1)

In [21]:
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
model = CatBoostClassifier(random_state=123, max_depth=10)
model.fit(X_train, y_train)

Learning rate set to 0.112008
0:	learn: 1.1736322	total: 1.66s	remaining: 27m 33s
1:	learn: 1.0322685	total: 3.07s	remaining: 25m 32s
2:	learn: 0.9279770	total: 4.47s	remaining: 24m 45s
3:	learn: 0.8477425	total: 5.93s	remaining: 24m 35s
4:	learn: 0.7847109	total: 7.45s	remaining: 24m 43s
5:	learn: 0.7330908	total: 8.93s	remaining: 24m 38s
6:	learn: 0.6907860	total: 10.5s	remaining: 24m 47s
7:	learn: 0.6553453	total: 11.9s	remaining: 24m 38s
8:	learn: 0.6255994	total: 13.5s	remaining: 24m 46s
9:	learn: 0.6005107	total: 15s	remaining: 24m 49s
10:	learn: 0.5789129	total: 16.5s	remaining: 24m 44s
11:	learn: 0.5603227	total: 18s	remaining: 24m 39s
12:	learn: 0.5445023	total: 19.5s	remaining: 24m 38s
13:	learn: 0.5310053	total: 20.9s	remaining: 24m 29s
14:	learn: 0.5192207	total: 22.3s	remaining: 24m 25s
15:	learn: 0.5091604	total: 23.8s	remaining: 24m 20s
16:	learn: 0.5008856	total: 25.1s	remaining: 24m 13s
17:	learn: 0.4927972	total: 26.6s	remaining: 24m 10s
18:	learn: 0.4852900	total: 28

<catboost.core.CatBoostClassifier at 0x29247c79030>

In [23]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94    193648
           1       0.76      0.27      0.40     26044
           2       0.78      0.35      0.48      6711
           3       0.91      0.47      0.62      1241

    accuracy                           0.89    227644
   macro avg       0.84      0.52      0.61    227644
weighted avg       0.88      0.89      0.86    227644



In [25]:
roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

0.944761744795898

In [27]:
pd.DataFrame({
    'feature': X.columns,
    'importance': model.get_feature_importance()
}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
1,full_route_station_count,11.04968
15,hour_angle_sin,9.001687
6,full_route_distance,8.080405
39,pressure,6.175209
38,winddir,4.401237
4,distance_from_start,3.939657
17,monthday_angle_sin,3.657513
42,solarradiation,3.438269
16,weekday_angle_sin,3.384708
30,humidity,3.05397


In [24]:
from sklearn.preprocessing import LabelBinarizer

In [25]:
lb = LabelBinarizer()
y_test_lb = y_test.fit_transform(y_test)
y_test = y_test.fit_transform(y_test)

In [30]:
y_test0 = np.where(y_test==0, 0, 1)
y_test1 = np.where(y_test==1, 1, 0)
y_test2 = np.where(y_test==2, 1, 0)
y_test3 = np.where(y_test==3, 1, 0)

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[0])

ValueError: multiclass format is not supported

In [31]:
model.save_model('models/first_model')

### weather features

In [11]:
data

Unnamed: 0,pk,Relacja,Stacja,Opóźnienie przyjazdu,Opóźnienie odjazdu,arrival_on_time,departure_on_time,lat,lon,station_count_on_curr_station,full_route_station_count,distance_to_near_city_station,near_city_station_name,distance_to_prev_station,distance_from_start,distance_to_final,full_route_distance,stop_duration
0,1,Przeworsk - Rzeszów Główny,Przeworsk,0,0,2022-12-01 06:55:00,2022-12-01 06:55:00,50.067161,22.503232,0,10,35.612095,Rzeszów Główny,0.000000,0.000000,36.484418,36.484418,0.0
1,1,Przeworsk - Rzeszów Główny,Grzęska,0,0,2022-12-01 06:58:30,2022-12-01 06:59:00,50.080037,22.452564,1,10,32.152599,Rzeszów Główny,3.889077,3.889077,31.639766,36.484418,0.5
2,1,Przeworsk - Rzeszów Główny,Rogóżno koło Łańcuta,0,0,2022-12-01 07:03:00,2022-12-01 07:03:30,50.083573,22.362513,2,10,25.854248,Rzeszów Główny,6.437439,10.326516,28.265814,36.484418,0.5
3,1,Przeworsk - Rzeszów Główny,Kosina,0,0,2022-12-01 07:06:30,2022-12-01 07:07:00,50.083570,22.322846,3,10,23.070959,Rzeszów Główny,2.830260,13.156776,22.842960,36.484418,0.5
4,1,Przeworsk - Rzeszów Główny,Głuchów,0,0,2022-12-01 07:09:30,2022-12-01 07:10:00,50.083469,22.279077,4,10,20.013113,Rzeszów Główny,3.122964,16.279739,20.043487,36.484418,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138205,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Kraków Bieżanów,0,0,2023-04-23 08:52:12,2023-04-23 08:52:42,50.021694,20.029257,10,15,76.751356,Katowice,2.235773,19.677470,6.634256,24.366635,0.5
1138206,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Kraków Bieżanów Drożdżownia,0,0,2023-04-23 08:55:00,2023-04-23 08:55:30,50.010696,20.035373,11,15,77.593166,Katowice,1.298655,20.976125,4.328076,24.366635,0.5
1138207,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Wieliczka Bogucice,0,0,2023-04-23 08:57:24,2023-04-23 08:57:54,49.998474,20.036696,12,15,78.180632,Katowice,1.362311,22.338436,3.617016,24.366635,0.5
1138208,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Wieliczka Park,0,0,2023-04-23 09:00:06,2023-04-23 09:00:36,49.989105,20.049334,13,15,79.411243,Katowice,1.378941,23.717377,1.902019,24.366635,0.5


In [15]:
x = get_historical_weather(50.067161, 22.503232, '2022-12-01')

In [16]:
x

{'error': {'code': 2008, 'message': 'API key has been disabled.'}}