In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import PreprocessingUtils as utils
from GeneralUtils import get_historical_weather

In [2]:
# Input files: the raw delay export and the station coordinate table.
DELAYS_PATH = '../data/raw_data_delays.parquet'
STATION_GPS_PATH = '../data/station_gps.csv'

data = pd.read_parquet(DELAYS_PATH)
gps = pd.read_csv(STATION_GPS_PATH)

In [3]:
# Run the project's raw-data preparation helper, then attach the columns of
# `gps` by station name. The left join keeps every delay row, even when `gps`
# has no entry for its 'Stacja'.
data = utils.prepare_raw_data(data).merge(gps, on='Stacja', how='left')

### station count
<ul>
    <li>full route station count</li>
    <li>number of stations already visited before the current station</li>
</ul>

In [4]:
# One group per (pk, Relacja) pair.
trip_groups = data.groupby(['pk', 'Relacja'])

# 0-based position of each row inside its group, in current row order.
stop_position = trip_groups.cumcount()
# Number of rows in the group, broadcast back onto every row of that group.
stops_in_trip = trip_groups['Relacja'].transform('count')

data['station_count_on_curr_station'] = stop_position
data['full_route_station_count'] = stops_in_trip

### distances
<ul>
    <li>full route distance</li>
    <li>distance from the previous station to the current station</li>
    <li>distance from start station</li>
    <li>distance to final station</li>
    <li>distance from the nearest big city station</li>
</ul>

In [5]:
# Delegate the distance features to the project helper, passing the station
# coordinate table alongside the delay frame.
# NOTE(review): presumably this adds the distance_* / near_city_station_name
# columns that appear in later cell outputs -- confirm in PreprocessingUtils.
data = utils.count_distances(data, gps)

### date features and holidays

In [6]:
# Timestamp columns passed one by one through utils.fix_dates.
# NOTE(review): the saved output of this cell shows a pandas
# SettingWithCopyWarning raised at `df_temp[col_name] = new_dates`, which is
# not in this notebook -- presumably inside utils.fix_dates; fix it there
# (work on an explicit .copy() or assign through .loc).
cols = ['arrival_on_time','departure_on_time']
for col in cols:
    data = utils.fix_dates(data, col)
    
# Apply the project's date-feature helper once both timestamp columns are fixed.
data = utils.apply_date_features(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp[col_name] = new_dates


### longer stop duration

In [8]:
# Time between arrival and departure at each stop, expressed in minutes.
dwell_time = data['departure_on_time'] - data['arrival_on_time']
data['stop_duration'] = dwell_time.dt.total_seconds() / 60

### modeling

In [39]:
# Routes ('Relacja') that have at least one row at Wrocław Główny.
stops_at_wroclaw = data['Stacja'].eq('Wrocław Główny')
relacje = data.loc[stops_at_wroclaw, 'Relacja'].unique()

# Keep every row of those routes (not only the Wrocław rows) as the modelling
# sample, with a fresh 0..n-1 index and its own copy of the data.
on_selected_route = data['Relacja'].isin(relacje)
example = data.loc[on_selected_route].reset_index(drop=True).copy()

### make ml target

In [40]:
arrival_delay = example['Opóźnienie przyjazdu']

# Ordinal delay classes: 0 = up to 5, 1 = (5, 20], 2 = (20, 60], 3 = above 60.
# np.select uses the first condition that holds, so each upper bound only
# needs to be tested once, in ascending order.
# Rows matching no condition (e.g. a missing delay) get np.select's default, 0.
delay_class_conditions = [
    arrival_delay.le(5),
    arrival_delay.le(20),
    arrival_delay.le(60),
    arrival_delay.gt(60),
]
delay_class_labels = [0, 1, 2, 3]

example['ML_TARGET'] = np.select(delay_class_conditions, delay_class_labels)

In [41]:
# One-hot encode the route name as float indicator columns prefixed 'Relacja_',
# then swap the original text column for them.
relation_dummies = pd.get_dummies(
    example['Relacja'],
    prefix='Relacja',
    dtype=float,
)
example_without_relation = example.drop(columns='Relacja')
example = pd.concat([example_without_relation, relation_dummies], axis='columns')

In [42]:
# One-hot encode the nearest big-city station name as float indicator columns,
# then swap the original text column for them.
station_dummies = pd.get_dummies(
    example['near_city_station_name'],
    prefix='near_city_station_name',
    dtype=float,
)
example_without_station = example.drop(columns='near_city_station_name')
example = pd.concat([example_without_station, station_dummies], axis='columns')

In [43]:
# Inspect the encoded modelling frame; the notebook renders a truncated view
# (first/last rows and columns) rather than the whole table.
example

Unnamed: 0,pk,Stacja,Opóźnienie przyjazdu,Opóźnienie odjazdu,arrival_on_time,departure_on_time,lat,lon,station_count_on_curr_station,full_route_station_count,...,near_city_station_name_Katowice,near_city_station_name_Kołobrzeg,near_city_station_name_Lublin Główny,near_city_station_name_Olsztyn Główny,near_city_station_name_Poznań Główny,near_city_station_name_Rzeszów Główny,near_city_station_name_Szczecin Główny,near_city_station_name_Warszawa Centralna,near_city_station_name_Wrocław Główny,near_city_station_name_Łódź Widzew
0,4,Ostrów Wielkopolski,0,0,2022-12-01 05:23:00,2022-12-01 05:23:00,51.649247,17.805625,0,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4,Topola-Osiedle,0,0,2022-12-01 05:28:30,2022-12-01 05:29:00,51.612860,17.761920,1,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4,Tarchały Wielkie,0,0,2022-12-01 05:33:00,2022-12-01 05:33:30,51.587130,17.701904,2,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,Odolanów,0,0,2022-12-01 05:36:30,2022-12-01 05:37:00,51.574752,17.667913,3,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,Garki,0,0,2022-12-01 05:40:30,2022-12-01 05:41:00,51.540548,17.648767,4,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230343,52125,Wałbrzych Główny,7,7,2023-04-23 07:45:30,2023-04-23 07:46:30,50.743393,16.280690,2,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
230344,52125,Wałbrzych Miasto,8,8,2023-04-23 07:56:00,2023-04-23 07:58:00,50.786229,16.285234,3,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
230345,52125,Świebodzice,9,8,2023-04-23 08:10:24,2023-04-23 08:12:24,50.860193,16.332831,4,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
230346,52125,Jaworzyna Śląska,5,5,2023-04-23 08:19:00,2023-04-23 08:23:00,50.912468,16.428852,5,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [48]:
# Columns that are not model inputs: trip/station identifiers, the observed
# delays (the target is derived from 'Opóźnienie przyjazdu'), the raw
# timestamps and the raw coordinates. These are the eight leading columns that
# the previous positional slice `example.iloc[:, 8:]` skipped. Naming them
# keeps the feature set stable if an upstream step adds or reorders columns.
# NOTE(review): based on the column order shown in the saved `example` output;
# confirm no other leading column existed when this cell was last run.
NON_FEATURE_COLUMNS = [
    'pk',
    'Stacja',
    'Opóźnienie przyjazdu',
    'Opóźnienie odjazdu',
    'arrival_on_time',
    'departure_on_time',
    'lat',
    'lon',
]

# Target: the ordinal delay class built earlier in the notebook.
y = example['ML_TARGET']
# Features: everything else, with the target itself removed as well.
X = example.drop(columns=NON_FEATURE_COLUMNS + ['ML_TARGET'])

In [50]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Hold out 20% of the rows for final evaluation. Stratifying on the target
# keeps each delay class at the same proportion in the train and test parts,
# which an unstratified random split does not guarantee for the rarer classes.
# NOTE(review): rows of the same train run (same 'pk') can still land in both
# parts; consider a group-aware split if that counts as leakage here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)
model = RandomForestClassifier(random_state=123)

# 3 * 3 * 3 * 3 = 81 hyper-parameter combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 81 candidates x 5 folds = 405 forest fits (plus the final refit). n_jobs=-1
# spreads them over all available cores instead of running them one at a time;
# the selected parameters are unaffected because every fit is seeded.
cv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
cv.fit(X_train, y_train)

### weather features

In [11]:
# Inspect the full feature frame (rendered truncated) before adding weather
# features; the weather lookup below reuses the first row's lat/lon and date.
data

Unnamed: 0,pk,Relacja,Stacja,Opóźnienie przyjazdu,Opóźnienie odjazdu,arrival_on_time,departure_on_time,lat,lon,station_count_on_curr_station,full_route_station_count,distance_to_near_city_station,near_city_station_name,distance_to_prev_station,distance_from_start,distance_to_final,full_route_distance,stop_duration
0,1,Przeworsk - Rzeszów Główny,Przeworsk,0,0,2022-12-01 06:55:00,2022-12-01 06:55:00,50.067161,22.503232,0,10,35.612095,Rzeszów Główny,0.000000,0.000000,36.484418,36.484418,0.0
1,1,Przeworsk - Rzeszów Główny,Grzęska,0,0,2022-12-01 06:58:30,2022-12-01 06:59:00,50.080037,22.452564,1,10,32.152599,Rzeszów Główny,3.889077,3.889077,31.639766,36.484418,0.5
2,1,Przeworsk - Rzeszów Główny,Rogóżno koło Łańcuta,0,0,2022-12-01 07:03:00,2022-12-01 07:03:30,50.083573,22.362513,2,10,25.854248,Rzeszów Główny,6.437439,10.326516,28.265814,36.484418,0.5
3,1,Przeworsk - Rzeszów Główny,Kosina,0,0,2022-12-01 07:06:30,2022-12-01 07:07:00,50.083570,22.322846,3,10,23.070959,Rzeszów Główny,2.830260,13.156776,22.842960,36.484418,0.5
4,1,Przeworsk - Rzeszów Główny,Głuchów,0,0,2022-12-01 07:09:30,2022-12-01 07:10:00,50.083469,22.279077,4,10,20.013113,Rzeszów Główny,3.122964,16.279739,20.043487,36.484418,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138205,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Kraków Bieżanów,0,0,2023-04-23 08:52:12,2023-04-23 08:52:42,50.021694,20.029257,10,15,76.751356,Katowice,2.235773,19.677470,6.634256,24.366635,0.5
1138206,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Kraków Bieżanów Drożdżownia,0,0,2023-04-23 08:55:00,2023-04-23 08:55:30,50.010696,20.035373,11,15,77.593166,Katowice,1.298655,20.976125,4.328076,24.366635,0.5
1138207,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Wieliczka Bogucice,0,0,2023-04-23 08:57:24,2023-04-23 08:57:54,49.998474,20.036696,12,15,78.180632,Katowice,1.362311,22.338436,3.617016,24.366635,0.5
1138208,52128,Kraków Lotnisko - Wieliczka Rynek-Kopalnia,Wieliczka Park,0,0,2023-04-23 09:00:06,2023-04-23 09:00:36,49.989105,20.049334,13,15,79.411243,Katowice,1.378941,23.717377,1.902019,24.366635,0.5


In [2]:
# Probe the weather helper with the first row of `data`: Przeworsk's
# coordinates and the date of that stop.
probe_lat, probe_lon = 50.067161, 22.503232
probe_date = '2022-12-01'
x = get_historical_weather(probe_lat, probe_lon, probe_date)

In [3]:
# Show the raw response from get_historical_weather. The saved output is an
# error payload ('API key has been disabled.'), so no weather data was
# retrieved in this run.
x

{'error': {'code': 2008, 'message': 'API key has been disabled.'}}