## Libraries


In [1]:
import sys
# Add the parent directory to the module search path so the `utils` import
# below can be resolved (presumably `utils/` sits one level above this
# notebook -- confirm against the repository layout).
sys.path.append('..')
import pandas as pd
import numpy as np
import math
from utils import preprocessing as prep

import warnings
# Show every column when a dataframe is rendered instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# Ignore warnings whose message starts with the given text (the second
# positional argument is a regex matched against the start of the message).
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

## Loading data

In [2]:
# Small raw dataset: a single week of records (per the file name, 6-12 March 2023).
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df_full_small = pd.read_csv(
    '../data/Data_20230306_20230312.csv',
    low_memory=False,
)

In [3]:
# Large raw dataset: four monthly exports (per the file names, January-April 2023).
# Each month is read into its own frame (df1..df4 stay available), in
# chronological order, with the same read options for every file.
df1, df2, df3, df4 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/Data_20230101_20230131.csv',
        '../data/Data_20230201_20230228.csv',
        '../data/Data_20230301_20230331.csv',
        '../data/Data_20230401_20230430.csv',
    )
)

# Stack the months row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
monthly_frames = None  # placeholder removed below; see concat call
del monthly_frames
df_full_large = pd.concat([df1, df2, df3, df4], ignore_index=True)

## Preprocessing data

In [4]:
# Preprocess the one-week dataframe.
#
# The pipeline starts from a copy of the raw frame and applies every step from
# utils.preprocessing in sequence; each step receives the frame returned by the
# previous one. `.pipe(f, arg)` calls `f(frame, arg)`.
workable_df_small = (
    df_full_small.copy()
    .pipe(prep.set_types)
    .pipe(prep.calc_turnover)
    .pipe(prep.remove_unnecessary_rows)
    .pipe(prep.add_max_departure_time)
    .pipe(prep.calculate_signal_safe_delay)
    .pipe(prep.filter_outliers)
    .pipe(prep.categorise_combine_spilt)
    .pipe(prep.determine_daluren)
    .pipe(prep.days_and_hours)
    # cyclical encoding is applied once per calendar feature
    .pipe(prep.cyclical_encoder, "DAY_OF_WEEK")
    .pipe(prep.cyclical_encoder, "HOUR")
    .pipe(prep.calc_needed_turnover)
    .pipe(prep.add_cat_diff_turnover_time)
    .pipe(prep.remove_past_3_min)
)

# Preview the first rows of the preprocessed frame.
workable_df_small.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nan["PLAN_TURNOVER_TIME"] = plan_turnover_time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nan["REALIZED_TURNOVER_TIME"] = real_turnover_time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["PLAN_TURNOVER_TIME"] = plan_turnover_time
A value is trying to be set on a copy of a sli

Unnamed: 0,TRAFFIC_DATE,TRAINNUMBER,TRAINSERIE,TRAINSERIE_DIRECTION,STATION,ACTIVITYTYPE,PLAN_DATETIME,REALIZED_DATETIME,DELAY,TURNOVER_INDICATOR,PREVIOUS_TRAINNUMBER,COMBINE,SPLIT,ROLLINGSTOCK_TYPE,NUMBER_CARRIAGES,DRIVER_CHANGE,DEPARTURE_SIGNAL_SHOWS_SAFE,PLAN_TURNOVER_TIME,REALIZED_TURNOVER_TIME,MAX_DEPARTURE_TIME,PLAN_SIGNAL_SAFE_DELAY,REALIZED_SIGNAL_SAFE_DELAY,DALUREN,DAY_OF_WEEK,HOUR,DAY_OF_WEEK_sin,DAY_OF_WEEK_cos,HOUR_sin,HOUR_cos,NEEDED_PLAN_TURNOVER_TIME,NEEDED_REALIZED_TURNOVER_TIME,DIFF_TURNOVER_TIME,DIFF_TURNOVER_TIME_CAT
40,2023-03-06,519,500.0,O,Ut,V,2023-03-06 06:49:00+00:00,2023-03-06 06:49:38+00:00,38.0,1,,0,0,ICM,7.0,1.0,2023-03-06 06:48:07+00:00,420.0,446.0,,-53.0,-91.0,False,0,6,0.0,1.0,1.0,6.123234000000001e-17,442.0,468.0,-26.0,perfect
78,2023-03-06,523,500.0,O,Ut,V,2023-03-06 07:49:00+00:00,2023-03-06 07:48:58+00:00,-2.0,1,,0,0,ICM,8.0,1.0,2023-03-06 07:48:08+00:00,420.0,363.0,,-52.0,-50.0,False,0,7,0.0,1.0,0.965926,-0.258819,482.0,425.0,57.0,perfect
94,2023-03-06,524,500.0,E,Ut,V,2023-03-06 09:18:00+00:00,2023-03-06 09:18:04+00:00,4.0,1,,0,0,DDZ,6.0,0.0,2023-03-06 09:17:07+00:00,420.0,384.0,,-53.0,-57.0,True,0,9,0.0,1.0,0.707107,-0.7071068,476.0,440.0,36.0,perfect
106,2023-03-06,527,500.0,O,Ut,V,2023-03-06 08:49:00+00:00,2023-03-06 08:49:15+00:00,15.0,1,,0,0,ICM,6.0,1.0,2023-03-06 08:48:07+00:00,420.0,419.0,,-53.0,-68.0,False,0,8,0.0,1.0,0.866025,-0.5,465.0,464.0,1.0,perfect
134,2023-03-06,531,500.0,O,Ut,V,2023-03-06 09:49:00+00:00,2023-03-06 09:49:11+00:00,11.0,1,,0,0,ICM,7.0,1.0,2023-03-06 09:48:07+00:00,420.0,473.0,,-53.0,-64.0,True,0,9,0.0,1.0,0.707107,-0.7071068,469.0,522.0,-53.0,perfect


In [5]:
# Preprocess the four-month dataframe.
#
# Same sequence of utils.preprocessing steps as used for the small frame:
# start from a copy of the raw data and feed each step the frame returned by
# the previous one. `.pipe(f, arg)` calls `f(frame, arg)`.
workable_df_large = (
    df_full_large.copy()
    .pipe(prep.set_types)
    .pipe(prep.calc_turnover)
    .pipe(prep.remove_unnecessary_rows)
    .pipe(prep.add_max_departure_time)
    .pipe(prep.calculate_signal_safe_delay)
    .pipe(prep.filter_outliers)
    .pipe(prep.categorise_combine_spilt)
    .pipe(prep.determine_daluren)
    .pipe(prep.days_and_hours)
    # cyclical encoding is applied once per calendar feature
    .pipe(prep.cyclical_encoder, "DAY_OF_WEEK")
    .pipe(prep.cyclical_encoder, "HOUR")
    .pipe(prep.calc_needed_turnover)
    .pipe(prep.add_cat_diff_turnover_time)
    .pipe(prep.remove_past_3_min)
)

# Preview the first rows of the preprocessed frame.
workable_df_large.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nan["PLAN_TURNOVER_TIME"] = plan_turnover_time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nan["REALIZED_TURNOVER_TIME"] = real_turnover_time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["PLAN_TURNOVER_TIME"] = plan_turnover_time
A value is trying to be set on a copy of a sli

Unnamed: 0,TRAFFIC_DATE,TRAINNUMBER,TRAINSERIE,TRAINSERIE_DIRECTION,STATION,ACTIVITYTYPE,DISTANCE_M,PLAN_DATETIME,REALIZED_DATETIME,DELAY,TURNOVER_INDICATOR,PREVIOUS_TRAINNUMBER,COMBINE,SPLIT,ROLLINGSTOCK_TYPE,NUMBER_CARRIAGES,DRIVER_CHANGE,DEPARTURE_SIGNAL_SHOWS_SAFE,PLAN_TURNOVER_TIME,REALIZED_TURNOVER_TIME,MAX_DEPARTURE_TIME,PLAN_SIGNAL_SAFE_DELAY,REALIZED_SIGNAL_SAFE_DELAY,DALUREN,DAY_OF_WEEK,HOUR,DAY_OF_WEEK_sin,DAY_OF_WEEK_cos,HOUR_sin,HOUR_cos,NEEDED_PLAN_TURNOVER_TIME,NEEDED_REALIZED_TURNOVER_TIME,DIFF_TURNOVER_TIME,DIFF_TURNOVER_TIME_CAT
59,2023-01-01,650,600.0,E,Ut,V,0.0,2023-01-01 15:48:00+00:00,2023-01-01 15:48:10+00:00,10.0,1,,0,0,ICM,7.0,1.0,2023-01-01 15:47:06+00:00,1200.0,1175.0,,-54.0,-64.0,True,6,15,-0.781831,0.62349,-0.7071068,-0.707107,1250.0,1225.0,25.0,perfect
66,2023-01-01,657,600.0,O,Ut,V,0.0,2023-01-01 16:19:00+00:00,2023-01-01 16:19:03+00:00,3.0,1,,0,0,ICM,6.0,1.0,2023-01-01 16:18:07+00:00,2130.0,2086.0,,-53.0,-56.0,True,6,16,-0.781831,0.62349,-0.8660254,-0.5,2187.0,2143.0,44.0,perfect
288,2023-01-01,536,500.0,E,Ut,V,0.0,2023-01-01 12:18:00+00:00,2023-01-01 12:18:21+00:00,21.0,1,,0,0,ICM,7.0,1.0,2023-01-01 12:17:07+00:00,420.0,393.0,,-53.0,-74.0,True,6,12,-0.781831,0.62349,1.224647e-16,-1.0,459.0,432.0,27.0,perfect
293,2023-01-01,540,500.0,E,Ut,V,0.0,2023-01-01 13:18:00+00:00,2023-01-01 13:18:00+00:00,0.0,1,,0,0,ICM,6.0,1.0,2023-01-01 13:17:07+00:00,420.0,391.0,,-53.0,-53.0,True,6,13,-0.781831,0.62349,-0.258819,-0.965926,480.0,451.0,29.0,perfect
305,2023-01-01,555,500.0,O,Ut,V,0.0,2023-01-01 15:49:00+00:00,2023-01-01 15:49:45+00:00,45.0,1,,0,0,ICM,6.0,1.0,2023-01-01 15:48:06+00:00,420.0,255.0,,-54.0,-99.0,True,6,15,-0.781831,0.62349,-0.7071068,-0.707107,435.0,270.0,165.0,too late


## Save preprocessed data

In [6]:
# Date tag (DD-MM-YYYY) that the save cells below embed in the output file names.
# NOTE(review): this is a fixed string, not today's date, so re-running the
# notebook on another day writes to the same 23-05-2024 files -- confirm that
# this is intended before relying on the name as a run date.
current_date = '23-05-2024'

In [7]:
# Persist the preprocessed one-week dataframe.

# Output path: fixed prefix + date tag + extension.
file_name_small = ''.join(
    ('../data/small_preprocessed_week_', current_date, '.csv')
)

# Written with to_csv defaults, so the dataframe index is stored as the first column.
workable_df_small.to_csv(file_name_small)

In [8]:
# Persist the preprocessed four-month dataframe.

# Output path: fixed prefix + date tag + extension.
# NOTE(review): the prefix says "week" although this frame holds four months
# of data -- looks like a leftover from the small-frame cell; the name is kept
# as-is because other notebooks may read this exact path. Confirm before renaming.
file_name_large = ''.join(
    ('../data/large_preprocessed_week_', current_date, '.csv')
)

# Written with to_csv defaults, so the dataframe index is stored as the first column.
workable_df_large.to_csv(file_name_large)