# In [1]:
import datetime
import numpy as np
import pandas as pd

# In [5]:
class NJCleaner():
    """Step-by-step cleaner for a train-delay CSV held in ``self.data``.

    Each step returns a DataFrame; ``prep_df`` chains the steps, storing
    every intermediate result back into ``self.data``.
    """

    def __init__(self, path):
        """Load the CSV file at ``path`` into ``self.data``."""
        self.data = pd.read_csv(path)

    def order_by_scheduled_time(self):
        """Return the data sorted by the ``scheduled_time`` column."""
        return self.data.sort_values(by=['scheduled_time'])

    def drop_columns_and_nan(self):
        """Return the data without ``from``/``to`` and without NaN rows."""
        return self.data.drop(columns=['from', 'to']).dropna()

    def convert_date_to_day(self):
        """Replace ``date`` with a ``day`` column holding the weekday name.

        ``self.data`` itself is modified in place (``date`` is parsed to
        datetime and ``day`` is added); the returned frame additionally
        has ``date`` dropped.
        """
        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data['day'] = self.data['date'].dt.day_name()
        return self.data.drop(columns=['date'])

    def convert_scheduled_time_to_part_of_the_day(self):
        """Replace ``scheduled_time`` with a ``part_of_the_day`` label.

        The hour of ``scheduled_time`` is bucketed into 4-hour windows:
        0-3 ``late_night``, 4-7 ``early_morning``, 8-11 ``morning``,
        12-15 ``afternoon``, 16-19 ``evening``, anything else ``night``.

        ``self.data`` itself is modified in place (``scheduled_time`` is
        parsed to datetime and ``part_of_the_day`` is added); the returned
        frame additionally has ``scheduled_time`` dropped.
        """
        self.data['scheduled_time'] = pd.to_datetime(self.data['scheduled_time'])
        hours = self.data['scheduled_time'].dt.hour

        labels = ['late_night', 'early_morning', 'morning', 'afternoon', 'evening']
        # One boolean mask per 4-hour window starting at 0, 4, 8, 12, 16.
        windows = [(hours >= start) & (hours < start + 4)
                   for start in range(0, 20, 4)]
        # Vectorized bucketing; hours matching no window (20-23, or a
        # missing hour) fall through to the 'night' default.
        self.data['part_of_the_day'] = np.select(windows, labels, default='night')

        return self.data.drop(columns=['scheduled_time'])

    def convert_delay(self):
        """Add a binary ``delay`` column: 1 if ``delay_minutes`` >= 5, else 0.

        Modifies ``self.data`` in place and returns it.
        """
        self.data['delay'] = np.where(self.data['delay_minutes'] >= 5.0, 1, 0)
        return self.data

    def drop_unnecessary_columns(self):
        """Return the data without ``train_id``, ``actual_time``, ``delay_minutes``."""
        return self.data.drop(columns=['train_id', 'actual_time', 'delay_minutes'])

    def save_first_60k(self, path):
        """Write the first 60000 rows of ``self.data`` to ``path`` as CSV."""
        self.data.head(60000).to_csv(path)

    def prep_df(self, path='data/NJ.csv'):
        """Run the full cleaning pipeline and save the first 60000 rows.

        Args:
            path: Destination CSV file for the saved rows.

        Returns:
            The fully cleaned DataFrame (all rows, not only the saved
            ones), which is also left in ``self.data``.
        """
        self.data = self.order_by_scheduled_time()
        self.data = self.drop_columns_and_nan()
        self.data = self.convert_date_to_day()
        self.data = self.convert_scheduled_time_to_part_of_the_day()
        self.data = self.convert_delay()
        self.data = self.drop_unnecessary_columns()
        # save_first_60k returns None, so its result must not be assigned
        # back to self.data or the cleaned frame would be lost.
        self.save_first_60k(path)
        return self.data






# Load the raw CSV, run the cleaning pipeline, and write the result file.
nj = NJCleaner(path='C:/bevadat/BEVADAT2022232/HAZI/HAZI06/2018_03.csv')
data = nj.prep_df(path='C:/bevadat/BEVADAT2022232/HAZI/HAZI06/NJ.csv')
# Bare expression: echoes whatever prep_df returned when run as a notebook cell.
data