In [3]:
import pandas as pd
import datetime

# ---- Load the demand extract, then tidy headers and values ----
df = pd.read_csv("../CRIS_data/train_demand/demand_22222.csv")

# Headers: trim surrounding whitespace first, then drop any single quotes.
stripped_headers = df.columns.str.strip()
df.columns = stripped_headers.str.replace("'", "")

# Both date columns are parsed into pandas datetimes.
for col in ('booking_date', 'journey_date'):
    df[col] = pd.to_datetime(df[col])

# Code-like columns: force to str, trim whitespace, remove single quotes.
for col in ('trnno', 'brdpt_code', 'resupto_code', 'cls'):
    as_text = df[col].astype(str)
    df[col] = as_text.str.strip().str.replace("'", "")

# Passenger count as an integer column.
df['PSGN'] = df['PSGN'].astype(int)

# ---- Holiday dictionary ----
# Maps "DD-MM-YYYY" strings to a holiday label. check_peak() only tests key
# membership, so the labels are informational.
#
# Keys must be unique: a repeated key in a dict literal silently keeps only the
# last value. "15-08-2024" used to appear twice (Independence Day and Raksha
# Bandhan); Raksha Bandhan now has its own date, 19-08-2024.
#
# NOTE(review): several festival dates below look questionable (e.g. the "Eid"
# entries, Holi 2023/2025) -- confirm against an official holiday calendar.
holidays = {
    # 2023
    "26-01-2023": "Republic Day",
    "15-08-2023": "Independence Day",
    "02-10-2023": "Gandhi Jayanti",
    "25-12-2023": "Christmas",
    "12-11-2023": "Diwali",
    "07-03-2023": "Holi",
    "05-09-2023": "Eid",
    "15-09-2023": "Yashu Bday",

    # 2024
    "26-01-2024": "Republic Day",
    "15-08-2024": "Independence Day",
    "02-10-2024": "Gandhi Jayanti",
    "25-12-2024": "Christmas",
    "31-10-2024": "Diwali",
    "19-08-2024": "Raksha Bandhan",
    "26-08-2024": "Janmashtami",
    "25-03-2024": "Holi",

    # 2025
    "26-01-2025": "Republic Day",
    "15-08-2025": "Independence Day",
    "02-10-2025": "Gandhi Jayanti",
    "25-12-2025": "Christmas",
    "20-10-2025": "Diwali",
    "31-03-2025": "Holi",
    "05-09-2025": "Eid",
    "15-09-2025": "Yashu Bday"
}

# ---- Helper function for peak day ----
def check_peak(date_obj, holiday_dates=None):
    """Flag whether a journey date counts as a peak travel day.

    A date is peak when any of the following holds:
      * it is itself a listed holiday,
      * it falls on a Saturday or Sunday,
      * it is a Thursday and the next day (Friday) is a holiday,
      * it is a Friday and the following Monday is a holiday.

    Parameters
    ----------
    date_obj : datetime.date, datetime.datetime or pandas.Timestamp
        Any object supporting ``strftime``, ``weekday`` and ``+ timedelta``.
    holiday_dates : container of str, optional
        Holiday dates formatted as ``"DD-MM-YYYY"``; only membership is
        tested, so a dict, set or list all work. When omitted, the
        notebook-level ``holidays`` mapping is used.

    Returns
    -------
    int
        1 for a peak day, 0 otherwise.
    """
    # Resolve the default lazily so the function also works stand-alone
    # when an explicit container is supplied.
    if holiday_dates is None:
        holiday_dates = holidays

    def is_holiday(day):
        return day.strftime("%d-%m-%Y") in holiday_dates

    weekday = date_obj.weekday()  # Monday=0 ... Sunday=6

    # The date itself is a holiday, or it is a weekend (Sat=5, Sun=6).
    if is_holiday(date_obj) or weekday >= 5:
        return 1

    # Thursday before a Friday holiday (Thursday + 1 day is always Friday).
    if weekday == 3:
        return int(is_holiday(date_obj + datetime.timedelta(days=1)))

    # Friday before a Monday holiday (Friday + 3 days is always Monday).
    if weekday == 4:
        return int(is_holiday(date_obj + datetime.timedelta(days=3)))

    return 0

# ---- Add new columns ----
# Day of week of the journey date (Monday=0 ... Sunday=6).
df['weekday'] = df['journey_date'].dt.dayofweek

def season(month):
    """Map a calendar month number to a season code.

    Returns 1 (summer) for May and June, 2 (winter) for November through
    February, and 0 for every other value.
    """
    summer_months = (5, 6)
    winter_months = (11, 12, 1, 2)

    if month in summer_months:
        return 1
    if month in winter_months:
        return 2
    return 0

# Season code comes from the journey month alone.
df['season'] = df['journey_date'].dt.month.map(season)

# Peak-day flag needs the full journey date (holiday and weekday checks).
df['is_peak_day'] = df['journey_date'].apply(check_peak)

df.head()


Unnamed: 0,booking_date,journey_date,trnno,brdpt_code,resupto_code,cls,PSGN,weekday,season,is_peak_day
0,2023-04-06,2023-08-01,22222,AGC,KYN,3A,6,1,0,0
1,2023-04-14,2023-08-01,22222,NZM,NK,3A,1,1,0,0
2,2023-05-02,2023-08-01,22222,NZM,CSMT,1A,4,1,0,0
3,2023-05-03,2023-08-01,22222,NZM,NK,2A,2,1,0,0
4,2023-05-14,2023-08-01,22222,NZM,NK,3A,1,1,0,0


In [4]:
# Preview the first rows tagged as summer (season code 1).
print(df.loc[df['season'].eq(1)].head())


       booking_date journey_date  trnno brdpt_code resupto_code  cls  PSGN  \
137726   2024-01-02   2024-05-01  22222       NZM          NK    2A      4   
137727   2024-01-02   2024-05-01  22222       BPL          CSMT  2A      2   
137728   2024-01-02   2024-05-01  22222       NZM          KYN   1A      2   
137729   2024-01-03   2024-05-01  22222       AGC          KYN   3A      4   
137730   2024-01-03   2024-05-01  22222       VGLJ         KYN   2A      1   

        weekday  season  is_peak_day  
137726        2       1            0  
137727        2       1            0  
137728        2       1            0  
137729        2       1            0  
137730        2       1            0  
