In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
df=pd.read_csv('hospital_data_cleaned.csv')

  df=pd.read_csv('hospital_data_cleaned.csv')


In [39]:
import pandas as pd

# ==============================
# STEP 1: Load Data
# ==============================

df = pd.read_csv('hospital_data_cleaned.csv')
df['ResidentDate'] = pd.to_datetime(df['ResidentDate'])

# ==============================
# STEP 2: Basic Feature Engineering
# ==============================

# Time-based features
df['ResidentDate_hour'] = df['ResidentDate'].dt.hour
df['ResidentDate_weekday'] = df['ResidentDate'].dt.weekday
df['ResidentDate_month'] = df['ResidentDate'].dt.month
df['ResidentDate_year'] = df['ResidentDate'].dt.year
df['is_weekend'] = df['ResidentDate_weekday'].isin([5, 6]).astype(int)
df['season'] = df['ResidentDate_month'] % 12 // 3 + 1

# Length of stay (if not already present)

# Weather-based features
df['temp_range'] = df['temperature_max'] - df['temperature_min']
df['is_rain'] = (df['precipitation'] > 0).astype(int)

# Event and external impact features
df['event_flag'] = (df['event_type'] != "No Event").astype(int)
df['disaster_flag'] = (df['disaster'] != "None").astype(int)
df['pandemic_flag'] = (df['pandemic'] != "None").astype(int)

# Demographics
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 55, 75, 120], 
                                    labels=['Child', 'Young Adult', 'Adult', 'Senior', 'Elderly'])
df['gender_encoded'] = df['gender'].map({'Male': 1, 'Female': 0})
df['marital_Status'] = df['marital_Status'].fillna(0)

# Discharge and referral info
df['discharged'] = (df['StatusOnDischarge'].notnull()).astype(int)

# Referral type (one-hot encoding)
df = pd.get_dummies(df, columns=['KindRef'], prefix='ref', drop_first=True)

# ==============================
# STEP 3: Create Lag/Rolling Features
# ==============================

# Aggregate daily ER admissions using DischargeFromED
daily_admissions = df.groupby('ResidentDate')['DischargeFromED'].sum().reset_index()
daily_admissions = daily_admissions.sort_values('ResidentDate')

# Lag and rolling statistics
daily_admissions['ER_Admissions_Lag1'] = daily_admissions['DischargeFromED'].shift(1)
daily_admissions['ER_Admissions_Lag7'] = daily_admissions['DischargeFromED'].shift(7)
daily_admissions['ER_Admissions_MA7'] = daily_admissions['DischargeFromED'].rolling(window=7).mean()
daily_admissions['ER_Admissions_MA14'] = daily_admissions['DischargeFromED'].rolling(window=14).mean()
daily_admissions['ER_Admissions_EMA'] = daily_admissions['DischargeFromED'].ewm(span=7, adjust=False).mean()
daily_admissions['ER_Admissions_Rolling_Std'] = daily_admissions['DischargeFromED'].rolling(window=7).std()
daily_admissions['ER_Admissions_Rolling_Min'] = daily_admissions['DischargeFromED'].rolling(window=7).min()
daily_admissions['ER_Admissions_Rolling_Max'] = daily_admissions['DischargeFromED'].rolling(window=7).max()

# Forward fill for NaNs due to shifting and rolling
daily_admissions.fillna(method='ffill', inplace=True)

# ==============================
# STEP 4: Merge Time Series Features into Main DataFrame
# ==============================

df = df.merge(
    daily_admissions[['ResidentDate', 
                      'ER_Admissions_Lag1', 
                      'ER_Admissions_Lag7', 
                      'ER_Admissions_MA7', 
                      'ER_Admissions_MA14', 
                      'ER_Admissions_EMA', 
                      'ER_Admissions_Rolling_Std',
                      'ER_Admissions_Rolling_Min', 
                      'ER_Admissions_Rolling_Max']],
    on='ResidentDate',
    how='left'
)

# Backfill any remaining NaNs
df.fillna(method='bfill', inplace=True)

# ==============================
# STEP 5: Save to Single Output File
# ==============================


columns_to_keep = [
    "DischargeFromED",
    "ResidentDate", "ResidentDate_year", "ResidentDate_month", "ResidentDate_day",
    "ResidentDate_weekday",
    "temperature_max", "temperature_min", "precipitation", "temp_range", "season",
    "event_type", "pandemic", "disaster",
    "is_weekend", "is_rain",
    "ER_Admissions_Lag1", "ER_Admissions_Lag7",
    "ER_Admissions_MA7", "ER_Admissions_MA14", "ER_Admissions_EMA",
    "ER_Admissions_Rolling_Std", "ER_Admissions_Rolling_Min", "ER_Admissions_Rolling_Max"
]



df_time_series = df[columns_to_keep]

# (Optional) Save the cleaned dataset to a new CSV
df_time_series.to_csv("feature_engineering_data.csv", index=False)


  df = pd.read_csv('hospital_data_cleaned.csv')
  daily_admissions.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)


In [40]:
print(df_time_series.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143280 entries, 0 to 143279
Data columns (total 24 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   DischargeFromED            143280 non-null  int64         
 1   ResidentDate               143280 non-null  datetime64[ns]
 2   ResidentDate_year          143280 non-null  int32         
 3   ResidentDate_month         143280 non-null  int32         
 4   ResidentDate_day           143280 non-null  int64         
 5   ResidentDate_weekday       143280 non-null  int32         
 6   temperature_max            143280 non-null  float64       
 7   temperature_min            143280 non-null  float64       
 8   precipitation              143280 non-null  float64       
 9   temp_range                 143280 non-null  float64       
 10  season                     143280 non-null  int32         
 11  event_type                 143280 non-null  object  

In [41]:
df_time_series.nunique()

DischargeFromED                 2
ResidentDate                 1819
ResidentDate_year               6
ResidentDate_month             12
ResidentDate_day               31
ResidentDate_weekday            7
temperature_max               147
temperature_min               118
precipitation                 222
temp_range                    201
season                          4
event_type                      3
pandemic                        1
disaster                        3
is_weekend                      2
is_rain                         2
ER_Admissions_Lag1             99
ER_Admissions_Lag7             99
ER_Admissions_MA7             472
ER_Admissions_MA14            702
ER_Admissions_EMA            1819
ER_Admissions_Rolling_Std    1736
ER_Admissions_Rolling_Min      72
ER_Admissions_Rolling_Max      88
dtype: int64