# 3.0 **Installation & Setup**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic

# Load cleaned dataset from previous step
df = pd.read_csv("./data/cleaned_reports.csv", parse_dates=["reported_datetime", "closed_datetime"])

**Encode Time Features**

In [2]:
# Extract time encodings
df["hour"] = df["reported_datetime"].dt.hour
df["day_of_week"] = df["reported_datetime"].dt.dayofweek
df["month"] = df["reported_datetime"].dt.month

# Cyclical encoding for models like XGBoost or NN
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
df["dow_sin"] = np.sin(2 * np.pi * df["day_of_week"] / 7)
df["dow_cos"] = np.cos(2 * np.pi * df["day_of_week"] / 7)


**Engineer Time-To-Resolve**

In [3]:
if "resolved_datetime" in df.columns:
    df["resolved_datetime"] = pd.to_datetime(df["resolved_datetime"], errors="coerce")
    df["time_to_resolve_hours"] = (df["resolved_datetime"] - df["reported_datetime"]).dt.total_seconds() / 3600


**POI Feature Aggregation – Density & Proximity**

In [26]:
# Check sample of POI columns
poi_cols = [col for col in df.columns if "dist_" in col or "count_" in col]

# Normalize POI density features
scaler = StandardScaler()
df[poi_cols] = scaler.fit_transform(df[poi_cols])


**Weather & Air Quality Feature Engineering**

In [11]:
# Identify environmental columns
env_cols = [
    "temp_c", "humidity", "wind_kph", "precip_mm",
    "pm10", "pm2_5", "co", "no2", "o3", "so2"
]

# Fill missing values with mean (or use interpolation depending on usage)
df[env_cols] = df[env_cols].fillna(df[env_cols].mean())

# Optional: Standardize
df[env_cols] = scaler.fit_transform(df[env_cols])


**Socio-Demographics Enrichment**

In [13]:
# Normalize socio-demographic columns
demo_cols = ["median_income", "total_population", "average_age"]  # Adjust names accordingly
df[demo_cols] = scaler.fit_transform(df[demo_cols])

In [15]:
df.columns

Index(['id', 'status', 'reported_datetime', 'closed_datetime', 'issue_type',
       'latitude', 'longitude', 'city', 'country', 'temp_c', 'humidity',
       'wind_kph', 'precip_mm', 'pm10', 'pm2_5', 'co', 'no2', 'o3', 'so2',
       'commercial_count_within_200m', 'dist_to_nearest_commericial',
       'recreation_count_within_200m', 'dist_to_nearest_recreation',
       'facilities_count_within_200m', 'dist_to_nearest_facility',
       'transit_count_within_200m', 'dist_to_nearest_transit', 'boundary_name',
       'area_km2', 'median_income', 'total_population', 'average_age',
       'is_public_holiday', 'issue_type_sg', 'hour', 'day_of_week', 'month',
       'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos'],
      dtype='object')

**Final Feature Selection**

In [35]:
# Step 1: Add date column
df["date"] = df["reported_datetime"].dt.date

# Step 2: Define grouping key
group_cols = ["date", "boundary_name"]

# Step 3: Aggregate feature columns (mean per boundary per day)
feature_cols = (
    ["day_of_week", "dow_sin", "dow_cos", "month", "is_public_holiday"]
    + env_cols + poi_cols + demo_cols
)
df_features = df.groupby(group_cols)[feature_cols].mean().reset_index()

# Step 4: Count issue types (pivot)
df_targets = df.groupby(group_cols + ["issue_type_sg"]).size().unstack(fill_value=0).reset_index()

# Step 5: Merge features + targets on date + boundary
df_merged = pd.merge(df_features, df_targets, on=group_cols)

# Step 6: (Optional) Rename for clarity
df_merged = df_merged.rename(columns={"boundary_name": "boundary"})


In [36]:
issue_type_columns = df["issue_type_sg"].unique().tolist()
issue_type_columns.sort()

features_X = df_merged.drop(columns=issue_type_columns)  # All except issue type counts
labels_y = df_merged[issue_type_columns]                 # Only issue count columns

In [None]:
features_X.to_csv("./data/features_X.csv", index=False)
labels_y.to_csv("./data/labels_y.csv", index=False)

: 

Summary 

### Feature Engineering Summary

- **Time-based features**: hour, day of week, cyclical time, public holiday
- **Environmental**: temperature, humidity, air pollutants, wind, precipitation
- **POIs**: normalized density/proximity by type
- **Socio-demographic**: income, age, population
- **Optional**: time to resolve (if resolution timestamp exists)

Next step: train and evaluate forecasting model using `features_X.csv` and `labels_y.csv`.