Setup and Load Cleaned Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from datetime import datetime

# Notebook-wide configuration: apply the "seaborn-v0_8" matplotlib style sheet
# and lift the cap on how many columns pandas shows when displaying a frame.
plt.style.use("seaborn-v0_8")
pd.set_option("display.max_columns", None)

# Read the preprocessed reports; `reported_datetime` is parsed as datetimes
# so the `.dt` accessor can be used on it in later cells.
df = pd.read_csv(
    "./data/cleaned_reports.csv",
    parse_dates=["reported_datetime"],
)

Quick Dataset Overview

In [None]:
# Dimensions of the frame followed by the dtype of every column
print(f"Shape of dataset: {df.shape}")
print("\nData types:", df.dtypes, sep="\n")

# Number of missing values in each column
print("\nMissing values:", df.isna().sum(), sep="\n")


Top Issue Types (Original and SG Categories)

In [None]:
# The 20 most frequent values of the original (unmapped) issue type
print("\nTop 20 Original Issue Types:")
print(df["issue_type"].value_counts().head(20))

# Frequency of every mapped SG category, most common first.
# Computed once and reused for both the printout and the bar ordering.
sg_counts = df["issue_type_sg"].value_counts()
print("\nMapped SG Issue Types:")
print(sg_counts)

# Horizontal bar chart of the SG categories, ordered by frequency
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, y="issue_type_sg", order=sg_counts.index, ax=ax)
ax.set_title("Distribution of Issues by Singapore Categories")
ax.set_xlabel("Count")
ax.set_ylabel("SG Issue Type")
fig.tight_layout()
plt.show()


Trend Over Time (All and by Category)

In [None]:
# Calendar date of each report, stored on the frame as a `date` column.
df["date"] = df["reported_datetime"].dt.date

# Daily report counts on a continuous calendar. Grouping alone only yields the
# days that have at least one report, so a line plot would silently interpolate
# across empty days; `asfreq` inserts those days with a count of 0 instead.
# Rows with a missing timestamp are dropped by the groupby.
daily_counts = (
    df.groupby(df["reported_datetime"].dt.normalize())
    .size()
    .asfreq("D", fill_value=0)
    .rename_axis("date")
)

# Plot overall trend
fig, ax = plt.subplots(figsize=(12, 4))
daily_counts.plot(ax=ax)
ax.set_title("Daily Issue Reports Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Report Count")
fig.tight_layout()
plt.show()

# Trend by SG category: month x category table of report counts.
# `fill_value=0` fills absent (month, category) pairs while keeping integer counts.
df["month"] = df["reported_datetime"].dt.to_period("M")
monthly_sg = (
    df.groupby(["month", "issue_type_sg"])
    .size()
    .unstack(fill_value=0)
)

# Plot top 5 categories. `reindex` (rather than `monthly_sg[top5]`) avoids a
# KeyError if a top category has no row with a valid timestamp and is therefore
# absent from the monthly table; such a category is drawn as a flat zero line.
top5 = df["issue_type_sg"].value_counts().nlargest(5).index
ax = monthly_sg.reindex(columns=top5, fill_value=0).plot(figsize=(12, 6))
ax.set_title("Monthly Trends of Top 5 SG Issue Categories")
ax.set_xlabel("Month")
ax.set_ylabel("Reports")
ax.legend(title="SG Category")
plt.tight_layout()
plt.show()


Spatial Distribution

In [None]:
# Plot simple scatterplot (optional, full map may be in the dashboard stage)

# Sampling parameters. The sample size is capped at the number of rows because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement). The fixed seed makes the figure reproducible
# across kernel restarts.
SPATIAL_SAMPLE_SIZE = 1000
SPATIAL_SAMPLE_SEED = 42
spatial_sample = df.sample(
    n=min(SPATIAL_SAMPLE_SIZE, len(df)),
    random_state=SPATIAL_SAMPLE_SEED,
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=spatial_sample,
    x="longitude",
    y="latitude",
    hue="source_city",
    alpha=0.5,
    ax=ax,
)
ax.set_title("Spatial Distribution of Sampled Reports")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
# Re-creating the legend discards the title seaborn set, so give it one explicitly.
ax.legend(title="Source City")
fig.tight_layout()
plt.show()


Correlation with External Features

In [None]:
# Pairwise correlation between all numeric columns (e.g., weather, AQ, POI)
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = list(numeric_frame.columns)
corr = df[num_cols].corr()

# Heatmap of the correlation matrix; the diverging palette is centred on 0
# so positive and negative correlations get opposite colours.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap="coolwarm", center=0, annot=False, ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features")
fig.tight_layout()
plt.show()


Effect of Time Factors

In [None]:
# Report counts per weekday; `order` forces all seven codes 0..6 onto the axis
weekday_codes = list(range(7))
fig, ax = plt.subplots()
sns.countplot(data=df, x="day_of_week", order=weekday_codes, ax=ax)
ax.set_title("Reports by Day of the Week (0=Monday)")
ax.set_xlabel("Day of Week")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# Report counts split by the public-holiday flag
fig, ax = plt.subplots()
sns.countplot(data=df, x="is_public_holiday", ax=ax)
ax.set_title("Reports on Public Holidays vs Non-Holidays")
ax.set_xlabel("Is Public Holiday?")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


Summary Insights

### Summary Insights

- The dataset includes reports from three U.S. cities, with most issues falling into SG categories such as "Cleanliness", "Roads & Footpaths", and "Animals & Bird".
- There are clear spikes in report counts around specific dates, possibly tied to events or weather.
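- Public holidays appear to slightly reduce or change the pattern of reporting; because there are far fewer holidays than ordinary days, this should be confirmed with per-day averages rather than raw counts.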
- Some external features like PM2.5, precipitation, and POI density show correlations worth exploring further.
- Data appears spatially clustered according to city, and there are distinct temporal and categorical patterns.

Next step: Feature engineering based on these insights.