# Exploratory Data Analysis

Analysis of Boston 311 Service Request data (2019-2024)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Plot defaults applied to every figure created below.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the raw 311 extract from a path relative to the working directory.
RAW_DATA_PATH = '../data/raw/311_2019_2024.parquet'
df = pd.read_parquet(RAW_DATA_PATH)

n_rows, n_cols = df.shape
print(f"Dataset shape: ({n_rows}, {n_cols})")
df.head()

## Data Overview

In [None]:
# Column dtypes as loaded from the raw file.
print("Data Types:")
print(df.dtypes)

# deep=True also measures the Python objects behind object-dtype columns,
# which a shallow measurement leaves out.
mem_mb = df.memory_usage(deep=True) / 1024**2

# Print the per-column figures as a clean table, then a single labelled total;
# appending "MB" to the raw Series repr put the unit after the dtype footer.
print("\nMemory Usage (MB):")
print(mem_mb.round(2).to_string())
print(f"Total: {mem_mb.sum():.2f} MB")

In [None]:
# Count nulls per column, then express each count as a percentage of all rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Side-by-side table, worst columns first.
missing_df = pd.concat(
    [missing, missing_pct],
    axis=1,
    keys=['Missing Count', 'Percentage'],
)
missing_df = missing_df.sort_values(by='Missing Count', ascending=False)

# Only show the columns that actually contain nulls.
has_missing = missing_df['Missing Count'].gt(0)
print(missing_df.loc[has_missing])

A large share of rows are missing a neighborhood value. It is best to filter those rows out during preprocessing.

## Temporal Analysis

In [None]:
# Parse the opening timestamp once and derive every calendar feature from it.
opened = pd.to_datetime(df['open_dt'])
df['open_dt'] = opened

calendar_features = {
    'date': opened.dt.date,
    # NOTE(review): 'month' holds the same monthly period as 'year_month' —
    # confirm whether a plain month number was intended here.
    'month': opened.dt.to_period('M'),
    'year_month': opened.dt.to_period('M'),
    'day_of_week': opened.dt.day_name(),
    'hour': opened.dt.hour,
}
for column, values in calendar_features.items():
    df[column] = values

print(f"Date range: {opened.min()} to {opened.max()}")

In [None]:
# Derive the year from the opening timestamp instead of reading df['year']:
# no earlier cell creates a 'year' column, so that lookup raises KeyError on a
# fresh kernel unless the raw extract happens to ship one.
# pd.to_datetime is a no-op when 'open_dt' has already been parsed.
request_years = pd.to_datetime(df['open_dt']).dt.year.rename('year')
# Drop unparseable timestamps and restore integer years so the tick labels
# read "2019" rather than "2019.0".
request_years = request_years.dropna().astype(int)
year_counts = request_years.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
year_counts.plot(kind='bar', ax=ax, color='steelblue')
ax.set_title('311 Service Requests by Year', fontsize=14, fontweight='bold')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Requests')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

print("\nRequests per year:")
print(year_counts)

In [None]:
# Number of requests opened in each calendar month.
monthly_counts = df.groupby('year_month').size()

fig, ax = plt.subplots(figsize=(14, 5))
monthly_counts.plot.line(ax=ax, linewidth=2, color='darkblue')
ax.set_title(
    'Monthly 311 Request Trends (2019-2024)',
    fontdict={'fontsize': 14, 'fontweight': 'bold'},
)
ax.set(xlabel='Month', ylabel='Number of Requests')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Calendar order for the x-axis; value_counts() alone would order by frequency.
dow_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
    'Saturday', 'Sunday',
]
dow_tally = df['day_of_week'].value_counts()
dow_counts = dow_tally.reindex(dow_order)

fig, ax = plt.subplots(figsize=(10, 5))
dow_counts.plot.bar(ax=ax, color='teal')
ax.set_title(
    '311 Requests by Day of Week',
    fontdict={'fontsize': 14, 'fontweight': 'bold'},
)
ax.set(xlabel='Day of Week', ylabel='Number of Requests')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Requests per hour of day, ordered by hour rather than by frequency.
hourly_tally = df['hour'].value_counts()
hourly_counts = hourly_tally.sort_index()

fig, ax = plt.subplots(figsize=(12, 5))
hourly_counts.plot.bar(ax=ax, color='coral')
ax.set_title(
    '311 Requests by Hour of Day',
    fontdict={'fontsize': 14, 'fontweight': 'bold'},
)
ax.set(xlabel='Hour', ylabel='Number of Requests')
fig.tight_layout()
plt.show()

## Request Type Analysis

In [None]:
# The 15 most frequent request types (value_counts sorts descending).
type_tally = df['type'].value_counts()
top_types = type_tally.iloc[:15]

fig, ax = plt.subplots(figsize=(12, 6))
top_types.plot.barh(ax=ax, color='steelblue')
ax.set_title('Top 15 Request Types', fontdict={'fontsize': 14, 'fontweight': 'bold'})
ax.set(xlabel='Number of Requests')
# Horizontal bars draw bottom-up; flip so the most frequent type is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print("\nTop 15 request types:")
print(top_types)

In [None]:
# Cardinality of the 'subject' column and its ten most frequent values.
subject_col = df['subject']
print(f"Unique subjects: {subject_col.nunique()}")
print("\nTop 10 subjects:")
print(subject_col.value_counts().iloc[:10])

In [None]:
# Cardinality of the 'reason' column and its ten most frequent values.
reason_col = df['reason']
print(f"Unique reasons: {reason_col.nunique()}")
print("\nTop 10 reasons:")
print(reason_col.value_counts().iloc[:10])

## Geographic Analysis

In [None]:
# Neighborhood cardinality, then the 20 neighborhoods with the most requests.
neighborhood_col = df['neighborhood']
print(f"Unique neighborhoods: {neighborhood_col.nunique()}")
top_neighborhoods = neighborhood_col.value_counts().iloc[:20]

fig, ax = plt.subplots(figsize=(12, 7))
top_neighborhoods.plot.barh(ax=ax, color='darkgreen')
ax.set_title(
    'Top 20 Neighborhoods by Request Volume',
    fontdict={'fontsize': 14, 'fontweight': 'bold'},
)
ax.set(xlabel='Number of Requests')
# Horizontal bars draw bottom-up; flip so the busiest neighborhood is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print("\nTop 20 neighborhoods:")
print(top_neighborhoods)

In [None]:
# Coerce first, filter second. A non-numeric coordinate string is non-null
# before coercion, so filtering on notna() ahead of pd.to_numeric would count
# it as valid and then leave a NaN behind in df_with_coords.
latitude = pd.to_numeric(df['latitude'], errors='coerce')
longitude = pd.to_numeric(df['longitude'], errors='coerce')
has_coords = latitude.notna() & longitude.notna()

# .assign returns a new frame, so the raw df keeps its original columns.
df_with_coords = df.loc[has_coords].assign(
    latitude=latitude[has_coords],
    longitude=longitude[has_coords],
)

# Guard the share against an empty dataset instead of dividing by zero.
coords_pct = len(df_with_coords) / len(df) * 100 if len(df) else 0.0

print(f"Records with valid coordinates: {len(df_with_coords):,}")
print(f"Percentage with coordinates: {coords_pct:.2f}%")
print(f"\nLatitude range: {df_with_coords['latitude'].min():.4f} to {df_with_coords['latitude'].max():.4f}")
print(f"Longitude range: {df_with_coords['longitude'].min():.4f} to {df_with_coords['longitude'].max():.4f}")

In [None]:
# Plot at most 10,000 points; the fixed seed makes the sample repeatable.
sample_size = min(10000, len(df_with_coords))
df_sample = df_with_coords.sample(n=sample_size, random_state=42)

scatter_title = f'Geographic Distribution of 311 Requests\n(Sample of {sample_size:,} requests)'

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(
    x=df_sample['longitude'],
    y=df_sample['latitude'],
    s=1,
    color='red',
    alpha=0.1,
)
ax.set_title(scatter_title, fontdict={'fontsize': 14, 'fontweight': 'bold'})
ax.set(xlabel='Longitude', ylabel='Latitude')
# Same scale on both axes so one degree spans the same distance on each.
ax.set_aspect('equal')
fig.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Compute each statistic once so the printed lines cannot disagree.
first_request = df['open_dt'].min()
last_request = df['open_dt'].max()
span_days = (last_request - first_request).days

# A span shorter than one day has .days == 0, and an empty frame yields NaN;
# average over at least one day so neither case divides by zero.
effective_days = span_days if span_days > 0 else 1
avg_per_day = len(df) / effective_days

# Guard the share against an empty dataset instead of dividing by zero.
coords_pct = len(df_with_coords) / len(df) * 100 if len(df) else 0.0

rule = "=" * 60
print(rule)
print("DATASET SUMMARY")
print(rule)
print(f"Total requests: {len(df):,}")
print(f"Date range: {first_request} to {last_request}")
print(f"Number of days: {span_days}")
print(f"Average requests per day: {avg_per_day:,.1f}")
print(f"\nUnique request types: {df['type'].nunique()}")
print(f"Unique subjects: {df['subject'].nunique()}")
print(f"Unique reasons: {df['reason'].nunique()}")
print(f"Unique neighborhoods: {df['neighborhood'].nunique()}")
print(f"\nRecords with coordinates: {len(df_with_coords):,} ({coords_pct:.2f}%)")
print(rule)