# 01 - Data Exploration

This notebook explores the NYC rat sighting data and related datasets.

## Contents
1. Data Loading
2. Basic Statistics
3. Temporal Patterns
4. Spatial Distribution
5. Feature Analysis

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the project's `src` package importable by prepending the parent of the
# current working directory to the module search path.
# NOTE(review): assumes the kernel's working directory sits directly under the
# project root (e.g. a notebooks/ folder) — confirm before running elsewhere.
sys.path.insert(0, str(Path.cwd().parent))

# Has to come after the sys.path change above so that `src` can be resolved.
from src import config

# Plot styling: matplotlib style sheet plus seaborn's 'husl' colour palette.
# NOTE(review): the 'seaborn-v0_8-*' style names require matplotlib >= 3.6.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Data Loading

In [None]:
# Read the processed master dataset; the 'date' column is parsed as datetimes
master_path = config.PROCESSED_DATA_DIR / 'master_dataset.csv'
df = pd.read_csv(master_path, parse_dates=['date'])

# Report the (rows, columns) shape, then preview the first rows
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Structural overview of the frame: column names, dtypes, non-null counts
# and memory usage (printed by DataFrame.info()).
df.info()

In [None]:
# Summary statistics via DataFrame.describe() with its default arguments.
# For a frame that mixes dtypes, object/text columns are left out unless
# `include` is passed explicitly.
df.describe()

## 2. Basic Statistics

In [None]:
# Headline figures: time span, ZIP coverage and complaint volume
date_col = df['date']
complaint_col = df['complaint_count']

print(f"Date range: {date_col.min()} to {date_col.max()}")
print(f"Number of ZIP codes: {df['zip_code'].nunique()}")
print(f"Total complaints: {complaint_col.sum():,}")
print(f"Average complaints per month per ZIP: {complaint_col.mean():.2f}")

In [None]:
# Borough-level complaint totals, largest first.
# The whole cell is skipped when the dataset has no 'borough' column.
if 'borough' in df.columns:
    borough_totals = df.groupby('borough')['complaint_count'].sum()
    borough_counts = borough_totals.sort_values(ascending=False)
    bar_colors = sns.color_palette('husl', len(borough_counts))

    fig, ax = plt.subplots(figsize=(10, 5))
    borough_counts.plot.bar(ax=ax, color=bar_colors)
    ax.set_title('Total Rat Complaints by Borough', fontsize=14)
    ax.set(xlabel='Borough', ylabel='Total Complaints')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

## 3. Temporal Patterns

In [None]:
# Citywide trend: complaint counts summed over all rows sharing a 'date'
complaints_by_date = df.groupby('date')['complaint_count']
monthly = complaints_by_date.sum().reset_index()

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(monthly['date'], monthly['complaint_count'], linewidth=1.5)
ax.set_title('Monthly Rat Complaints Over Time', fontsize=14)
ax.set(xlabel='Date', ylabel='Complaints')
fig.tight_layout()
plt.show()

In [None]:
# Seasonal pattern: mean complaint count for each calendar month
df['month'] = df['date'].dt.month

# Align the result to all 12 calendar months. The tick labels below are a
# fixed list of 12 names, so a dataset that lacks some calendar month would
# otherwise hand set_xticklabels fewer ticks than labels (an error in current
# matplotlib) or attach the wrong name to a bar. Months without any rows
# become NaN and simply show as an empty slot.
seasonal = df.groupby('month')['complaint_count'].mean().reindex(range(1, 13))

month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

fig, ax = plt.subplots(figsize=(10, 5))
# One colour per month slot, sampled from the YlOrRd colormap
seasonal.plot(kind='bar', ax=ax, color=plt.cm.YlOrRd(np.linspace(0.3, 0.9, len(month_names))))
ax.set_title('Average Complaints by Month (Seasonal Pattern)', fontsize=14)
ax.set_xlabel('Month')
ax.set_ylabel('Average Complaints')
ax.set_xticklabels(month_names, rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Annual totals: complaint counts summed per calendar year of 'date'
df['year'] = df['date'].dt.year
yearly = df.groupby('year')['complaint_count'].sum()

fig, ax = plt.subplots(figsize=(10, 5))
yearly.plot.bar(ax=ax)
ax.set_title('Annual Rat Complaints', fontsize=14)
ax.set(xlabel='Year', ylabel='Total Complaints')
fig.tight_layout()
plt.show()

## 4. Spatial Distribution

In [None]:
# Top ZIP codes by complaints: the 20 ZIP codes with the largest summed
# complaint counts, ordered from largest to smallest by nlargest().
top_zips = df.groupby('zip_code')['complaint_count'].sum().nlargest(20)

fig, ax = plt.subplots(figsize=(12, 6))
top_zips.plot(kind='barh', ax=ax)
# A horizontal bar chart draws the first row at the bottom; flip the axis so
# the highest-complaint ZIP code is at the top and the chart reads as a ranking.
ax.invert_yaxis()
ax.set_title('Top 20 ZIP Codes by Rat Complaints', fontsize=14)
ax.set_xlabel('Total Complaints')
ax.set_ylabel('ZIP Code')
plt.tight_layout()
plt.show()

In [None]:
# Histogram of the per-row complaint counts, with the overall mean marked
counts = df['complaint_count']
mean_count = counts.mean()

fig, ax = plt.subplots(figsize=(10, 5))
counts.hist(bins=50, ax=ax, edgecolor='black')
ax.set_title('Distribution of Monthly Complaints per ZIP', fontsize=14)
ax.set(xlabel='Complaints', ylabel='Frequency')
# Dashed red reference line at the mean; its value appears in the legend
ax.axvline(mean_count, color='red', linestyle='--', label=f'Mean: {mean_count:.1f}')
ax.legend()
fig.tight_layout()
plt.show()

## 5. Feature Analysis

In [None]:
# Pairwise correlations between the complaint count and candidate features.
# Only the candidate columns actually present in the frame are used.
candidate_cols = [
    'complaint_count',
    'restaurant_violations_nearby',
    'building_age_mean',
    'old_building_pct',
]
numeric_cols = [col for col in candidate_cols if col in df.columns]

# A correlation matrix needs at least two columns to be informative
if len(numeric_cols) >= 2:
    corr_matrix = df[numeric_cols].corr()

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Feature Correlations', fontsize=14)
    fig.tight_layout()
    plt.show()

In [None]:
# Scatter of nearby restaurant violations against complaint counts.
# Skipped entirely when the violations column is not in the dataset.
if 'restaurant_violations_nearby' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(
        x=df['restaurant_violations_nearby'],
        y=df['complaint_count'],
        alpha=0.3,
    )
    ax.set_title('Restaurant Violations vs Rat Complaints', fontsize=14)
    ax.set(xlabel='Restaurant Violations Nearby', ylabel='Rat Complaints')
    fig.tight_layout()
    plt.show()

## Key Findings

1. **Temporal Patterns**: There is a clear seasonal pattern, with complaints peaking in the summer months (June–August); see Section 3.
2. **Spatial Distribution**: Complaints are concentrated in specific neighborhoods; see the top-20 ZIP code chart in Section 4.
3. **Correlations**: Restaurant violations and building age are both correlated with rat complaints; see the correlation heatmap in Section 5.