# Exploratory Data Analysis
This notebook analyzes correlations between sensor readings and PM2.5 pollution trends to validate feature selection for the anomaly detection model.

In [None]:
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Connect to the database created by ingest.py
db_path = '../data/pollution.db'

# sqlite3.connect() silently creates an empty database file when the path
# does not exist, which would only surface later as a confusing
# "no such table" error. Fail early with a clear message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(
        f"Database not found at {db_path!r}; run ingest.py first."
    )

conn = sqlite3.connect(db_path)
try:
    # Pull the entire sensors table into a DataFrame.
    df = pd.read_sql("SELECT * FROM sensors", conn)
finally:
    # The data now lives in `df`, so release the database handle even if
    # the query failed. (Using the connection as a context manager would
    # only manage the transaction, not close the connection.)
    conn.close()

# Parse the timestamp column and use it as the index so that time-based
# operations (e.g. resample) work on the frame.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index('timestamp', inplace=True)

print("Data loaded successfully!")
print(df.head())

In [None]:
plt.figure(figsize=(10, 8))
# Calculate correlation matrix.
# Restrict to numeric columns first: the table is loaded with SELECT *, and
# since pandas 2.0 DataFrame.corr() raises on non-numeric columns instead of
# silently dropping them. select_dtypes works on every pandas version.
corr = df.select_dtypes(include='number').corr()
# Plot heatmap, annotating each cell with the coefficient to two decimals
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Environmental Factors')
# Save the image so README can see it (must happen before plt.show(),
# which may clear the current figure)
plt.savefig('../correlation_heatmap.png')
plt.show()

In [None]:
# Collapse the PM2.5 series into weekly means so the trend line stays legible
pm25_series = df['PM2.5']
weekly_data = pm25_series.resample('W').mean()

# Build the figure through the object-oriented API: one figure, one axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(weekly_data, label='Weekly Avg PM2.5', color='orange')

# Labelling and light grid for readability
ax.set_title('PM2.5 Pollution Trends (2013-2017)')
ax.set_ylabel('Concentration (ug/m3)')
ax.legend()
ax.grid(True, alpha=0.3)

# Write the image to the project root, then display it
fig.savefig('../timeseries.png')
plt.show()