# 3. Exploratory Analysis with the NAPS data

We will explore the NAPS data we downloaded, using the data from the Burnaby South station.

In [None]:
import numpy as np
import pandas as pd
import sys
from pathlib import Path
import matplotlib.pyplot as plt

# set project root
sys.path.insert(0, str(Path.cwd().parent))

from src.config import *

In [None]:
# fixed for this tutorial: the NAPS site ID used in the processed file names
# (presumably the Burnaby South station -- TODO confirm against the NAPS site list)
site_id = 100119


## 3.1. Choose a File to Explore

We’ve saved cleaned measurement data for multiple years and types.

In this section, you can choose any of the processed CSV files and explore its contents—for example, to see time series patterns, detect missing values, or compare analytes.

The code below lists all available files in the `data/processed/` directory.


In [None]:
# Collect every processed CSV; sorting keeps the listing order stable.
csv_files = sorted(PROCESSED_DATA_DIR.glob("*.csv"))

# Print a header followed by one "[position] file name" line per file,
# where position is the file's index in csv_files.
listing = [f"[{idx}] {csv_file.name}" for idx, csv_file in enumerate(csv_files)]
print("\n".join(["Available files:", *listing]))
    

In [None]:
# Year and file-type key of the processed file to load; together with site_id
# they select the file "<year>_<site_id>_<key>.csv" in the processed directory.
year = 2023
key = 'nt'


In [None]:
# Processed files are named "<year>_<site_id>_<key>.csv".
csv_name = f"{year}_{site_id}_{key}.csv"
csv_path = PROCESSED_DATA_DIR / csv_name

# Use the "sampling_date" column as the index; parse_dates=True asks pandas
# to parse that index as dates.
df = pd.read_csv(csv_path, index_col="sampling_date", parse_dates=True)

# Preview the first three rows.
display(df.head(3))


## 3.2. Example: Plot a Time Series of Raw Data

To help you get started with exploring the data, here’s a sample plot showing how **Selenium (Se)** concentrations and their detection limits (Se-MDL) vary over time.

This example:

- Replaces `-999` with `NaN` so invalid values aren’t plotted
- Separates routine samples (`R`) and travel blanks (`TB`) using the `sampling_type` column
- Plots Se and Se-MDL values with different markers for each sampling type

You can adapt this code to explore other analytes, compare years, or investigate missing data patterns.


In [None]:
analyte = "Se"
analyte_mdl = analyte + '-MDL'

# -999 marks invalid values; converting it to NaN keeps it off the plot.
df_plot = df.replace(-999, np.nan)

# Boolean row masks: routine samples ('R') and travel blanks ('TB').
routine = df_plot['sampling_type'] == 'R'
blank   = df_plot['sampling_type'] == 'TB'

# Row subsets for each sampling type.
routine_rows = df_plot.loc[routine]
blank_rows = df_plot.loc[blank]

fig, ax = plt.subplots(figsize=(10, 5))

# Routine samples: filled circles for the analyte, hollow circles for its MDL.
ax.plot(routine_rows.index, routine_rows[analyte],
        'o', label=f'{analyte} (Routine)')
ax.plot(routine_rows.index, routine_rows[analyte_mdl],
        'o', markerfacecolor='none', label=f'{analyte}-MDL (Routine)')

# Travel blanks: filled squares for the analyte, hollow squares for its MDL.
ax.plot(blank_rows.index, blank_rows[analyte],
        's', label=f'{analyte} (Travel Blank)')
ax.plot(blank_rows.index, blank_rows[analyte_mdl],
        's', markerfacecolor='none', label=f'{analyte}-MDL (Travel Blank)')

# Axis labels, title and legend.
ax.set_xlabel('Date')
ax.set_ylabel(f'{analyte} Concentration (ng/m$^3$)')
ax.set_title(f'{analyte} and Detection Limit by Sampling Type')
ax.legend()
fig.tight_layout()
plt.show()


## 3.3. Example: Time Series with Rolling Mean

Let’s plot a time series of an analyte (e.g., Selenium) along with a **centered rolling mean**, which helps reveal broader trends by smoothing out daily fluctuations.

In this example:

- We remove invalid values (`-999`)
- Plot both the raw concentration and a centered rolling average (the window is set by `window_size`, 40 days in the code below)
- Use only routine samples (`sampling_type == 'R'`)


In [None]:
analyte = "Se"
window_size = 40

# -999 marks invalid values; convert it to NaN so it is ignored below.
df_plot = df.replace(-999, np.nan)

# Keep routine samples only.
routine = df_plot["sampling_type"] == "R"
df_routine = df_plot.loc[routine]

# Put the analyte on a regular daily grid (days without a sample become NaN),
# so that the integer rolling window below counts calendar days.
df_daily = df_routine[[analyte]].resample("D").mean()
daily_values = df_daily[analyte]

# Centered rolling mean over `window_size` days; min_periods=1 yields a value
# wherever the window contains at least one valid observation.
rolling_window = daily_values.rolling(window=window_size, center=True, min_periods=1)
rolling_mean = rolling_window.mean()

# Daily values as faint markers, rolling mean as a solid line.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df_daily.index, daily_values, 'o', alpha=0.5, label=f'{analyte} (Daily Mean)')
ax.plot(df_daily.index, rolling_mean, '-', linewidth=2, label=f'{analyte} ({window_size}-day Rolling Avg)')

ax.set_xlabel('Date')
ax.set_ylabel(f'{analyte} Concentration (ng/m$^3$)')
ax.set_title(f'{analyte}: {window_size}-Day Centered Rolling Mean')
ax.legend()
fig.tight_layout()
plt.show()
