# NCEI comparison with PAG-ASA data

This notebook validates `NCEI` data against `PAG-ASA` station data for the **temperature variables** (`tmean`, `tmax`, `tmin`) and also takes a first look at **wind speed**.

In [1]:
%load_ext autoreload
%autoreload 2

## Set-up and Imports

In [2]:
import geopandas as gpd
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
import folium

import json
from loguru import logger
from pathlib import Path

from geowrangler.datasets import geofabrik

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import sys

sys.path.append("../../../")  # include parent directory
from src.vector_utils import *



In [3]:
# data directories
# All paths are relative, so they resolve against the directory the notebook is run from.
DATA_DIR = Path("../../../data/")
SRC_DIR = Path("../../../src/")
ADMIN_FPATH = DATA_DIR / "01-admin-bounds"  # administrative boundary files (read below)
RAW_FPATH = DATA_DIR / "02-raw"  # raw inputs: "ncei" and "pagasa" sub-folders are read below
PROCESSED_FPATH = DATA_DIR / "03-processed"
OUTPUT_FPATH = DATA_DIR / "04-output"
GIS_FPATH = DATA_DIR / "05-gis"

### Load Admin Bounds 

In [4]:
# Load the administrative boundaries for the 12 cities
admin_bounds_gdf = gpd.read_file(ADMIN_FPATH / "renamed_target_admin_bounds.gpkg")

### Load NCEI

In [5]:
ncei_tmean_df = pd.read_csv(RAW_FPATH / "ncei" / "NCEI_tmean_all_stns.csv")
ncei_tmean_df = ncei_tmean_df.drop(columns=["Unnamed: 0"])
ncei_tmean_df.head(2)

Unnamed: 0,DATE,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
0,2002-01-01,,,27.4,26.8,,,,25.9,,,24.8,
1,2002-01-02,,,27.9,,,,,26.1,,25.8,,


In [6]:
ncei_tmin_df = pd.read_csv(RAW_FPATH / "ncei" / "NCEI_tmin_all_stns.csv")
ncei_tmin_df = ncei_tmin_df.drop(columns=["Unnamed: 0"])
ncei_tmin_df.head(2)

Unnamed: 0,DATE,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
0,2002-01-01,,,24.0,23.0,,,,23.0,,,24.0,
1,2002-01-02,,,24.0,,,,,23.0,,23.0,,


In [7]:
ncei_tmax_df = pd.read_csv(RAW_FPATH / "ncei" / "NCEI_tmax_all_stns.csv")
ncei_tmax_df = ncei_tmax_df.drop(columns=["Unnamed: 0"])
ncei_tmax_df.head(2)

Unnamed: 0,DATE,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
0,2002-01-01,,,31.0,30.1,,,,29.0,,,26.2,
1,2002-01-02,,,31.8,,,,,29.0,,30.3,,


In [8]:
ncei_wind_df = pd.read_csv(RAW_FPATH / "ncei" / "NCEI_wind_speed_all_stns.csv")
ncei_wind_df = ncei_wind_df.drop(columns=["Unnamed: 0"])
ncei_wind_df.head(2)

Unnamed: 0,DATE,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
0,2002-01-01,,,2.5,2.7,,,,2.6,,,0.8,
1,2002-01-02,,,3.2,,,,,2.2,,1.2,,


### Load PAG-ASA Station Data

In [9]:
# PAG-ASA station names. Each name is used verbatim to build the file name
# "<station> Daily Data.csv" when loading, and its first whitespace-separated
# word is used as a substring to find the matching NCEI column
# (e.g. "Davao City" -> "Davao Airport", "Mactan" -> "Mactan Airport").
stations_list = [
    "Cabanatuan",
    "Dagupan",
    "Davao City",
    "Dumangas",
    "Legazpi",
    "Lumbia-El Salvador",
    "Mactan",
    "NAIA",
    "Port Area",
    "Science Garden",
    "Tacloban",
    "Zamboanga",
]


def load_pagasa_station_data(stations):
    """Read the PAG-ASA daily CSV of every requested station.

    Parameters
    ----------
    stations : iterable of str
        Station names. Each one must have a file named
        ``"<station> Daily Data.csv"`` under ``RAW_FPATH / "pagasa"``.

    Returns
    -------
    dict
        Maps each station name to the DataFrame read from its CSV,
        exactly as returned by ``pd.read_csv``.
    """
    pagasa_dir = RAW_FPATH / "pagasa"
    return {
        name: pd.read_csv(pagasa_dir / f"{name} Daily Data.csv")
        for name in stations
    }

In [10]:
PAGASA_DFS = load_pagasa_station_data(stations_list)

In [11]:
PAGASA_DFS["Tacloban"]

Unnamed: 0,YEAR,MONTH,DAY,RAINFALL,TMAX,TMIN,TMEAN,RH,WIND_SPEED,WIND_DIRECTION
0,2008.0,7.0,1.0,0.2,30.4,25.4,27.90,87.0,1.0,320.0
1,2008.0,7.0,2.0,4.4,31.6,25.2,28.40,92.0,1.0,320.0
2,2008.0,7.0,3.0,3.6,31.0,24.3,27.70,88.0,1.0,160.0
3,2008.0,7.0,4.0,0.0,32.1,25.8,29.00,84.0,1.0,140.0
4,2008.0,7.0,5.0,0.4,32.6,25.8,29.20,84.0,1.0,110.0
...,...,...,...,...,...,...,...,...,...,...
364,2018.0,12.0,27.0,15.2,29.0,22.6,25.80,82.0,2.0,290.0
365,2018.0,12.0,28.0,99.2,26.0,24.0,25.00,96.0,2.0,290.0
366,2018.0,12.0,29.0,0.4,28.8,25.2,27.00,92.0,2.0,270.0
367,2018.0,12.0,30.0,48.8,31.0,24.5,27.75,87.0,2.0,340.0


### Utils

In [12]:
def plot_histograms(dfs_to_compare, fig_title):
    """Overlay one histogram (with KDE curve) per dataset on a single figure.

    Parameters
    ----------
    dfs_to_compare : iterable
        Datasets to draw, each passed to ``sns.histplot`` as ``data``.
        The figure legend is hard-coded to ``["NCEI", "PAG-ASA"]``, so callers
        are expected to pass the NCEI values first and the PAG-ASA values second.
    fig_title : str
        Text used as the figure's super-title.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    fig, ax = plt.subplots(figsize=(6, 4))

    # Same drawing options for every dataset; all layers share one axes.
    hist_options = dict(bins=20, kde=True, multiple="layer", legend=True)
    for values in dfs_to_compare:
        sns.histplot(data=values, ax=ax, **hist_options)

    fig.suptitle(fig_title)
    # NOTE(review): labels are given without explicit handles, so matplotlib
    # assigns them positionally to the artists it finds — confirm that the two
    # legend entries really map to the NCEI and PAG-ASA layers.
    fig.legend(labels=["NCEI", "PAG-ASA"])
    fig.tight_layout()


def plot_distributions(year, ncei_df, station_name, climate_var):
    """Plot NCEI vs PAG-ASA value distributions for one station and year.

    Both sources are restricted to July–December of ``year`` so the two
    histograms describe the same period.

    Parameters
    ----------
    year : int
        Calendar year to compare.
    ncei_df : pandas.DataFrame
        NCEI table with a datetime ``DATE`` column and one column per station.
    station_name : str
        Key into ``PAGASA_DFS``. Its first whitespace-separated word is used
        as a substring to select the matching NCEI column(s).
    climate_var : str
        PAG-ASA column to plot (e.g. ``"TMEAN"``).

    Returns
    -------
    None
        The figure is produced by ``plot_histograms``.
    """
    # NCEI: July–December of the requested year
    ncei_in_window = (ncei_df["DATE"].dt.year == year) & (
        ncei_df["DATE"].dt.month >= 7
    )
    filter_year_ncei = ncei_df[ncei_in_window]

    # NCEI station columns carry suffixes such as "Airport", so match on the
    # first word of the PAG-ASA station name.
    ncei_col = filter_year_ncei.filter(like=station_name.split()[0]).columns

    # PAG-ASA: same year AND same months as NCEI. Filtering on the year alone
    # would compare a different period whenever a station file holds
    # January–June rows.
    pagasa_df = PAGASA_DFS[station_name]
    pagasa_in_window = (pagasa_df["YEAR"] == year) & (pagasa_df["MONTH"] >= 7)
    filter_year_pagasa_df = pagasa_df[pagasa_in_window]

    # Order matters: plot_histograms labels the first entry NCEI, the second PAG-ASA.
    dfs_to_compare = [
        filter_year_ncei[ncei_col],
        filter_year_pagasa_df[climate_var],
    ]

    plot_histograms(dfs_to_compare, f"{station_name} {climate_var} Distribution {year}")


def get_distribution_plots(ncei_df, year=2018, climate_var="TMEAN"):
    """Draw the NCEI vs PAG-ASA distribution plot for every station.

    Iterates over ``stations_list`` and calls ``plot_distributions`` for each
    entry. A failure for one station is logged with its traceback and does not
    stop the remaining stations from being plotted.

    Parameters
    ----------
    ncei_df : pandas.DataFrame
        NCEI table forwarded to ``plot_distributions``.
    year : int, default 2018
        Year to compare.
    climate_var : str, default "TMEAN"
        PAG-ASA column to plot.
    """
    for station in stations_list:
        try:
            plot_distributions(year, ncei_df, station, climate_var)
        except Exception as error:
            logger.exception(f"{error} Occurred at {station}")

In [13]:
def get_error_metrics(year, ncei_df, station_name, climate_var):
    """Score NCEI against PAG-ASA for one station, year and variable.

    NCEI rows are limited to July–December of ``year`` and inner-joined to the
    PAG-ASA rows of the same year on ``DATE``. Days where either value is
    missing are dropped before scoring. PAG-ASA is treated as the reference
    (``y_true``) and NCEI as the estimate (``y_pred``).

    Parameters
    ----------
    year : int
        Calendar year to evaluate.
    ncei_df : pandas.DataFrame
        NCEI table with a datetime ``DATE`` column and one column per station.
    station_name : str
        Key into ``PAGASA_DFS``. Its first whitespace-separated word is used
        as a substring to find the NCEI column; the first match is used.
    climate_var : str
        PAG-ASA column to compare against (e.g. ``"TMEAN"``). It must be the
        same physical variable that ``ncei_df`` holds.

    Returns
    -------
    tuple of float
        ``(mape, mae, rmse)``.

    Raises
    ------
    ValueError
        If no NCEI column matches the station, or if no day has both a
        PAG-ASA and an NCEI value.
    """
    ncei_value_col = f"ncei_{climate_var}"

    # --- NCEI: July–December of the requested year, one station column ---
    matching_cols = ncei_df.filter(like=station_name.split()[0]).columns
    if len(matching_cols) == 0:
        raise ValueError(f"No NCEI column matches station '{station_name}'.")
    ncei_colname = matching_cols[0]

    ncei_in_window = (ncei_df["DATE"].dt.year == year) & (
        ncei_df["DATE"].dt.month >= 7
    )
    filter_year_ncei = ncei_df.loc[ncei_in_window, ["DATE", ncei_colname]].rename(
        columns={ncei_colname: ncei_value_col}
    )

    # --- PAG-ASA: same year; the inner join below aligns the months ---
    pagasa_df = PAGASA_DFS[station_name]
    filter_year_pagasa_df = pagasa_df.loc[
        pagasa_df["YEAR"] == year, ["DATE", climate_var]
    ]

    # join to one table and keep only days observed by both sources
    joined_table = filter_year_pagasa_df.merge(filter_year_ncei, how="inner", on="DATE")
    clean_joined = joined_table.dropna(subset=[climate_var, ncei_value_col])
    if clean_joined.empty:
        raise ValueError(
            f"No overlapping {climate_var} observations for '{station_name}' in {year}."
        )

    # prep for error calculations
    y_true = clean_joined[climate_var].tolist()  # pagasa
    y_pred = clean_joined[ncei_value_col].tolist()  # ncei

    mape = mean_absolute_percentage_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated in newer scikit-learn releases, while this
    # form works on every version.
    rmse = mean_squared_error(y_true, y_pred) ** 0.5

    return mape, mae, rmse


def get_error_df(year, ncei_df, climate_var="TMEAN"):
    """Build a per-station table of NCEI vs PAG-ASA error metrics.

    Parameters
    ----------
    year : int
        Calendar year to evaluate.
    ncei_df : pandas.DataFrame
        NCEI table for ONE variable (mean, min or max temperature, wind speed, ...).
    climate_var : str, default "TMEAN"
        PAG-ASA column to score against. It must name the same variable that
        ``ncei_df`` holds, e.g. ``"TMIN"`` for the NCEI tmin table,
        ``"TMAX"`` for tmax, ``"WIND_SPEED"`` for wind speed. The default
        keeps the previous behaviour of always comparing against ``"TMEAN"``,
        which is only meaningful for the NCEI tmean table.

    Returns
    -------
    pandas.DataFrame
        Indexed by station name with columns ``MAPE``, ``MAE`` and ``RMSE``.
        Stations whose metrics could not be computed are logged and left out.
    """
    error_dict = {}

    for station in stations_list:
        try:
            error_dict[station] = get_error_metrics(
                year, ncei_df, station, climate_var
            )
        except Exception as error:
            # Best effort: one failing station must not hide the others.
            logger.exception(f"{error} Occurred at {station}")

    errors_df = pd.DataFrame.from_dict(
        error_dict, orient="index", columns=["MAPE", "MAE", "RMSE"]
    )
    return errors_df

## Quick Sense-checks for NCEI

In [26]:
# tmean
ncei_tmean_df["DATE"].min(), ncei_tmean_df["DATE"].max()

('2002-01-01', '2022-12-31')

In [32]:
ncei_tmean_df.shape[0]

7666

In [27]:
ncei_tmean_df.describe()

Unnamed: 0,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
count,6071.0,7616.0,7666.0,3302.0,7606.0,4379.0,7634.0,7662.0,7548.0,7242.0,5498.0,7644.0
mean,27.442201,27.71875,28.236721,27.714809,27.671404,26.451701,27.936167,28.169473,28.480644,27.497804,27.585758,28.377852
std,1.636487,1.451777,1.120865,1.353974,1.366773,1.110512,1.180125,1.551549,1.492267,1.62538,1.291821,1.051649
min,21.4,22.1,22.4,23.2,22.5,22.3,22.8,21.6,21.2,20.2,22.1,24.3
25%,26.4,26.7,27.6,26.8,26.8,25.7,27.2,27.1,27.475,26.4,26.8,27.8
50%,27.4,27.8,28.4,27.7,27.7,26.4,28.0,28.1,28.5,27.5,27.7,28.5
75%,28.45,28.7,29.0,28.6,28.6,27.2,28.8,29.2,29.5,28.6,28.5,29.1
max,32.6,32.5,32.0,32.2,31.5,30.2,31.4,32.9,33.4,32.8,31.7,31.6


In [20]:
# tmin
ncei_tmin_df["DATE"].min(), ncei_tmin_df["DATE"].max()

('2002-01-01', '2022-12-31')

In [31]:
ncei_tmin_df.shape[0]

7666

In [28]:
ncei_tmin_df.describe()

Unnamed: 0,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
count,6071.0,7616.0,7666.0,3302.0,7605.0,4379.0,7634.0,7662.0,7548.0,7242.0,5498.0,7644.0
mean,23.344721,23.542529,24.208114,23.602059,24.474675,22.469765,24.779159,24.809528,25.343071,23.590155,24.716242,23.803519
std,1.700842,1.841956,0.889709,1.076133,1.487545,1.114658,1.074346,1.486176,1.460776,1.750858,1.118503,1.110211
min,10.0,9.8,12.4,14.0,15.0,12.7,15.6,17.8,16.1,12.4,10.1,14.0
25%,22.5,23.0,23.8,23.0,23.8,22.0,24.0,24.0,24.5,22.6,24.2,23.3
50%,23.8,24.0,24.0,23.5,24.7,22.5,24.9,25.0,25.4,24.0,24.8,24.0
75%,24.5,24.9,24.9,24.0,25.5,23.0,25.5,25.9,26.3,24.8,25.4,24.5
max,28.0,29.2,27.4,29.0,28.0,27.5,28.3,30.0,31.3,29.0,28.2,27.5


In [29]:
# tmax
ncei_tmax_df["DATE"].min(), ncei_tmax_df["DATE"].max()

('2002-01-01', '2022-12-31')

In [33]:
ncei_tmax_df.shape[0]

7666

In [30]:
ncei_tmax_df.describe()

Unnamed: 0,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
count,6071.0,7616.0,7666.0,3302.0,7605.0,4379.0,7634.0,7662.0,7548.0,7242.0,5498.0,7644.0
mean,33.222978,32.158508,32.239721,31.446245,31.018974,31.649692,31.444878,32.080423,31.738725,32.084383,31.242761,32.880782
std,2.441805,2.074814,1.629757,1.965037,1.914239,1.858564,1.588969,2.120211,1.993907,2.162481,1.927079,1.538638
min,23.0,25.0,23.1,25.2,23.5,23.2,24.4,23.0,23.1,22.8,23.6,25.0
25%,31.9,31.0,31.4,30.2,30.0,30.8,30.5,30.8,30.4,30.8,30.3,32.0
50%,33.2,32.0,32.4,31.5,31.2,31.8,31.5,32.0,31.8,32.2,31.6,33.0
75%,34.8,33.5,33.2,32.6,32.4,32.8,32.5,33.4,33.0,33.5,32.5,34.0
max,40.4,40.8,39.9,39.6,38.8,40.4,38.2,39.0,39.8,39.4,38.6,39.6


## Data Prep

In [14]:
# convert columns to date type
# DATE is read from the CSVs as strings; the filtering helpers above use the
# `.dt` accessor, which needs a datetime column. The frames are updated in place.
ncei_tmean_df["DATE"] = pd.to_datetime(ncei_tmean_df["DATE"])
ncei_tmin_df["DATE"] = pd.to_datetime(ncei_tmin_df["DATE"])
ncei_tmax_df["DATE"] = pd.to_datetime(ncei_tmax_df["DATE"])
ncei_wind_df["DATE"] = pd.to_datetime(ncei_wind_df["DATE"])

In [15]:
# dumangas
# add TMEAN (average of Tmin and Tmax)
# skipna=False leaves TMEAN missing when either TMAX or TMIN is missing;
# the default would silently report the single available value as the
# daily mean. The frame stored in PAGASA_DFS is updated in place.

dumangas_df = PAGASA_DFS["Dumangas"]
dumangas_df["TMEAN"] = dumangas_df[["TMAX", "TMIN"]].mean(axis=1, skipna=False)

In [16]:
# ADD DATE for stations
# Assemble a datetime DATE column from YEAR / MONTH / DAY so PAG-ASA rows can
# be merged with the NCEI tables on DATE. Each DataFrame held in PAGASA_DFS is
# modified in place.
for station in stations_list:
    df = PAGASA_DFS[station]
    df["DATE"] = pd.to_datetime(dict(year=df["YEAR"], month=df["MONTH"], day=df["DAY"]))

## Mean Temperature

### Distributions

In [None]:
get_distribution_plots(ncei_tmean_df, 2018, "TMEAN")

In [None]:
get_distribution_plots(ncei_tmean_df, 2008, "TMEAN")

### Quick Error Statistics

In [None]:
get_error_df(2018, ncei_tmean_df)

In [None]:
get_error_df(2008, ncei_tmean_df)

## Min Temperature

### Distributions

In [None]:
get_distribution_plots(ncei_tmin_df, 2018, "TMIN")

In [None]:
get_distribution_plots(ncei_tmin_df, 2008, "TMIN")

### Error Statistics

In [None]:
get_error_df(2018, ncei_tmin_df)

In [None]:
get_error_df(2008, ncei_tmin_df)

## Max Temperature

### Distributions

In [None]:
get_distribution_plots(ncei_tmax_df, 2018, "TMAX")

In [None]:
get_distribution_plots(ncei_tmax_df, 2008, "TMAX")

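### Error Statistics

**Note:** the `get_error_df` calls below do not name a PAG-ASA variable, so the NCEI values are scored against PAG-ASA `TMEAN` rather than `TMAX`. The MAE of roughly 3–4 in the table below therefore mostly reflects the gap between maximum and mean temperature, not NCEI error.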

In [None]:
get_error_df(2018, ncei_tmax_df)

In [130]:
get_error_df(2008, ncei_tmax_df)

Unnamed: 0,MAPE,MAE,RMSE
Cabanatuan,0.154631,4.341304,4.525303
Dagupan,0.158397,4.346196,4.474936
Davao City,0.13553,3.775543,3.846717
Dumangas,0.123191,13.959783,75.851954
Legazpi,0.099567,2.788043,2.898107
Lumbia-El Salvador,0.178474,4.747541,4.820085
Mactan,0.117059,3.301087,3.386289
NAIA,0.110292,3.094565,3.185975
Port Area,0.091967,2.620652,2.730524
Science Garden,0.135307,3.742391,3.902048


## Wind Speeds

### Distributions

In [None]:
get_distribution_plots(ncei_wind_df, 2018, "WIND_SPEED")

In [None]:
get_distribution_plots(ncei_wind_df, 2008, "WIND_SPEED")

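### Error Statistics

**Note:** the `get_error_df` calls below do not name a PAG-ASA variable, so the NCEI wind speeds are scored against PAG-ASA `TMEAN` rather than `WIND_SPEED`. This explains the MAPE above 0.9 and the MAE of about 26 in the table below; these numbers are not a measure of wind-speed agreement.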

In [None]:
get_error_df(2018, ncei_wind_df)

In [42]:
get_error_df(2008, ncei_wind_df)

Unnamed: 0,MAPE,MAE,RMSE
Cabanatuan,0.984076,27.531522,27.564083
Dagupan,0.91258,25.067935,25.106481
Davao City,0.936925,26.013043,26.03105
Dumangas,0.924052,35.694565,76.066124
Legazpi,0.911544,25.461413,25.524942
Lumbia-El Salvador,0.956999,25.408743,25.418231
Mactan,0.929733,26.246196,26.275432
NAIA,0.921702,25.796196,25.837674
Port Area,0.906689,25.778261,25.813529
Science Garden,0.969782,26.776087,26.812416


## EDA

### Investigate Lumbia-El Salvador

In [20]:
filter_year_ncei_tmean = ncei_tmean_df[
    (ncei_tmean_df["DATE"].dt.year == 2018) & (ncei_tmean_df["DATE"].dt.month >= 7)
]
filter_year_ncei_tmean

Unnamed: 0,DATE,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
6024,2018-07-01,28.1,28.2,27.9,,28.1,,28.1,28.0,28.4,28.2,28.4,28.8
6025,2018-07-02,27.6,28.2,27.3,,29.0,,28.3,28.3,28.6,28.8,27.9,28.7
6026,2018-07-03,28.4,29.2,28.2,,29.3,,28.1,28.4,28.7,28.3,26.7,28.3
6027,2018-07-04,27.6,28.3,27.3,,28.7,,28.2,28.4,28.8,28.9,26.6,28.1
6028,2018-07-05,27.7,28.0,26.8,,29.0,,27.4,28.6,28.7,28.7,26.9,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6203,2018-12-27,28.2,27.6,27.9,,25.7,,27.4,27.7,27.7,27.1,25.0,28.1
6204,2018-12-28,26.0,25.7,28.7,,25.3,,25.6,25.3,26.1,25.4,24.5,28.1
6205,2018-12-29,24.4,24.5,28.4,,25.6,,27.0,24.4,24.8,23.8,27.1,28.2
6206,2018-12-30,25.7,25.4,27.4,,26.7,,27.1,24.4,24.5,24.1,27.7,28.8


In [21]:
filter_year_ncei_tmean["Lumbia-El Salvador"].isna().value_counts()

True    184
Name: Lumbia-El Salvador, dtype: int64

In [22]:
filter_year_ncei_tmean["Dumangas"].isna().value_counts()

True    184
Name: Dumangas, dtype: int64

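### Windspeed investigation

`ncei_wind_df.describe()` below reports a maximum of exactly `514.4` for six stations, far above their 75th percentiles (at most 3.1). This looks like a missing-value sentinel rather than a measurement (999.9 knots ≈ 514.4 m/s) — TODO: confirm against the NCEI documentation and mask it before computing statistics.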

In [40]:
filter_year_ncei_wind = ncei_wind_df[
    (ncei_wind_df["DATE"].dt.year == 2008) & (ncei_wind_df["DATE"].dt.month >= 7)
]

In [154]:
ncei_wind_df.describe()

Unnamed: 0,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
count,6071.0,7616.0,7666.0,3302.0,7606.0,4379.0,7634.0,7662.0,7548.0,7242.0,5498.0,7644.0
mean,73.992868,3.164772,2.215954,2.152544,13.016724,1.26844,3.068614,2.738841,7.0938,17.734493,3.84247,1.743969
std,179.293179,19.46667,0.672418,1.191304,73.273988,0.497898,1.240838,0.960237,46.923941,90.764278,35.87826,0.688232
min,0.0,0.2,0.2,0.0,0.0,0.0,0.3,0.2,0.2,0.0,0.0,0.0
25%,0.5,1.9,1.7,1.2,1.6,1.0,2.1,2.1,2.3,0.8,0.8,1.3
50%,1.0,2.3,2.2,2.1,2.2,1.2,2.9,2.6,2.6,1.1,1.2,1.6
75%,2.0,2.8,2.6,3.0,2.9,1.5,3.8,3.3,3.1,1.5,1.6,2.1
max,514.4,514.4,5.2,9.4,514.4,7.0,10.6,11.1,514.4,514.4,514.4,6.0


In [158]:
ncei_wind_df == 514.40

Unnamed: 0,DATE,Cabanatuan,Dagupan,Davao Airport,Dumangas,Legazpi,Lumbia-El Salvador,Mactan Airport,NAIA,Port Area,Science Garden,Tacloban,Zamboanga
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7661,False,False,False,False,False,False,False,False,False,False,False,False,False
7662,False,False,False,False,False,False,False,False,False,False,False,False,False
7663,False,False,False,False,False,False,False,False,False,False,False,False,False
7664,False,False,False,False,False,False,False,False,False,False,False,False,False


In [31]:
filter_year_ncei_wind["Dumangas"].isna().value_counts()

False    184
Name: Dumangas, dtype: int64

In [32]:
filter_year_ncei_wind["Lumbia-El Salvador"].isna().value_counts()

False    183
True       1
Name: Lumbia-El Salvador, dtype: int64

In [33]:
filter_year_ncei_wind["Tacloban"].isna().value_counts()

False    184
Name: Tacloban, dtype: int64

In [35]:
tac_test = PAGASA_DFS["Tacloban"]
tac_test["WIND_SPEED"].isna().value_counts()

False    368
True       1
Name: WIND_SPEED, dtype: int64

In [37]:
tac_test = PAGASA_DFS["Lumbia-El Salvador"]
tac_test["WIND_SPEED"].isna().value_counts()

False    368
True       1
Name: WIND_SPEED, dtype: int64