# Model Evaluation template

Follow this Python notebook to work through some basic model evaluation exercises.

## 0. Import necessary python packages and import the prewritten functions.

In [None]:
%pip install -q nbformat plotly statsmodels

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import os

# import hvplot.xarray
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore")

## 1. Load the data

We have simulated and observed data for SO2 (ug/m3), SO4 (ugS/m3), NH4 (ugN/m3), NO3 (ugN/m3), and PM2.5 (ug/m3) at three stations: Villum, Spitzbergen, and Alert.

In [None]:
# In total, the following three stations are available:
allstations = [
    "Villum",
    "Spitzbergen",
    "Alert",
]

# Load the data of a single station.
# Do not change the function below unless you know what you are doing.
def get_station_data(station):
    """Read the CSV file of one station into a date-indexed DataFrame.

    The file is looked up at ``data/Arctic_aerosol_<station>.csv`` relative to
    the current working directory; its first row is used as the header.

    Parameters
    ----------
    station : str
        Station name used to build the file name.

    Returns
    -------
    pandas.DataFrame
        The file contents with the ``date`` column parsed to datetimes and
        used as the index.
    """
    raw = pd.read_csv(f'data/Arctic_aerosol_{station}.csv', header=0)

    # Parse the textual dates, then promote them to the index
    parsed_dates = pd.to_datetime(raw['date'])
    return raw.assign(date=parsed_dates).set_index('date')

In [None]:
# Choose from 'Villum', 'Spitzbergen', and 'Alert'
station = 'Villum'
df = get_station_data(station)

# Show the first 5 rows of the table
print(df.head())

#### What do you observe from the data? For example: time resolution, time period, components, missing data, etc.

## 2. Plot the time series

In [None]:
# Function to plot the time series of one variable
# Do not change the function below unless you know what you are doing.
def plot_time_series(var,station,df=None,stddf=None):
    """Plot the simulated (and, if present, measured) series of one variable.

    Parameters
    ----------
    var : str
        Variable prefix; the columns ``<var>_mod`` (required) and
        ``<var>_mea`` (optional) of ``df`` are plotted.
    station : str
        Station name. Used in the figure title, and to load the data with
        ``get_station_data`` when ``df`` is not given.
    df : pandas.DataFrame, optional
        Data to plot; its index is used as the x axis.
    stddf : pandas.DataFrame, optional
        Values drawn as error bars. Each trace takes its error bars from the
        column of the same name in ``stddf``; a trace whose column is missing
        from ``stddf`` is drawn without error bars.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    df = get_station_data(station) if df is None else df

    fig = go.Figure()

    def _add_line(column, label):
        # Build the trace arguments; error bars are attached only when a
        # matching column exists in stddf.
        trace_args = dict(y=df[column], x=df.index, name=label)
        if stddf is not None and column in stddf.columns:
            # Each line uses the spread of its OWN column (the measurement
            # line previously reused the model's standard deviation).
            trace_args['error_y'] = dict(type='data', array=stddf[column].values)
        fig.add_trace(go.Scatter(**trace_args))

    # Add simulation line
    _add_line(f'{var}_mod', 'model simulation')

    # Add observation line (not every variable has measurements)
    if f'{var}_mea' in df.columns:
        _add_line(f'{var}_mea', 'measurement')

    # Title and legend (horizontal legend placed above the top-right corner)
    fig.update_layout(title = f'{var} time series at {station}')
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
        ))
    fig.update_layout(showlegend=True)
    fig.show()


In [None]:
# Choose the chemical from 'SO2', 'SO4', 'NH4', 'NO3', and 'PM2.5',
# and the station from 'Villum', 'Spitzbergen', and 'Alert'.
chemical = 'SO4'
station = 'Villum'
plot_time_series(chemical, station)

#### Please change the chemical and the station to take a look at the time series of different chemicals and different stations.

Tips:

1. You can copy the cell down when you want to look at a new chemical or new location, so that you can keep all figures.

2. You can download the plots you want for the exercise report, and perhaps for your final report. The download option is in the panel at the top right of the figure.

## 3. Interannual variation (annual mean)

In [None]:
# Annual means
# Choose the chemical from 'SO2', 'SO4', 'NH4', 'NO3', and 'PM2.5',
# and the station from 'Villum', 'Spitzbergen', and 'Alert'.
chemical = 'SO4'
station = 'Villum'
df = get_station_data(station)

# Average all rows that fall into the same one-year bin
annual_df = df.groupby(pd.Grouper(freq='1Y')).mean()

# Plot without standard deviation
plot_time_series(chemical, station, df=annual_df)

In [None]:
# Standard deviation within each one-year bin, drawn as error bars
annual_std_df = df.groupby(pd.Grouper(freq='1Y')).std()

# Plot with standard deviation
plot_time_series(chemical, station, df=annual_df, stddf=annual_std_df)

#### What do you find from the interannual variations? Do they show the same characteristics for different chemicals or different stations?

## 4. Seasonal cycle

In [None]:
chemical,station = 'SO4','Villum'   # Choose from 'SO2', 'SO4', 'NH4', 'NO3', and 'PM2.5' for chemical, 
                                    # and 'Villum','Spitzbergen','Alert' for station
# Reload the data for the station chosen above. Without this, the cell would
# silently reuse whatever `df` an earlier cell left behind and plot another
# station's data under this station's title.
df = get_station_data(station)

# Keep the timestamps as a regular column too, so rows can be grouped by calendar month
df['date'] = pd.to_datetime(df.index)
# Mean of each calendar month (1-12), pooled over all years
seasonal_df = df.groupby(df.date.dt.month).mean()

plot_time_series(chemical,station,df=seasonal_df)

In [None]:
# Standard deviation of each calendar month, pooled over all years
month_of_row = df['date'].dt.month
seasonal_std_df = df.groupby(month_of_row).std()

# Plot with standard deviation
plot_time_series(chemical, station, df=seasonal_df, stddf=seasonal_std_df)

#### How would you describe their seasonal cycle? Are they showing the same characteristics for different chemicals or different stations?

## 5. Evaluations

### 1:1 pairing scatter plot

In [None]:
# Scatter plot of model simulation against measurement
# Do not change the function below unless you know what you are doing.
def plot_scatter(var,station,df=None):
    """Plot simulated against measured values and print the OLS fit summary.

    Parameters
    ----------
    var : str
        Variable prefix; ``<var>_mea`` goes on the x axis and ``<var>_mod``
        on the y axis.
    station : str
        Station name. Used in the figure title, and to load the data with
        ``get_station_data`` when ``df`` is not given.
    df : pandas.DataFrame, optional
        Data to plot.

    Returns
    -------
    None
        Prints a notice and returns early when ``<var>_mea`` is not a column
        of the data.
    """
    if df is None:
        df = get_station_data(station)

    mea_col = f'{var}_mea'
    mod_col = f'{var}_mod'

    if mea_col not in df.columns:
        print(f'No measurement of {var} available')
        return

    # Both axes share one range: from 0 to 2% above the largest value of either column
    upper = max(df[mea_col].max(), df[mod_col].max()) * 1.02

    fig = px.scatter(
        df,
        x=mea_col,
        y=mod_col,
        marginal_x='histogram',
        marginal_y='histogram',
        trendline="ols",
        width=800,
        height=800,
    )
    # Show the marginal histograms as probabilities
    fig.update_traces(histnorm='probability', selector={'type':'histogram'})

    fig.update_layout(
        title=f'{var} model simulation vs measurement at {station}',
        yaxis=dict(range=[0, upper]),
        xaxis=dict(range=[0, upper]),
    )
    fig.show()

    # Print the regression summary of the fitted trendline
    fit = px.get_trendline_results(fig).px_fit_results.iloc[0]
    print(fit.summary())

In [None]:
# Choose the chemical from 'SO2', 'SO4', 'NH4', 'NO3', and 'PM2.5',
# and the station from 'Villum', 'Spitzbergen', and 'Alert'.
chemical = 'SO4'
station = 'Villum'
plot_scatter(chemical, station)

### Residuals (model simulations minus measured values)

In [None]:
# Function to plot the model residuals against the measurements
# Do not change the function below unless you know what you are doing.
def plot_residual(var,station,df=None):
    """Plot the residuals (simulated minus measured) against the measurements.

    Parameters
    ----------
    var : str
        Variable prefix; the columns ``<var>_mod`` and ``<var>_mea`` are used.
    station : str
        Station name. Used in the figure title, and to load the data with
        ``get_station_data`` when ``df`` is not given.
    df : pandas.DataFrame, optional
        Data to plot. It is not modified.

    Returns
    -------
    None
        Prints a notice and returns early when ``<var>_mea`` is not a column
        of the data.
    """
    df = get_station_data(station) if df is None else df
    if f'{var}_mea' not in df.columns:
        print(f'No measurement of {var} available')
        return

    # assign() returns a new frame, so a DataFrame handed in by the caller
    # does not gain an extra 'residual' column as a side effect.
    plot_df = df.assign(residual=df[f'{var}_mod'] - df[f'{var}_mea'])

    fig = px.scatter(plot_df, x=f'{var}_mea', y='residual',
                marginal_y='violin', trendline='ols',
                width=1000,height=800)

    fig.update_layout(title = f'{var} model residuals (simulated - measured) at {station}')

    fig.show()

In [None]:
# Choose the chemical from 'SO2', 'SO4', 'NH4', 'NO3', and 'PM2.5',
# and the station from 'Villum', 'Spitzbergen', and 'Alert'.
chemical = 'SO4'
station = 'Villum'
plot_residual(chemical, station)

#### Take a look at these metrics. Do you know the meaning of each of them? (Google is always a good friend.)

#### Based on the values of these evaluation metrics, how would you describe the model?