# Model Evaluation template

Work through this Python notebook to carry out some basic model evaluation exercises.

## 0. Import necessary python packages and import the prewritten functions.

In [None]:
%pip install -q nbformat plotly statsmodels

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import os

# import hvplot.xarray
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore")

## 1. Load the data

We have weekly data of simulated and observed SO2 (ug/m3), SO4 (ugS/m3), NH4 (ugN/m3), NO3 (ugN/m3), and PM2.5 (ug/m3) at three stations: Villum, Spitzbergen, and Alert.

In [28]:
# In total, data are available for the following three stations:
allstations = ['Villum','Spitzbergen','Alert']

# A function to get the data of one station
def get_station_data(station):
    """Load the CSV file of one station as a date-indexed DataFrame.

    Parameters
    ----------
    station : str
        Station name; it is inserted into the file name
        ``data/Arctic_aerosol_{station}.csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents with the ``date`` column parsed to datetimes and
        used as the index.
    """
    csv_path = f'data/Arctic_aerosol_{station}.csv'
    raw = pd.read_csv(csv_path, header=0)

    # Parse the date strings first, then promote the column to the index
    return raw.assign(date=pd.to_datetime(raw['date'])).set_index('date')

In [29]:
# Load one station and preview its first five records
station = 'Alert'
df = get_station_data(station)
print(df.head())

            SO2_mod  SO2_mea  SO4_mod  SO4_mea  NH4_mod  NH4_mea  NO3_mod  \
date                                                                        
2007-01-01    0.405      NaN    0.080    0.288    0.024    0.072    0.037   
2007-01-08    1.700      NaN    0.241      NaN    0.045      NaN    0.070   
2007-01-15    0.973      NaN    0.250    0.208    0.037    0.033    0.051   
2007-01-22    0.330      NaN    0.115    0.230    0.034    0.044    0.058   
2007-01-29    0.408      NaN    0.116    0.253    0.036    0.056    0.043   

            NO3_mea  PM2.5_mod  
date                            
2007-01-01    0.026      1.510  
2007-01-08      NaN      2.416  
2007-01-15    0.018      3.583  
2007-01-22    0.027      1.545  
2007-01-29    0.048      4.677  


#### What do you observe from the data?

## 2. Plot the time series

In [38]:
# Function to plot the variable

def plot_time_series(var,station,df=None,stddf=None):
    """Plot the modelled and, when available, the measured series of one variable.

    Parameters
    ----------
    var : str
        Variable prefix; the columns ``{var}_mod`` (model) and ``{var}_mea``
        (measurement) are plotted.
    station : str
        Station name. It appears in the figure title and, when ``df`` is
        None, is used to load the data with ``get_station_data``.
    df : pandas.DataFrame, optional
        Data to plot against its index. Defaults to the full data set of
        ``station``.
    stddf : pandas.DataFrame, optional
        Spread to draw as error bars. Its values are taken positionally, so
        it must have the same row order as ``df``. Each line uses the spread
        of its own column; a column missing from ``stddf`` simply gets no
        error bars.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    df = get_station_data(station) if df is None else df

    def _line(column, label):
        # One line per column. The error bars must come from the SAME column
        # of stddf, otherwise the measurements would show the model's spread.
        trace_kwargs = dict(x=df.index, y=df[column], name=label)
        if stddf is not None and column in stddf.columns:
            trace_kwargs['error_y'] = dict(type='data', array=stddf[column].values)
        return go.Scatter(**trace_kwargs)

    fig = go.Figure()

    # Add simulation line
    fig.add_trace(_line(f'{var}_mod', 'model simulation'))

    # Add observation line (not every variable has measurements)
    if f'{var}_mea' in df.columns:
        fig.add_trace(_line(f'{var}_mea', 'measurement'))

    # Title and horizontal legend placed just above the plotting area
    fig.update_layout(
        title=f'{var} time series at {station}',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        showlegend=True,
    )
    fig.show()


In [31]:
# Pick the variable (SO2, SO4, NH4, NO3 or PM2.5) and the station to plot
chemical = 'SO4'
station = 'Villum'
plot_time_series(chemical, station)

#### Please change the chemical and the station to take a look at the time series of different chemicals and different stations.

Tips:

1. You can copy the cell down when you want to look at a new chemical or new location, so that you can keep all figures.

2. You can download the plots you want for your exercise report, and perhaps for your final report. The download option is in the toolbar at the top right of each figure.

## 3. Interannual variation (annual mean)

In [42]:
# Look at annual mean
# Reload the data of the currently selected station (replaces the `df` loaded earlier).
df = get_station_data(station)
# Group the date index into calendar-year bins and average every column.
# NOTE(review): newer pandas releases deprecate the 'Y' alias in favour of
# 'YE' — confirm against the installed pandas version.
annual_df = df.groupby(pd.Grouper(freq='1Y')).mean()

# Without standard deviation
plot_time_series(chemical,station,df=annual_df)


In [43]:
# Standard deviation of the values inside each calendar-year bin of `df`.
annual_std_df = df.groupby(pd.Grouper(freq='1Y')).std()

# With standard deviation
# (the yearly standard deviation is passed as `stddf` and drawn as error bars)
plot_time_series(chemical,station,df=annual_df,stddf=annual_std_df)

#### What do you find from the interannual variations? Do they show the same characteristics for different chemicals or different stations?

## 4. Seasonal cycle

In [44]:
# Copy the date index into a 'date' column so the rows can be grouped by
# calendar month. NOTE: this adds a column to `df` in place.
df['date'] = pd.to_datetime(df.index)
# Mean of every column per calendar month, pooled over all years in `df`.
seasonal_df = df.groupby(df.date.dt.month).mean()

# Seasonal cycle without error bars; the x-axis is the month number.
plot_time_series(chemical,station,df=seasonal_df)

In [45]:
# Standard deviation of every column per calendar month, pooled over all years.
# The months are read straight from the date index, so this cell does not rely
# on a helper column having been added to `df` beforehand.
seasonal_std_df = df.groupby(df.index.month).std()

# Add standard deviation
plot_time_series(chemical,station,df=seasonal_df,stddf=seasonal_std_df)

#### How would you describe their seasonal cycle? Are they showing the same characteristics for different chemicals or different stations?

## 5. 1-1 scatter plot

In [17]:
def plot_scatter(var,station,df=None):
    """Scatter the measurements against the model values and print an OLS fit.

    Parameters
    ----------
    var : str
        Variable prefix; ``{var}_mea`` goes on the x-axis and ``{var}_mod``
        on the y-axis.
    station : str
        Station name. It appears in the figure title and, when ``df`` is
        None, is used to load the data with ``get_station_data``.
    df : pandas.DataFrame, optional
        Data to plot. Defaults to the full data set of ``station``.

    Returns
    -------
    None
        The figure is shown and the summary of the OLS trendline is printed.
        When the variable has no measurement column, or fewer than two rows
        contain both a measurement and a model value, a message is printed
        and nothing is plotted.
    """
    df = get_station_data(station) if df is None else df
    mea_col, mod_col = f'{var}_mea', f'{var}_mod'
    if mea_col not in df.columns:
        print(f'No measurement of {var} available')
        return

    # Only rows holding both values can be drawn or regressed. Without this
    # guard an all-NaN measurement column gives a NaN axis range and no
    # trendline result to summarise.
    paired = df[[mea_col, mod_col]].dropna()
    if len(paired) < 2:
        print(f'Not enough paired measurement and model values of {var} at {station}')
        return

    # Same range on both axes (with 2 % headroom) so the 1:1 diagonal is at 45 degrees
    vmin,vmax = 0,max(paired[mea_col].max(),paired[mod_col].max())*1.02
    fig = px.scatter(paired, x=mea_col, y=mod_col, trendline="ols",width=800,height=800)
    fig.update_layout(
        title=f'{var} measurement vs model simulation at {station}',
        xaxis=dict(range=[vmin,vmax]),
        yaxis=dict(range=[vmin,vmax]),
    )
    fig.show()

    # Regression statistics of the trendline (model as a function of measurement)
    results = px.get_trendline_results(fig)
    print(results.px_fit_results.iloc[0].summary())


In [18]:
# Variable and station to compare in the scatter plot
chemical = 'NH4'
station = 'Villum'
plot_scatter(chemical, station)


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.419
Model:                            OLS   Adj. R-squared:                  0.418
Method:                 Least Squares   F-statistic:                     271.6
Date:                Sun, 05 Feb 2023   Prob (F-statistic):           2.58e-46
Time:                        13:20:54   Log-Likelihood:                 985.70
No. Observations:                 378   AIC:                            -1967.
Df Residuals:                     376   BIC:                            -1960.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0206      0.001     16.013      0.0

#### Based on the values of these evaluation statistics, how would you describe the model's performance?