# Forecasting Influenza-Like Illness in Kent County, Michigan
## A time-series analysis

---

*Setting up code environment*

In [None]:
import pandas as pd
import plotly.express as px
from datetime import datetime
from epiweeks import Week, Year
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
from statsmodels.tsa.stattools import adfuller
import statsmodels.tsa.stattools as smt
import statsmodels.api as sm
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import STL

In [None]:
## Some helpful functions ##

def plot_ccf_sm(target, exog, unbiased=False, nlags=10):
    """Plot the cross-correlation function computed by statsmodels.

    Draws a stem plot of the cross-correlations between ``target`` and
    ``exog`` for lags ``0`` through ``nlags`` on the current matplotlib axes.

    Parameters
    ----------
    target, exog : pandas.Series
        The two series to correlate. Their ``.name`` attributes are used in
        the plot title.
    unbiased : bool, default False
        Forwarded to ``statsmodels.tsa.stattools.ccf`` to select the
        denominator used for the cross-covariance estimate.
    nlags : int, default 10
        Highest lag to display.
    """
    # The flag is passed positionally: it was previously hard-coded to False
    # (so the parameter had no effect), and statsmodels has changed the
    # keyword's name between releases while keeping it as the third argument.
    ccfs = smt.ccf(target, exog, unbiased)[:nlags + 1]
    lags = np.arange(len(ccfs))
    # NOTE(review): newer Matplotlib releases dropped `use_line_collection`;
    # confirm against the pinned version before upgrading.
    _ = plt.stem(lags, ccfs, use_line_collection=True)
    _ = plt.title(f"Cross Correlation (Statsmodels): {target.name} & {exog.name}")

def crosscorr(x: pd.Series, y: pd.Series, lag: int=0) -> float:
    """Return the correlation between ``x`` and ``y`` shifted by ``lag``.

    ``y`` is shifted by ``lag`` positions; the slots vacated by the shift
    become NaN before the correlation is computed.
    """
    shifted_y = y.shift(lag)
    return x.corr(shifted_y)


def plot_ccf_manual(target, exog, nlags=10):
    """Plot the CCF using manually computed lagged correlations.

    For every lag from 0 through ``nlags`` the correlation between ``target``
    and the lag-shifted ``exog`` is obtained from ``crosscorr`` and the
    results are drawn as a stem plot on the current matplotlib axes.
    """
    lag_values = list(np.arange(0, nlags + 1))
    correlations = [crosscorr(target, exog, lag=k) for k in lag_values]

    _ = plt.stem(lag_values, correlations, use_line_collection=True)
    _ = plt.title(f"Cross Correlation (Manual): {target.name} & {exog.name}")

def difference(dataset, n):
    """Return the lag-``n`` differences of ``dataset`` as a pandas Series.

    Element ``k`` of the result is ``dataset[k + n] - dataset[k]`` by
    position, so the output has ``len(dataset) - n`` entries (none when the
    input has ``n`` or fewer values).

    Parameters
    ----------
    dataset : sequence or pandas.Series
        Ordered observations. Values are paired by position, so a Series
        with a non-default index is handled correctly.
    n : int
        How many positions back the subtracted value lies; must be >= 1.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")
    values = list(dataset)
    # Pair each value with the one n positions earlier. Starting the pairing
    # at position n avoids the negative indices that previously wrapped
    # around to the end of the data whenever n was greater than 1.
    return pd.Series([later - earlier for earlier, later in zip(values, values[n:])])

def ADF(time_series, max_lags):
    """Run an Augmented Dickey-Fuller test and print a short report.

    Prints the test statistic, the p-value, the number of lags used and the
    critical values returned by ``adfuller``. Nothing is returned.

    Parameters
    ----------
    time_series : array-like
        The series to test, passed straight to ``adfuller``.
    max_lags : int
        Passed to ``adfuller`` as ``maxlag``.
    """
    t_stat, p_value, lags, _, critical_values, _ = adfuller(
        time_series,
        maxlag=max_lags,
    )
    print(f'ADF Statistic: {t_stat:.2f}')
    print(f'p-value: {p_value:.2f}')
    print(f'lags: {lags}')
    # Print the header once, ahead of the loop, rather than once per value.
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f'   {key}, {value:.2f}')

---

Monthly means will be calculated for each continuous variable, alongside box-and-whisker plots to visualize each variable's spread.

---

Associations between month and main pollutant will be explored with a chi-squared (χ²) test of association and visualized with a stacked bar chart.

---

Multicollinearity among the continuous variables will be assessed with Spearman correlation and Variance Inflation Factors (VIF).

---

ILI cases will be modeled with months alone and with months + other variables to explore the impact of the other variables on cases.

---

The stationarity of ILI cases will be tested using the Augmented Dickey-Fuller (ADF) test. If the series fails to show stationarity, a second ADF test will be performed on the first-differenced data.

---

If the case series or its first-differenced transformation is stationary, an autoregression model will be developed to explore any lagged effects of past weeks in the data.

---

Delayed effects of the independent variables on ILI cases will be explored with cross-correlation plots, in order to select lagged terms that may influence cases.

---

These lagged terms will be included in a multiple linear regression model, alongside the autoregressive lags, months, and the other explanatory variables.

---