# Import Libraries

In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules
# (such as the local `helper`) are re-imported before code is executed.
%load_ext autoreload
%autoreload 2

## Data Manipulation

In [None]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose

import helper

## Data Visualization

In [None]:
import plotly.express as px
import plotly.offline as po
import plotly.graph_objects as go

In [None]:
# Default size passed as width/height to the Plotly figures in this notebook.
PLOT_WIDTH = 1000
PLOT_HEIGHT = 500

In [None]:
# Route DataFrame.plot()/Series.plot() through Plotly and enable inline
# rendering of Plotly figures in the notebook.
pd.set_option("plotting.backend", "plotly")
po.init_notebook_mode(connected=True)

In [None]:
# Run the helper module's initialisation routine. What it sets up is defined
# in `helper` and is not visible from this notebook.
helper.__init()

# Data Preprocessing

## Load Data

In [None]:
# Google mobility rows whose country_region_code is "GR", ordered by date.
google = helper.fetch_data_from_database(
    table="mobility_stats_google",
    where_column="country_region_code",
    where_value="GR",
    order_by='date',
)
# The first-wave analysis further down reads `covid_df`, so it has to be
# loaded here; leaving this line commented out makes that cell fail with a
# NameError.
covid_df = helper.fetch_data_from_database(table="covid_data_greece")

## Data Cleaning

In [None]:
# Clean the raw table with the helper, then drop the country code column.
google = helper.clean_df(google).drop(columns='country_region_code')

In [None]:
# Keep only the rows in which at least 80% of the columns are non-null
# (i.e. drop samples with more than 20% missing values).
google = (
    google
    .dropna(axis=0, thresh=len(google.columns) * 0.8)
    .reset_index(drop=True)
)

In [None]:
# Fill remaining null values with forward-fill method.
# Every column from the third one onwards is treated as a numeric feature.
num_cols = list(google.columns[2:])
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
# NOTE(review): values are carried forward in row order. If rows of several
# regions are interleaved, a value may be propagated across regions - confirm.
google[num_cols] = google[num_cols].ffill()

In [None]:
# Aggregate all regions by date and get the mean value.
# numeric_only=True restricts the mean to numeric columns: pandas >= 2.0
# raises a TypeError instead of silently dropping a non-numeric column
# (e.g. a region label, if one is present).
google = google.groupby(by='date', as_index=False).mean(numeric_only=True)

In [None]:
# Derive calendar features (month number, year, weekday name) from `date`.
google = google.assign(
    month=google['date'].dt.month,
    year=google['date'].dt.year,
    day_name=google['date'].dt.day_name(),
)

## Check skewness of the features

In [None]:
# Drop the last two entries of the feature list before measuring skewness.
# NOTE(review): this rebinds `num_cols`, so re-running the cell removes two
# more entries each time - confirm which columns are meant to be excluded.
num_cols = num_cols[:-2]
# Displayed as the cell output: per-feature skewness, largest first.
google[num_cols].skew().sort_values(ascending=False)

In [None]:
# Features whose skewness magnitude is above 0.5.
skewed_cols = [
    feature
    for feature in num_cols
    if abs(google[feature].skew()) > 0.5
]

In [None]:
# One histogram facet per skewed feature, wrapped at three facets per row.
fig = px.histogram(
    google,
    x=skewed_cols,
    nbins=50,
    histnorm='percent',
    facet_col='variable',
    facet_col_wrap=3,
    facet_col_spacing=0.03,
    height=PLOT_HEIGHT,
    width=600 * len(skewed_cols),
)
fig.update_layout(
    bargap=0.2,
    showlegend=False,
    yaxis=dict(title_text="Frequency"),
)
fig.update_xaxes(title_text="Value", position=0.5)
fig.update_yaxes(matches=None, showticklabels=True, ticksuffix="%")


def _facet_title(annotation):
    """Replace a 'variable=<column>' facet label with its helper.TITLES entry."""
    column = annotation.text.split("=")[-1]
    # TITLES is indexed by the column's position in `google` minus one.
    return annotation.update(
        text=helper.TITLES[google.columns.get_loc(column) - 1]
    )


fig.for_each_annotation(_facet_title)

fig.show()
fig.write_image("../plots/Skewed Features (Histogram).svg")

## Focus on Parks and Outdoor Spaces

In [None]:
# Single-column frame with the parks series, indexed by date.
ts_parks = google.set_index('date')[['parks_percent_change_from_baseline']].copy()
# Declare the index as daily; pandas rejects this if the dates do not
# conform to a daily frequency.
ts_parks.index.freq = 'D'

In [None]:
# Horizontal box plot of the parks series (shown as the cell output).
px.box(
    data_frame=ts_parks,
    x='parks_percent_change_from_baseline',
    labels={'parks_percent_change_from_baseline': helper.TITLES[2]},
    width=PLOT_WIDTH,
    height=PLOT_HEIGHT,
)

In [None]:
# Standardize the series (zero mean, unit standard deviation).
z_data = ts_parks.sub(ts_parks.mean()).div(ts_parks.std())
# Difference against the value 30 rows earlier.
shifted_z = z_data.diff(periods=30)

### Seasonal Decomposition

In [None]:
from plotly.subplots import make_subplots

In [None]:
# Additive decomposition with a period of 30 observations, using the
# one-sided (two_sided=False) filter.
result = seasonal_decompose(x=z_data, model='additive', period=30, two_sided=False)
# Keep each component as a single-column DataFrame.
trend, seasonality, residual = (
    component.to_frame()
    for component in (result.trend, result.seasonal, result.resid)
)

In [None]:
# Four stacked panels: observed series, trend, seasonality and residual.
fig = make_subplots(rows=4, cols=1)
index = z_data.index

decomposition_panels = [
    ('Observed', z_data['parks_percent_change_from_baseline']),
    ('Trend', trend['trend']),
    ('Seasonality', seasonality['seasonal']),
    ('Residual', residual['resid']),
]
for panel_row, (panel_name, panel_values) in enumerate(decomposition_panels, start=1):
    fig.add_trace(
        go.Scatter(x=index, y=panel_values, mode='lines', name=panel_name),
        row=panel_row, col=1,
    )

title_font = dict(family="Courier New, monospace", size=25, color="RebeccaPurple")
# Kept as the last expression so the figure is rendered as the cell output.
fig.update_layout(
    title=dict(
        text="Seasonal Decomposition",
        x=0.5,
        xanchor='center',
        yanchor='top',
        font=title_font,
    ),
    width=PLOT_WIDTH * 1.3,
    height=PLOT_HEIGHT * 2.5,
)

In [None]:
# Standardized series with the decomposition trend subtracted.
de_trended = (
    z_data['parks_percent_change_from_baseline']
    .sub(trend['trend'])
    .to_frame()
)

In [None]:
# Overlay the standardized parks series with its 30-observation rolling mean
# and rolling standard deviation.
parks_z = z_data['parks_percent_change_from_baseline']
rolling_30 = parks_z.rolling(window=30)
plot_dates = ts_parks.index.values

fig = go.Figure()
fig.add_traces(
    [
        go.Scatter(x=plot_dates, y=parks_z, name="raw data"),
        # The rolling mean is computed from z_data directly. The decomposition
        # frame `trend` only has a 'trend' column, so indexing it with the
        # feature name raises a KeyError.
        go.Scatter(x=plot_dates, y=rolling_30.mean(), name="rolling mean"),
        go.Scatter(x=plot_dates, y=rolling_30.std(), name="rolling std"),
    ]
)
fig.update_layout(
    xaxis=dict(title="Date", tickformat = '%b %Y'),
    title=helper.TITLES[2],
    width=PLOT_WIDTH, height=PLOT_HEIGHT
)
fig.show()
fig.write_image("../plots/Parks/Rolling Statistics.svg")

In [None]:
# Line plot of the first difference of the standardized series.
fig = z_data.diff().plot(
    kind='line',
    width=PLOT_WIDTH, height=PLOT_HEIGHT,
    title=helper.TITLES[2],
    # The x-axis carries the 'date' index, so it is labelled "Date"
    # (previously misspelled "Data").
    labels={'date': 'Date', 'value': ''}
)
fig.update_layout(showlegend=False)

fig.show()

In [None]:
# Distribution of the parks series per calendar month.
month_numbers = list(range(1, 13))
# Abbreviated month names ('Jan', 'Feb', ...) used as tick labels.
month_labels = [
    datetime.date(2000, month_number, 1).strftime('%b')
    for month_number in month_numbers
]

fig = px.box(
    data_frame=google,
    x='month',
    y='parks_percent_change_from_baseline',
    color='month',
    title='Month-wise Mobility on Parks',
    width=PLOT_WIDTH,
    height=PLOT_HEIGHT,
)
fig.update_layout(
    xaxis=dict(
        title_text='Months',
        tickmode='array',
        tickvals=month_numbers,
        ticktext=month_labels,
    ),
    yaxis=dict(title_text=helper.TITLES[2]),
)

fig.show()
fig.write_image('../plots/Parks/Monthly Box Plot.svg')

## Check Stationarity

### Augmented Dickey-Fuller Test

In [None]:
# Working series for the stationarity test: the standardized parks column,
# wrapped as a pandas Series.
ts_diff = pd.Series(z_data['parks_percent_change_from_baseline'])

In [None]:
# Number of differencing steps applied so far.
d = 0

# Difference the series until helper.test_stationarity returns a falsy value.
# NOTE(review): presumably the helper returns True while the series is still
# non-stationary at the given confidence - confirm against `helper`. The loop
# has no upper bound on `d`, and `d` is also passed as `nlags`.
while helper.test_stationarity(ts_diff, nlags=d, confidence=0.01):
    ts_diff = ts_diff.diff().dropna()
    d += 1

### Autocorrelation plots (ACF & PACF)

In [None]:
# Matplotlib figsize (width, height) used for the ACF and PACF figures.
FIG_SIZE = (20, 7)

In [None]:
# Autocorrelation of the differenced series for lags 1..50 (lag 0 omitted).
fig, ax = plt.subplots(figsize=FIG_SIZE)
plot_acf(x=ts_diff, lags=50, zero=False, ax=ax)
plt.show()

In [None]:
# Partial autocorrelation of the differenced series for lags 1..50
# (lag 0 omitted), estimated with the 'ywm' method.
fig, ax = plt.subplots(figsize=FIG_SIZE)
plot_pacf(x=ts_diff, lags=50, method='ywm', zero=False, ax=ax)
plt.show()

In [None]:
# SARIMA specification: non-seasonal (p, d, q) and seasonal (P, D, Q, s).
order = (7, 1, 7)
seasonal_order = (7, 1, 0, 30)

# SARIMAX(...) only builds the model specification; fit() returns the results
# object that provides summary() and resid. The fitted result is bound to
# `model` because the following cells call model.summary() and model.resid,
# which do not exist on the unfitted specification.
# NOTE(review): ts_diff may already have been differenced by the stationarity
# loop, and `order` differences once more (d=1) - confirm this is intended.
model = SARIMAX(
    endog=ts_diff,
    order=order,
    seasonal_order=seasonal_order,
).fit(method='powell')

In [None]:
# Print the summary report of `model`; this needs `model` to be an object
# that provides summary(), i.e. the fitted results rather than the bare
# SARIMAX specification.
print(model.summary())

In [None]:
# ACF (top) and PACF (bottom) of the model residuals, up to lag 50.
res = model.resid
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
acf_axis, pacf_axis = ax
fig = plot_acf(res, ax=acf_axis, lags=50)
fig = plot_pacf(res, ax=pacf_axis, lags=50, method='ywm')
plt.show()

# Focus on first wave period of the pandemic

In [None]:
# Daily mean of the numeric columns. numeric_only=True is needed because
# `google` carries the string column 'day_name'; pandas >= 2.0 raises a
# TypeError when mean() meets a non-numeric column instead of dropping it.
google_daily = google.groupby(by="date").mean(numeric_only=True).reset_index()

# Date window passed to helper.select_daterange for the first wave.
START_DATE = "2020-03-23"
END_DATE = "2020-05-04"

fw_trends = helper.select_daterange(google_daily, START_DATE, END_DATE).reset_index(drop=True)
# `covid_df` has to be loaded before this cell runs (see the Load Data section).
fw_stats = helper.select_daterange(covid_df, START_DATE, END_DATE).reset_index(drop=True)

## Histograms of the features

In [None]:
# Every column after the first one (the date column).
cols_of_interest = list(fw_trends.columns[1:])

# subplots/figsize/layout/edgecolor/sharex are Matplotlib options, but the
# pandas plotting backend was switched to Plotly earlier in the notebook,
# which rejects them. Select Matplotlib explicitly for this call.
# NOTE(review): with subplots=True the `title` list must have one entry per
# plotted column - confirm helper.TITLES matches cols_of_interest.
fw_trends[cols_of_interest].plot.hist(
    backend='matplotlib',
    subplots=True,
    bins=25,
    figsize=(20, 10),
    layout=(-1, 3),
    edgecolor='black',
    grid=False,
    sharex=True,
    legend=False,
    title=helper.TITLES,
);

## Skewness

In [None]:
# Skewness of every numeric feature in the first-wave window, largest first.
fw_skewness = fw_trends.skew(skipna=True, numeric_only=True)
fw_skewness.sort_values(ascending=False)

### TODO: Check if normalizing is needed

In [None]:
# Min-max scale the two selected features to the [0, 1] range (displayed only).
skewed = fw_trends[
    [
        'retail_and_recreation_percent_change_from_baseline',
        'workplaces_percent_change_from_baseline',
    ]
]
column_min = skewed.min()
column_span = skewed.max() - column_min
skewed.sub(column_min).div(column_span)

## Feature Correlation

In [None]:
# Using Spearman rank correlation instead of Pearson to deal with outliers.
# The first column (date) is excluded from the correlation matrix.
feature_columns = fw_trends.columns[1:]
spearman_corr = fw_trends.iloc[:, 1:].corr(method='spearman')

# Use the notebook's own `px` import, like the other plotting cells, rather
# than reaching into helper's namespace.
fig = px.imshow(
    spearman_corr,
    color_continuous_scale='RdBu_r',
    text_auto='.2f',
    height=600,
)
# Replace the raw column names on both axes with the titles from helper.TITLES.
fig.update_layout(
    title='Feature Relationships',
    xaxis=dict(
        tickmode='array',
        tickvals=feature_columns,
        ticktext=helper.TITLES,
        tickangle=-30
    ),
    yaxis=dict(
        tickmode='array',
        tickvals=feature_columns,
        ticktext=helper.TITLES,
    )
)
fig.show()

### Examine the distribution of the features with the highest correlation

In [None]:
# Overlaid, semi-transparent histograms of the two selected features.
# figsize/alpha are Matplotlib options, and the pandas plotting backend was
# switched to Plotly earlier in the notebook, so Matplotlib is selected
# explicitly for this call.
fw_trends[
    [
        "grocery_and_pharmacy_percent_change_from_baseline",
        "transit_stations_percent_change_from_baseline",
    ]
].plot.hist(backend='matplotlib', figsize=(10, 8), alpha=0.5);


### Scatterplot of the features with LOWESS trendline

In [None]:
def trendline_scatter(df, X, Y, trend_line):
    """Show and return a scatter plot of two columns with a trendline.

    Args:
        df: DataFrame that holds both columns.
        X: Name of the column plotted on the x-axis.
        Y: Name of the column plotted on the y-axis.
        trend_line: Value forwarded to Plotly Express as ``trendline``
            (for example ``"lowess"``).

    Returns:
        The Plotly figure, after it has been displayed with ``fig.show()``.
    """
    # Use the notebook's own `px` import, like the other plotting cells,
    # rather than reaching into helper's namespace.
    fig = px.scatter(data_frame=df, x=X, y=Y, trendline=trend_line)

    # helper.TITLES is indexed by the column position minus one
    # (presumably because the leading date column has no title).
    x_index = df.columns.get_loc(X) - 1
    y_index = df.columns.get_loc(Y) - 1
    fig.update_layout(
        xaxis_title_text=helper.TITLES[x_index],
        yaxis_title_text=helper.TITLES[y_index],
        height=500,
    )

    fig.show()
    return fig

In [None]:
# Retail & recreation against transit stations, with a LOWESS trendline.
results = trendline_scatter(
    df=fw_trends,
    X="retail_and_recreation_percent_change_from_baseline",
    Y="transit_stations_percent_change_from_baseline",
    trend_line="lowess",
)