# Time Series Example

In [None]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

from src.data import make_dataset
from pathlib import Path

# Repository root: the directory one level above the notebook's working directory
project_dir = Path().resolve().parent

## Read data

Let's import the data. We won't do anything with it yet; we just look at it.

In [None]:
import pandas as pd

# Locate the raw CSV under <project_dir>/data/raw and read it into a DataFrame
raw_csv_path = project_dir / 'data' / 'raw' / 'jena_climate_2009_2016.csv'
df = pd.read_csv(raw_csv_path)

df

In [None]:
# Summary statistics of the numeric columns (a first look at ranges and outliers)
df.describe()

In [None]:
# Column dtypes and non-null counts (shows which columns still need conversion)
df.info()

## Wrangling

Now let's dive a little deeper.

In [None]:
# Parse the textual 'Date Time' column (object) into a new 'datetime' column (datetime64).
# The format is spelled out explicitly: day.month.year hour:minute:second
datetime_format = '%d.%m.%Y %H:%M:%S'
df['datetime'] = pd.to_datetime(df['Date Time'], format=datetime_format)

df

### Check date range

It looks like the date time column has a 10-minute interval; let's check this.

In [None]:
# The timestamps we would expect if every 10-minute step between the first and
# last observation were present exactly once
first_timestamp = df['datetime'].min()
last_timestamp = df['datetime'].max()
date_range = pd.date_range(start=first_timestamp, end=last_timestamp, freq='10min')

# Report only when the observed row count differs from the expected count
if len(df) != len(date_range):
    print(f"the dataframe ({df.shape[0]}) and date_range ({date_range.shape[0]}) do NOT have the same length.")

Even if the dataframe has the same length as the `date_range`, there could still be something wrong:
- the number of duplicated timestamps could match the number of missing timestamps
- the duplicates would then "fill in" for the missing values, so the lengths agree even though the index is incomplete

### Handle duplicate values

In [None]:
# Every row whose 'datetime' occurs more than once (all occurrences are kept
# via keep=False), ordered by timestamp so repeated entries sit next to each other
is_repeated = df['datetime'].duplicated(keep=False)
df.loc[is_repeated].sort_values(by='datetime')

So there are 327 rows whose datetime occurs twice (or more); let's drop the duplicates.

In [None]:
# Drop the duplicates, keeping the first row seen for each timestamp.
# Deduplicate on 'datetime' rather than on the whole row: the index built from
# this column must be unique, and a whole-row comparison would keep two rows
# that share a timestamp but differ in any measurement.
df = df.drop_duplicates(subset='datetime', keep='first')

print(f"df.shape == {df.shape}")

# fail loudly if a timestamp still occurs more than once
assert df['datetime'].is_unique, "duplicate timestamps remain after drop_duplicates"

# sanity check (there should be 0 duplicates now)
df[df['datetime'].duplicated(keep=False)].sort_values(by='datetime')

### Handle index

In [None]:
# set the index
ts = df.set_index('datetime')

# show the new index
ts.index

The index shows `freq=None`, i.e. no regular frequency is attached to it. Since we've already removed the duplicates, any remaining difference from the 10-minute `date_range` above means that timestamps are missing. Let's fix the missing values.

### Handle missing values

In [None]:
# drop the "old" date time column
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise a KeyError
ts = ts.drop(columns="Date Time", errors="ignore")

# resample to a regular 10-minute grid; timestamps that were missing are
# created and forward-filled with the last known observation
ts = ts.resample('10min').ffill()

# sanity check (check if the index is correct)
ts.index

In [None]:
# sanity check: select the rows with a NaN in at least one column
# (the selection should come back empty)
has_missing = ts.isna().any(axis=1)
ts.loc[has_missing]

Nice, now the start and end dates match, as well as the frequency.

### Visualise the temperature

In [None]:
# Using graph_objects
import plotly.graph_objects as go

# The temperature column plotted against the datetime index
temperature_trace = go.Scatter(x=ts.index, y=ts['T (degC)'])

# Range-selector buttons: zoom to the last day / week / month / year, or show everything
range_buttons = [
    dict(count=1, label="1d", step="day", stepmode="backward"),
    dict(count=7, label="1w", step="day", stepmode="backward"),
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]

temperature_fig = go.Figure([temperature_trace])
temperature_fig.update_layout(
    title='Timeseries',
    xaxis_title="Date time",
    yaxis_title="T (degC)",
)
temperature_fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(buttons=range_buttons),
)
temperature_fig.show()

## Analyse

### Decompose

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
# import plotly.io as pio
from statsmodels.tsa.seasonal import DecomposeResult

def plot_seasonal_decompose(result:DecomposeResult, dates:pd.Series=None, title:str="Seasonal Decomposition"):
    """Plot the four parts of a seasonal decomposition as stacked line charts.

    Parameters
    ----------
    result : DecomposeResult
        Decomposition whose ``observed``, ``trend``, ``seasonal`` and ``resid``
        attributes are drawn, one per subplot row (in that order).
    dates : optional
        Values for the shared x-axis. When ``None``, the positions
        ``0 .. len(result.observed) - 1`` are used instead.
    title : str
        Title of the figure.

    Returns
    -------
    The plotly figure (4 rows, 1 column). The figure is returned, not shown.
    """
    if dates is None:
        x_values = np.arange(len(result.observed))
    else:
        x_values = dates

    # One entry per subplot row: (row, series, trace name, y-axis title).
    # Note the residual trace is named 'Residual' while its axis reads 'Residuals'.
    panels = [
        (1, result.observed, 'Observed', 'Observed'),
        (2, result.trend, 'Trend', 'Trend'),
        (3, result.seasonal, 'Seasonal', 'Seasonal'),
        (4, result.resid, 'Residual', 'Residuals'),
    ]

    fig = make_subplots(
        rows=4,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.025
    )
    for row, series, trace_name, _ in panels:
        fig.add_trace(
            go.Scatter(x=x_values, y=series, mode="lines", name=trace_name),
            row=row,
            col=1,
        )
    fig.update_layout(
        height=600,
        title=title,
        showlegend=False
    )

    # edit axis labels: the first y-axis is keyed 'yaxis', the others 'yaxis2', 'yaxis3', ...
    for row, _, _, axis_title in panels:
        axis_key = 'yaxis' if row == 1 else f'yaxis{row}'
        fig['layout'][axis_key]['title'] = axis_title

    return fig

In [None]:
import statsmodels.api as sm

# statsmodels can't handle the 10 minute '10T' frequency from pandas, so the
# period is set manually: the number of 10-minute steps in one 365-day year
period = 365 * 24 * 60 // 10

# decompose the temperature series using that yearly period
temperature = ts['T (degC)']
decomposition = sm.tsa.seasonal_decompose(x=temperature, period=period)

# show decomposition
fig = plot_seasonal_decompose(decomposition, dates=ts.index)
fig.show()