# Forecasting

In [None]:
#load hourly resampled & merged df
import pandas as pd
import urllib.request
# Fetch the exported CSV to the working directory, then read it with its first column as index
csv_url = 'https://docs.google.com/uc?export=download&id=1--eG9u-siOXAwVvB_P_t4gxkzYcwILMa'
csv_path = 'merge_export41_meteoLC102_hourly.csv'
urllib.request.urlretrieve(csv_url, csv_path)
df_hourly = pd.read_csv(csv_path, index_col=0)
df_hourly.head()

## Distinguishing between true missing values and no-event 'observations'
The time series seems to contain many missing values for noise event variables. Not all of these are true missing values, since not registering an event may simply mean there is no event. In an effort to distinguish between true missing values and no-event 'observations', we first determine the missing time periods. Considering only the gaps > 1 day, we note the following:

For most locations, gaps are nearly always >= 6 days; the few exceptions have a length of 1-2 days. These exceptions may still (largely) consist of true no-event 'observations'.

-> (arbitrary) cutoff at 2 days?

MP08bis - Vrijthof is the anomaly, with 55 missing periods (vs. at most 5 for the other locations), half of which are < 3 days and 75% of which are < 6 days. This is likely the result of its location compared to that of the other locations (courtyard of the Town Hall vs. along the Naamsestraat).

-> (very arbitrary) cutoff at 7 days?

On another note: Interestingly, most of the (supported) events registered at Vrijthof are classified as human singing. This may be linked to the nearby presence of Het Radiohuis.

In [None]:
import numpy as np
from datetime import timedelta

def add_end(ts, end_date = '2023-01-01 00:00:00'):
  '''
  Append one artificial closing row to a time series.

  The closing row is a copy of the first row of 'ts' in which every 0/1 value is replaced
  by NaN and 'begin_date' is set to 'end_date'. The input DataFrame itself is not modified.

  Arguments:
    ts: DataFrame with (or to be given) a 'begin_date' column.
    end_date: timestamp (anything accepted by pd.to_datetime) written to 'begin_date' of the
              closing row. Defaults to the previously hard-coded '2023-01-01 00:00:00'.

  Returns a new DataFrame: the rows of 'ts' followed by the closing row (original index labels kept).
  '''
  # replace() already returns a new object, so the assignment below never touches 'ts'
  upper = ts.iloc[0:1, :].replace([0, 1], np.nan)
  upper['begin_date'] = pd.to_datetime(end_date)
  return pd.concat([ts, upper])

def missing_or_noevent(ts, cutoff = 2, cutoff_Vrijthof = 7):
  '''
  This function fills some of the nans in the noise event variables with zeroes, based on the specified cutoff values.

  A missing time period is a run of consecutive timestamps (per location) without a 'human_noise' value.
  The argument 'cutoff_Vrijthof' is used to specify the maximum length of a time period with missing values (in days!)
  before it is considered truly missing for MP08bis, 'cutoff' does the same for all other MPs.
  Nans outside the truly missing periods are replaced by 0 in every column whose name contains 'noise'.
  'Unsupported' category is assumed to consist of unclassifiable events and is thus treated as an event, not missing.

  Arguments:
    ts: DataFrame with at least the columns 'timestamp', 'location_csv' and 'human_noise'. It is not modified.
    cutoff: cutoff in days for all locations except MP08bis - Vrijthof.
    cutoff_Vrijthof: cutoff in days for MP08bis - Vrijthof.

  It returns two DataFrames: the adapted copy of the input DataFrame (Vrijthof rows first, then the other MPs),
  and a DataFrame with all the missing time periods ('location_csv', 'begin_date', 'end_date', 'timedelta').
  The latter is not filtered by the specified cutoffs.
  '''
  vrijthof = '280324_mp08bis---vrijthof.csv'

  # Parse the timestamps on a new frame, so the caller's DataFrame is left untouched
  ts = ts.assign(timestamp = pd.to_datetime(ts['timestamp']))
  ts = ts.sort_values(['location_csv','timestamp']).reset_index(drop=True)

  # Get time series sampling frequency
  # (assumes the first two sorted rows are consecutive samples of the same location)
  freq = pd.to_timedelta(ts.loc[1, 'timestamp'] - ts.loc[0, 'timestamp'])

  # Construct a df with the missing time periods
  missing_time = pd.DataFrame()
  observed = ts.dropna(subset = ['human_noise'])
  missing_time[['location_csv', 'begin_date']] = observed[['location_csv', 'timestamp']]
  # Close every location's series with an artificial end row, so a trailing gap is detected as well.
  # Iterating over the groups (instead of groupby.apply) keeps the 'location_csv' column in every pandas version.
  closed = [add_end(group) for _, group in missing_time.groupby('location_csv')]
  if closed:
    missing_time = pd.concat(closed)
  missing_time = missing_time.sort_values(["location_csv","begin_date"]).reset_index(drop=True)
  # end_date = next observed timestamp; begin_date = first timestamp after the current observation
  missing_time['end_date'] = missing_time['begin_date'].shift(-1)
  missing_time['begin_date'] = missing_time['begin_date'] + freq
  # Distance to the next observation of the same location (NaN on the last row of every location)
  missing_time['timedelta'] = missing_time.groupby("location_csv")["begin_date"].diff().shift(-1)
  missing_time = missing_time.dropna()
  # Keep only real gaps (more than one sampling step apart) and store the gap length itself
  missing_time = missing_time[missing_time['timedelta'] > freq].copy()
  missing_time['timedelta'] = missing_time['timedelta'] - freq

  # Filter by cutoff values
  true_na_vh = missing_time.loc[(missing_time['timedelta'] > timedelta(days = cutoff_Vrijthof)) & (missing_time['location_csv'] == vrijthof)]
  true_na_other = missing_time.loc[(missing_time['timedelta'] > timedelta(days = cutoff)) & (missing_time['location_csv'] != vrijthof)]

  col_to_fill = ts.columns[ts.columns.str.contains('noise')].values.tolist()

  ts_vh = ts.loc[ts['location_csv'] == vrijthof].copy()
  ts_other = ts.loc[ts['location_csv'] != vrijthof].copy()

  # Add column true_na yes/no (less intervals to check for true nans than false nans)
  # If timestamp not in true_na, replace any nans with 0s

    #Vrijthof
  ts_vh['true_na'] = ts_vh['timestamp'].apply(lambda t: any((true_na_vh["begin_date"] <= t) & (true_na_vh["end_date"] > t)))
  ts_vh.loc[ts_vh['true_na'] == 0, col_to_fill] = ts_vh.loc[ts_vh['true_na'] == 0, col_to_fill].fillna(0)

    #other MPs
  other = ts_other['location_csv'].drop_duplicates().tolist()
  for MP in other:
    is_mp = ts_other['location_csv'] == MP
    # Select this location's truly missing periods once, not once per timestamp
    periods = true_na_other.loc[true_na_other['location_csv'] == MP]
    ts_other.loc[is_mp, 'true_na'] = ts_other.loc[is_mp, 'timestamp'] \
                          .apply(lambda t: any((periods["begin_date"] <= t) & (periods["end_date"] > t)))
  # Without any non-Vrijthof rows the loop never creates 'true_na'; there is nothing to fill then
  if 'true_na' in ts_other.columns:
    ts_other.loc[ts_other['true_na'] == 0, col_to_fill] = ts_other.loc[ts_other['true_na'] == 0, col_to_fill].fillna(0)

  filled = pd.concat([ts_vh, ts_other]).drop('true_na', axis = 1)

  return filled, missing_time

# Fill 'no event' nans with zeroes using the default cutoffs (2 days, 7 days for Vrijthof);
# 'missing' holds all detected missing time periods (not filtered by the cutoffs)
df, missing = missing_or_noevent(df_hourly)

## Feature building
Could still be added: window features, such as 'number of time periods with noise event, out of the last 6 time periods' 

In [None]:
import holidays
from datetime import timedelta

# DEV: 
## GPT prompt:
## if I have a pandas dataframe with timestamps (UTC), can you suggest some additional features to engineer? (for example: weekend, weekday, Belgian holiday, season, ...)
## Can you provide python3 code on an imaginary dataframe?

df["timestamp"] = pd.to_datetime(df["timestamp"])

# Add weekday/weekend feature (1 on Saturday/Sunday, else 0)
df["is_weekend"] = df["timestamp"].dt.weekday.isin([5, 6]).astype(int)

# Add day of the week feature
df["day_of_week"] = df["timestamp"].dt.day_name()

# Add hour of the day feature
df["hour_of_day"] = df["timestamp"].dt.hour

# Add time of day feature
# Left-closed bins give four equal 6-hour blocks: [0, 6) Night, [6, 12) Morning,
# [12, 18) Afternoon, [18, 24) Evening. (Right-closed bins starting at -1 put hour 6 in
# "Night" and produced blocks of 7, 6, 6 and 5 hours.)
df["time_of_day"] = pd.cut(
    df["timestamp"].dt.hour,
    bins=[0, 6, 12, 18, 24],
    right=False,
    labels=["Night", "Morning", "Afternoon", "Evening"],
)

# Add season feature
# NOTE: seasons are approximated by calendar quarters (Jan-Mar = "Winter", ..., Oct-Dec = "Fall")
df["season"] = pd.cut(
    df["timestamp"].dt.month,
    bins=[0, 3, 6, 9, 12],
    labels=["Winter", "Spring", "Summer", "Fall"],
)

# Add month feature
df["month"] = df["timestamp"].dt.month_name()

# Add day of the month feature
df["day_of_month"] = df["timestamp"].dt.day

# Add quarter feature
df["quarter"] = "Q" + df["timestamp"].dt.quarter.astype(str)

# Add Belgian holidays feature
# holidays.BE() only fills in a year once a date of that year is looked up, so the lookup
# has to go through `in`; Series.isin() on the still-empty object would never match.
be_holidays = holidays.BE()
calendar_dates = df["timestamp"].dt.date
holiday_by_date = {day: day in be_holidays for day in calendar_dates.dropna().unique()}
df["is_be_holiday"] = calendar_dates.map(lambda day: holiday_by_date.get(day, False)).astype(int)

# Add business day feature (neither weekend nor Belgian holiday)
# Compare the 0/1 flags explicitly: `~` on an integer column is a bitwise NOT, not a logical one
df["is_business_day"] = (df["is_weekend"] == 0) & (df["is_be_holiday"] == 0)


In [None]:
# Flag the rows where the response variable itself is missing
df['missing'] = df['human_noise'].isna()

# Exam & vacation periods taken from the academic calendars 2021-2022 & 2022-2023 of the KU Leuven & KU Leuven Group T
# https://www.kuleuven.be/over-kuleuven/kalenders/kalenders-21-22 & https://www.kuleuven.be/over-kuleuven/kalenders 
# vacation = periods with no lessons or exams lasting at least 1 week
# One (begin_date, end_date, type) tuple per period
academic_periods = [
    ("2022-01-10", "2022-02-04", "exams"),
    ("2022-05-30", "2022-07-01", "exams"),
    ("2022-01-10", "2022-01-30", "first_exam_weeks"),
    ("2022-05-30", "2022-06-26", "first_exam_weeks"),
    ("2022-01-31", "2022-02-04", "final_exam_week"),
    ("2022-06-27", "2022-07-01", "final_exam_week"),
    ("2022-01-01", "2022-01-13", "vacation"),
    ("2022-02-05", "2022-02-13", "vacation"),
    ("2022-04-02", "2022-04-18", "vacation"),
    ("2022-07-02", "2022-09-25", "vacation"),
    ("2022-12-24", "2022-12-31", "vacation"),
]
kul_ac_year = pd.DataFrame(academic_periods, columns=["begin_date", "end_date", "type"])

for date_col in ("begin_date", "end_date"):
  kul_ac_year[date_col] = pd.to_datetime(kul_ac_year[date_col])
# Let every period run until the last second of its final day
kul_ac_year["end_date"] += pd.Timedelta('23:59:59')

kul_ac_year


def in_academic_period(timestamps, period_type):
  '''Return, per timestamp, whether it lies inside any kul_ac_year period of the given type (bounds inclusive).'''
  periods = kul_ac_year[kul_ac_year["type"] == period_type]
  return timestamps.apply(lambda t: any((periods["begin_date"] <= t) & (periods["end_date"] >= t)))


# "exams" covers the full first & second exam periods; "first_exam_weeks" / "final_exam_week"
# are the alternative split into the 'normal' exam weeks and the last week;
# "student_vacation" marks the student vacation periods
period_features = [
    ("exams", "exams"),
    ("first_exam_weeks", "first_exam_weeks"),
    ("final_exam_week", "final_exam_week"),
    ("student_vacation", "vacation"),
]
for feature_name, period_type in period_features:
  df[feature_name] = in_academic_period(df["timestamp"], period_type)
df.head()

## Forecasting with skforecast and LightGBM

In [None]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import recall_score, precision_score, f1_score

from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries
from skforecast.utils import save_forecaster, load_forecaster

In [None]:
#Create input dfs
# Drop location 08bis due to no human noise events occurring during validation month (November)
# .copy() makes the filtered frame independent, so the column assignment below does not write to a slice
df = df[df['location_csv'] != '280324_mp08bis---vrijthof.csv'].copy()

# Create separate df with target time series (one column per location, indexed by timestamp)
df['timestamp'] = pd.to_datetime(df['timestamp'])
ts_data = pd.pivot_table(data = df, values = 'human_noise', index = 'timestamp', columns = 'location_csv')

# Additional category for missing response -> multiclass for some locations
ts_data = ts_data.fillna(2)
ts_data = ts_data.astype('category')

# Drop last row (not real data) 
ts_data = ts_data.drop(index=ts_data.index[-1])

# Explicitly set hourly frequency of datetime index
ts_data.index.freq = 'H'

# Exogenous variables, split by type (identical for every location, so defined once)
cat_exogs = ['missing',
            'is_weekend',
            'day_of_week',
            'time_of_day',
            'is_be_holiday',
            'is_business_day',
            'season',
            'month',
            'first_exam_weeks', 
            'final_exam_week',
            'student_vacation']
num_exogs = ['hour_of_day',
            'LC_TEMP_QCL3', 
            'LC_WINDSPEED', 
            'LC_RAININ']

# Create dictionary with exogenous data for every location 
# necessary because missing values are location-specific
locations = df['location_csv'].drop_duplicates().tolist()
exog_datasets = {}

for location in locations:
  # Subset by location
  somename = df.loc[df['location_csv'] == location,:]

  # Create df with exogenous vars, indexed and sorted by timestamp
  exog_data  = somename[cat_exogs + num_exogs + ['timestamp']].set_index('timestamp').sort_index()
  exog_data[cat_exogs] = exog_data[cat_exogs].astype('category')

  # Drop last row (not real data) 
  exog_data = exog_data.drop(index=exog_data.index[-1])

  # Explicitly set hourly frequency of datetime index
  exog_data.index.freq = 'H'

  # Add to dictionary
  exog_datasets[location] = exog_data