In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from config import GOOGLE_PATH, WEATHER_PATH, POP_PATH, FLU_PATH
import json
from datetime import datetime

In [11]:
# Load the raw CSV inputs (paths are relative to the notebook's directory).
# Order matters: each path is read into the name at the same position below.
raw_csv_paths = (
    '../data/INFLUENZA_sentinella/data.csv',
    '../data/weather/reg_weather.csv',
    '../data/google_search_trend/reg_google_grippe.csv',
    '../data/google_search_trend/reg_google_fieber_husten.csv',
    '../data/pop_data_cantons/weekly_imputed_pop_data_final.csv',
)
flu, weather, google_flu, google_symptoms, pop = map(pd.read_csv, raw_csv_paths)

In [5]:
# Country-level observations, aggregated over all sexes and age groups
flu_CH = flu.query('georegion_type == "country" and agegroup == "all" and sex == "all"').copy()

# Columns required for the regional analysis
selected_cols = ['temporal', 'georegion', 'popExtrapolation']

# Regional observations (all sexes / age groups): discard rows whose region is
# 'unknown', keep only the analysis columns and give the case count a clearer name.
flu_reg = (
    flu
    .query('georegion_type == "sentinella_region" and agegroup == "all" and sex == "all"')
    .loc[lambda frame: frame['georegion'] != 'unknown', selected_cols]
    .rename(columns={'popExtrapolation': 'flu_cases'})
)

In [6]:
# Align time-indexes of google data and flu data:
# 'date_dict.json' maps each 'Woche' label of the Google exports to the week
# label used for joining with the flu data.
with open('date_dict.json', 'r') as date_file:
    date_dict = json.load(date_file)

# A plain dict lookup is used on purpose: an unmapped label raises KeyError
# instead of silently producing a missing value.
for trends_frame in (google_flu, google_symptoms):
    trends_frame['Woche'] = trends_frame['Woche'].map(lambda label: date_dict[label])

In [7]:
def _iso_week_label(day):
    """Convert a 'YYYY-MM-DD' string to its 'YYYY-Www' ISO week label."""
    iso_year, iso_week, _ = datetime.strptime(day, '%Y-%m-%d').isocalendar()
    return f'{iso_year}-W{iso_week:02d}'


# Replace the calendar dates of the weather table by ISO week labels
dates = weather.date.values
iso_week_dates = [_iso_week_label(day) for day in dates]
weather['date'] = iso_week_dates

In [8]:
# Reshape google_flu: wide -> long -> one column per query variable.
# Each original header has the form '<region>_<query>', where the region is the
# first two '_'-separated tokens and the query is everything after them.
google_flu = (
    google_flu
    # Wide to long: one row per (week, original column header)
    .melt(id_vars=['Woche'], var_name='region_query', value_name='search_activity')
    # Separate region and query information from the header into separate columns
    .assign(
        region=lambda long: long['region_query'].map(lambda header: "_".join(header.split('_')[:2])),
        query=lambda long: long['region_query'].map(lambda header: "_".join(header.split('_')[2:])),
    )
    # The combined header is superfluous once it has been split
    .drop(columns='region_query')
    # Long to wide again: one column per query variable, keyed by week and region
    .pivot(index=['Woche', 'region'], columns='query', values='search_activity')
    .reset_index()
)

In [9]:
# Reshape google_symptoms: wide -> long -> one column per query variable.
# Each original header has the form '<region>_<query>', where the region is the
# first two '_'-separated tokens and the query is everything after them.
google_symptoms = (
    google_symptoms
    # Wide to long: one row per (week, original column header)
    .melt(id_vars=['Woche'], var_name='region_query', value_name='search_activity')
    # Separate region and query information from the header into separate columns
    .assign(
        region=lambda long: long['region_query'].map(lambda header: "_".join(header.split('_')[:2])),
        query=lambda long: long['region_query'].map(lambda header: "_".join(header.split('_')[2:])),
    )
    # The combined header is superfluous once it has been split
    .drop(columns='region_query')
    # Long to wide again: one column per query variable, keyed by week and region
    .pivot(index=['Woche', 'region'], columns='query', values='search_activity')
    .reset_index()
)

In [10]:
# Display the population table read from weekly_imputed_pop_data_final.csv
pop

Unnamed: 0,Kanton,Bevölkerung 1. Januar,Geburten,Todesfälle,Überschuss,Wanderungssaldo international 1,Wanderungssaldo interkantonal,Bevölkerung 31. Dezember,total,% over Year,...,Ausserhalb des Einflusses städtischer Kerne,BIP in Millionen CHF,BIP Growth Laufende Preise,BIP Growth Preise Vorjahr,BIP pro Kopf,Woche,Bevölkerung,% over week,Jahr-Woche,Pop Density
0,Aargau,627340,124.0,88.0,36.0,95.0,47.0,636362,174.0,1.438136,...,93486.0,41229.16645,2.28242,1.93227,64788.856736,1,6.273400e+05,,2013-W01,446.823362
1,Aargau,627340,124.0,88.0,36.0,95.0,47.0,636362,174.0,1.438136,...,93512.0,41229.16645,2.28242,1.93227,64788.856736,2,6.275169e+05,0.028199,2013-W02,446.949360
2,Aargau,627340,124.0,88.0,36.0,95.0,47.0,636362,174.0,1.438136,...,93538.0,41229.16645,2.28242,1.93227,64788.856736,3,6.276938e+05,0.028191,2013-W03,447.075359
3,Aargau,627340,124.0,88.0,36.0,95.0,47.0,636362,174.0,1.438136,...,93565.0,41229.16645,2.28242,1.93227,64788.856736,4,6.278707e+05,0.028183,2013-W04,447.201357
4,Aargau,627340,124.0,88.0,36.0,95.0,47.0,636362,174.0,1.438136,...,93591.0,41229.16645,2.28242,1.93227,64788.856736,5,6.280476e+05,0.028175,2013-W05,447.327356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14089,Zürich,1564662,297.0,232.0,65.0,315.0,-68.0,1579967,294.0,0.978167,...,10401.0,,,,,48,1.578767e+06,0.019012,2022-W48,913.109663
14090,Zürich,1564662,297.0,232.0,65.0,315.0,-68.0,1579967,294.0,0.978167,...,10403.0,,,,,49,1.579067e+06,0.019008,2022-W49,913.283231
14091,Zürich,1564662,297.0,232.0,65.0,315.0,-68.0,1579967,294.0,0.978167,...,10405.0,,,,,50,1.579367e+06,0.019005,2022-W50,913.456798
14092,Zürich,1564662,297.0,232.0,65.0,315.0,-68.0,1579967,294.0,0.978167,...,10407.0,,,,,51,1.579667e+06,0.019001,2022-W51,913.630366


In [79]:
# Combine both Google Trends tables; only (region, week) pairs present in both are kept
merged_google = google_flu.merge(google_symptoms, how='inner', on=['region', 'Woche'])

# Attach weather and search-trend features to every regional flu observation.
# Left joins keep every flu row even when no weather / search data matches.
# NOTE(review): if weather or merged_google contain repeated (region, week) keys,
# the left joins duplicate flu rows - confirm the key uniqueness of both tables.
merged_data = (
    flu_reg
    .merge(weather, how='left', left_on=['temporal', 'georegion'], right_on=['date', 'region'])
    .sort_values(by=['georegion', 'temporal'])
    .merge(merged_google, how='left', left_on=['georegion', 'temporal'], right_on=['region', 'Woche'])
)

In [88]:
# Inspect the column names of the merged frame
merged_data.columns

Index(['temporal', 'georegion', 'flu_cases', 'region_x', 'date',
       'weighted_avg_daily_temp', 'weighted_avg_daily_hum',
       'weighted_max_daily_temp', 'weighted_min_daily_temp',
       'weighted_max_daily_hum', 'weighted_min_daily_hum', 'Woche', 'region_y',
       'Flu', 'Flu_resid', 'Flu_seasonal', 'Flu_trend', 'Grippe',
       'Grippe_resid', 'Grippe_seasonal', 'Grippe_trend', 'Influenza',
       'Influenza_resid', 'Influenza_seasonal', 'Influenza_trend', 'influenza',
       'influenza_resid', 'influenza_seasonal', 'influenza_trend', 'Fieber',
       'Fieber_resid', 'Fieber_seasonal', 'Fieber_trend', 'fièvre',
       'fièvre_resid', 'fièvre_seasonal', 'fièvre_trend', 'husten',
       'husten_resid', 'husten_seasonal', 'husten_trend', 'toux', 'toux_resid',
       'toux_seasonal', 'toux_trend'],
      dtype='object')

In [89]:
# Remove the right-hand join keys duplicated by the merges ('temporal' and
# 'georegion' already carry the same information).
# Rebinding with errors='ignore' instead of an in-place drop keeps this cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
merged_data = merged_data.drop(columns=['region_x', 'region_y', 'date', 'Woche'], errors='ignore')

In [90]:
# Display the merged frame
merged_data

Unnamed: 0,temporal,georegion,flu_cases,weighted_avg_daily_temp,weighted_avg_daily_hum,weighted_max_daily_temp,weighted_min_daily_temp,weighted_max_daily_hum,weighted_min_daily_hum,Flu,...,fièvre_seasonal,fièvre_trend,husten,husten_resid,husten_seasonal,husten_trend,toux,toux_resid,toux_seasonal,toux_trend
0,2013-W01,region_1,16876.1,3.044123,82.918337,6.597668,-1.041057,95.043840,63.721907,,...,,,,,,,,,,
1,2013-W02,region_1,33045.6,1.899121,85.732108,4.005557,-0.232400,94.972297,74.365709,6.694163,...,7.554113,20.241099,3.724624,-6.176678,2.571313,7.329989,30.496017,2.831515,13.093592,14.570910
2,2013-W03,region_1,37204.9,-1.508554,77.031993,0.870462,-4.233867,90.875307,63.741066,15.731896,...,-5.079976,20.163192,7.977211,3.287695,-2.697615,7.387131,10.827069,-0.599317,-3.118453,14.544839
3,2013-W04,region_1,36421.1,-0.759157,77.826791,2.469204,-4.891194,90.988158,60.461963,12.886953,...,-3.220311,20.085231,16.034321,3.831175,4.759199,7.443947,20.094055,-1.412146,6.987013,14.519188
4,2013-W05,region_1,39004.7,4.551592,78.802057,7.993836,0.627968,92.950733,58.114673,20.251145,...,8.369157,20.007216,3.797554,-6.717551,3.014675,7.500430,11.163364,-1.808321,-1.522283,14.493968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403,2023-W42,region_6,14349.7,,,,,,,0.000000,...,0.000000,0.000000,14.808454,7.342648,-4.320877,11.786683,0.000000,-0.022954,-0.426565,0.449520
3404,2023-W43,region_6,13789.4,,,,,,,0.000000,...,0.000000,0.000000,5.925534,-0.478980,-5.410189,11.814702,0.000000,-0.171670,-0.269120,0.440791
3405,2023-W44,region_6,9232.8,,,,,,,0.000000,...,0.000000,0.000000,5.925534,0.224958,-6.142114,11.842689,41.462083,41.346561,-0.316517,0.432039
3406,2023-W45,region_6,13692.8,,,,,,,0.000000,...,0.000000,0.000000,12.838656,-0.333561,1.301569,11.870648,0.000000,-0.190492,-0.232774,0.423265


In [7]:
def create_lagged_features(df, column, number_of_lags=4, group_col='georegion'):
    """Add per-group lagged copies of ``column`` as new ``lag_<i>`` columns.

    Lags are computed separately inside each group of ``group_col`` so that
    values never leak from one group into another. A lag is a shift by row
    position within the group, so the rows of each group must already be in
    chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to lag.
    number_of_lags : int, default 4
        Number of lag columns to create (``lag_1`` ... ``lag_<number_of_lags>``).
    group_col : str, default 'georegion'
        Column identifying the groups within which the shift is applied.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in its original row order with the lag columns appended
        and a fresh 0..n-1 index. The first ``i`` rows of every group hold NaN
        in ``lag_<i>``.
    """
    # Shift the selected column per group instead of using groupby.apply on the
    # whole frame: this keeps the grouping column in the result regardless of
    # the pandas version and avoids the groupby.apply deprecation warning.
    grouped_values = df.groupby(group_col)[column]
    lag_columns = {
        f'lag_{lag}': grouped_values.shift(lag)
        for lag in range(1, number_of_lags + 1)
    }

    # assign() returns a new frame; reset the index to flatten the structure
    return df.assign(**lag_columns).reset_index(drop=True)

# Build the lagged flu-case features and discard every row containing a NaN
# (the first rows of each region have no complete lag history).
df_lagged = create_lagged_features(flu_reg, column='flu_cases').dropna()

In [205]:
# Preview the lagged frame ordered by region, then week. The sorted copy is
# only displayed; df_lagged itself keeps its current row order.
df_lagged.sort_values(by=['georegion', 'temporal'])

Unnamed: 0,temporal,georegion,flu_cases,lag_1,lag_2,lag_3,lag_4
24,2013-W05,region_1,39004.7,36421.1,37204.9,33045.6,16876.1
30,2013-W06,region_1,40782.5,39004.7,36421.1,37204.9,33045.6
36,2013-W07,region_1,36076.5,40782.5,39004.7,36421.1,37204.9
42,2013-W08,region_1,32491.9,36076.5,40782.5,39004.7,36421.1
48,2013-W09,region_1,34254.9,32491.9,36076.5,40782.5,39004.7
...,...,...,...,...,...,...,...
3383,2023-W42,region_6,14349.7,14887.7,13808.3,13698.0,13493.3
3389,2023-W43,region_6,13789.4,14349.7,14887.7,13808.3,13698.0
3395,2023-W44,region_6,9232.8,13789.4,14349.7,14887.7,13808.3
3401,2023-W45,region_6,13692.8,9232.8,13789.4,14349.7,14887.7


In [147]:
# Order observations chronologically so the positional split below is a true
# temporal hold-out. 'temporal' holds zero-padded 'YYYY-Www' labels, so the
# lexicographic order is the chronological one; the stable sort keeps the
# existing region order within a week.
df_ordered = df_lagged.sort_values(by='temporal', kind='stable')

# Define features and target variable.
# Only the numeric lag_* columns are used as features: 'temporal' and
# 'georegion' are string identifiers, and leaving them in would turn X into an
# object array that a scaler / regressor cannot consume.
feature_cols = [col for col in df_ordered.columns if str(col).startswith('lag_')]
X = df_ordered[feature_cols].values  # Features (lagged flu cases)
y = df_ordered['flu_cases'].values  # Target

# Split the data: the earliest 80% of rows train the model, the latest 20% test it
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

NameError: name 'df' is not defined