In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import concurrent.futures

In [None]:
# Read several (x, 1) CSV files from one folder, one dataframe per file
def merge_csvs(file_names, folder_name):
    """Load ``<folder_name><file_name>.csv`` for every entry of ``file_names``.

    Args:
        file_names: iterable of file names without the ``.csv`` extension.
        folder_name: path prefix placed directly in front of each file name
            (no separator is inserted, so it must already end with one).

    Returns:
        A list of dataframes in the same order as ``file_names``. The frames
        are not concatenated here; callers combine them with ``pd.concat``.
    """
    frames = []
    for stem in file_names:
        csv_path = f'{folder_name}{stem}.csv'
        frames.append(pd.read_csv(csv_path))
    return frames

# Build the ILI rates dataframe: one CSV per column, placed side by side
ili_rates_folder = '../../data/influenza_rates_data/'
ili_files = ['weekStarts', 'weekEnds', 'y']
ili_df = pd.concat(
    merge_csvs(file_names=ili_files, folder_name=ili_rates_folder),
    axis='columns',
)

# Build the Google search query (GSQ) frequency dataframe
gsq_freqs_folder = '../../data/csv_data/google_search_query_data/'
gsq_files = ['dates', 'datenums']

# Q.csv is read without a header row; its column labels are taken from the
# 'Query' column of queries.csv
queries = pd.read_csv(f'{gsq_freqs_folder}queries.csv')
qf = pd.read_csv(f'{gsq_freqs_folder}Q.csv', header=None)
qf.columns = queries['Query']

# Date columns first, then one column per query
gsq_df = pd.concat(
    [*merge_csvs(file_names=gsq_files, folder_name=gsq_freqs_folder), qf],
    axis='columns',
)


In [None]:
# Accumulates the interpolated daily ILI rates (one dict per day); it is turned
# into a dataframe once every pair of weeks has been processed
interpolated_data = []


def _week_midpoint(row):
    """Return the integer date number halfway between 'Week Start' and 'Week End'.

    This notebook refers to that midpoint as the week's Thursday.
    """
    week_start = int(row['Week Start'])
    week_end = int(row['Week End'])
    return week_start + ((week_end - week_start) // 2)


# interpolates the ILI rates for all dates between two thursdays for two rows
def interpolate_ili_rate(row1, row2):
    """Linearly interpolate daily ILI rates between the Thursdays of two rows.

    One ``{'Date Number', 'ILI Rate'}`` record is appended to the module-level
    ``interpolated_data`` list for every date in ``[thurs1, thurs2)``; the
    second Thursday itself is excluded so consecutive calls do not duplicate it.

    Args:
        row1: mapping/Series with 'Week Start', 'Week End' and 'ILI Rate' for
            the earlier week.
        row2: same fields for the later week.

    Returns:
        None. Nothing is appended when the second Thursday is not after the
        first one.
    """
    thurs1 = _week_midpoint(row1)
    thurs2 = _week_midpoint(row2)

    # Use the real distance between the two points rather than assuming 7 days,
    # so rows that are not exactly one week apart are still interpolated
    # correctly (and never overshoot the second rate).
    span = thurs2 - thurs1
    if span <= 0:
        return

    ili_rate1 = row1['ILI Rate']
    ili_rate2 = row2['ILI Rate']

    for date in range(thurs1, thurs2):
        # linear interpolation formula: y = y1 + (((y2 - y1) / (x2 - x1)) * (x - x1))
        interpolated_rate = ili_rate1 + (((ili_rate2 - ili_rate1) / span) * (date - thurs1))
        interpolated_data.append({'Date Number': date, 'ILI Rate': interpolated_rate})

# Interpolate the dates between the Thursdays of every two adjacent rows (weeks).
# NOTE: interpolate_ili_rate emits dates in [thurs1, thurs2), so the Thursday of
# the very last week is not part of the daily series.
for next_pos in range(1, len(ili_df)):
    interpolate_ili_rate(ili_df.iloc[next_pos - 1], ili_df.iloc[next_pos])

# One row per interpolated day, with a fresh 0..n-1 index
daily_interpolated_ili_df = pd.DataFrame(interpolated_data).reset_index(drop=True)

# Weekly ILI rates, each placed on the midpoint (Thursday) of its week:
# Thursday = week start + half of the (week end - week start) distance, floored
weekly_thursdays = [
    {
        'Date Number': int(week['Week Start'])
        + ((int(week['Week End']) - int(week['Week Start'])) // 2),
        'ILI Rate': week['ILI Rate'],
    }
    for week in (ili_df.iloc[pos] for pos in range(len(ili_df)))
]

weekly_thursdays_df = pd.DataFrame(weekly_thursdays)

# Plot the weekly (Thursday) series and the daily interpolated series, each in
# its own figure, and save both as PNG files under graphs/
for frame, colour, heading, out_path in (
    (weekly_thursdays_df, 'b', 'Weekly ILI Rates (Thursdays)',
     'graphs/weekly_ili_rates_thursdays.png'),
    (daily_interpolated_ili_df, 'r', 'Daily ILI Rates (Interpolated)',
     'graphs/daily_ili_rates_interpolated.png'),
):
    plt.figure(figsize=(8, 4))
    plt.plot(frame['Date Number'], frame['ILI Rate'], linestyle='-', color=colour)
    plt.title(heading)
    plt.xlabel('Date Number')
    plt.ylabel('ILI Rate')
    plt.tight_layout()
    plt.savefig(out_path)

plt.show()

In [None]:
# Keep only the rows whose date number lies in a closed interval
def filter_df(df, start_date_num, end_date_num):
    """Return the rows of ``df`` with start <= 'Date Number' <= end.

    Args:
        df: dataframe containing a 'Date Number' column.
        start_date_num: lower bound (inclusive).
        end_date_num: upper bound (inclusive).

    Returns:
        The boolean-mask selection of ``df``; the original index labels are kept.
    """
    date_numbers = df['Date Number']
    not_too_early = date_numbers >= start_date_num
    not_too_late = date_numbers <= end_date_num
    return df[not_too_early & not_too_late]

# Date-number window shared by the ILI and GSQ data (both bounds inclusive)
start_date_num = 731951
end_date_num = 738883

# filter data frames to contain intersecting data.
# filter_df returns a boolean-mask selection of its input, so take explicit
# copies: the 'Date' assignment below then writes to an independent frame
# instead of a selection of gsq_df (no SettingWithCopyWarning, and the column
# is replaced outright rather than set in place with an incompatible dtype).
filtered_ili_df = (
    filter_df(daily_interpolated_ili_df, start_date_num, end_date_num)
    .copy()
    .reset_index(drop=True)
)
filtered_gsq_df = (
    filter_df(gsq_df, start_date_num, end_date_num)
    .copy()
    .reset_index(drop=True)
)

filtered_gsq_df['Date'] = pd.to_datetime(filtered_gsq_df['Date'], format='%Y%m%d')

# Attach the ILI rate to each GSQ row by matching on 'Date Number' rather than
# by row position, so a missing day in either source cannot silently shift the
# target against the features. A left join keeps every GSQ row; days without an
# ILI rate get NaN. Column order stays: GSQ columns first, 'ILI Rate' last.
final_df = filtered_gsq_df.merge(
    filtered_ili_df[['Date Number', 'ILI Rate']],
    on='Date Number',
    how='left',
)

print(final_df.shape)

In [None]:
# Look only at the columns between the first two and the last one, and pick
# those whose every value equals zero
query_slice = final_df.iloc[:, 2:-1]
zero_columns = query_slice.columns[(query_slice == 0).all()]

# Remove the all-zero columns and renumber the rows from 0
final_df = final_df.drop(columns=zero_columns).reset_index(drop=True)

print(final_df.shape)

In [None]:
# Names of the non-query columns of final_df
date_column, date_number_column, ili_rate_column = 'Date', 'Date Number', 'ILI Rate'

# Every remaining column is treated as a search-query frequency column
query_columns = [
    name
    for name in final_df.columns
    if name not in (date_column, date_number_column, ili_rate_column)
]

# Make sure the date column holds datetimes
final_df[date_column] = pd.to_datetime(final_df[date_column], format='%Y-%m-%d')

In [None]:
# Feature frame (date + query columns) and target series.
# Both are taken before the column drop below, so X still contains the
# columns that get removed from final_df.
X = final_df[[date_column, *query_columns]]
y = final_df[ili_rate_column]

train_start_date = '2009-09-01'
test_start_date = '2014-09-01'

# Training window is [train_start_date, test_start_date)
train_indices = (X[date_column] < test_start_date) & (X[date_column] >= train_start_date)
# Drop the leading date column so only query frequencies remain
X_train = X[train_indices].iloc[:, 1:]

# Query columns that are zero on every training row
zero_cols_train = X_train.columns[(X_train == 0).all()]
zero_cols_train_present = set(zero_cols_train) & set(final_df.columns)

# Remove those columns from the full frame and renumber the rows from 0
final_df = final_df.drop(columns=list(zero_cols_train_present)).reset_index(drop=True)

print(final_df.shape)

In [None]:
# Recompute the query columns from the current state of final_df
query_columns = [col for col in final_df.columns if col not in [date_column, date_number_column, ili_rate_column]]

# Group queries that consist of the same words regardless of case and word
# order: the key is the lower-cased words, sorted and re-joined with spaces
duplicates = {}

for query_column in query_columns:
    sort_lower_query = ' '.join(sorted(query_column.lower().split()))
    duplicates.setdefault(sort_lower_query, []).append(query_column)

# Keep the first query of every group; the rest of each group is dropped
non_dup_query_columns = [same_queries[0] for same_queries in duplicates.values()]
num_dups_removed = len(query_columns) - len(non_dup_query_columns)

print(len(query_columns))
print(num_dups_removed)
print(len(non_dup_query_columns))

# .copy() makes final_df an independent frame instead of a column selection of
# the previous one, so later column assignments on final_df do not raise
# SettingWithCopyWarning.
final_df = final_df[[date_column, date_number_column] + non_dup_query_columns + [ili_rate_column]].copy()
print(final_df.shape)

In [None]:
# Plot two example query frequency series over time, one figure each, and save
# them as PNG files under graphs/
for column, heading, out_path in (
    ('flu medicine', 'Flu Medicine Query frequencies',
     'graphs/flu_medicine_query_frequencies.png'),
    ('flu nhs', 'Flu NHS Query frequencies',
     'graphs/flu_nhs_query_frequencies.png'),
):
    plt.figure(figsize=(8, 4))
    plt.plot(final_df['Date'], final_df[column], linestyle='-', color='b')
    plt.title(heading)
    plt.xlabel('Date')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig(out_path)

In [None]:
# Number of rows in each smoothing window
window_size = 14


# Weighted average smoothing function
def weighted_average(arr):
    """Return the weighted mean of ``arr`` with harmonically decaying weights.

    For ``n = len(arr)`` the weights are ``1/n, 1/(n-1), ..., 1/2, 1``, so the
    last element of ``arr`` counts the most and the first one the least.

    Args:
        arr: one-dimensional sequence of numbers.

    Returns:
        ``dot(arr, weights) / sum(weights)``.
    """
    # Denominators count down n, n-1, ..., 1 so the final element gets weight 1
    weights = 1 / np.arange(len(arr), 0, -1)
    return np.dot(arr, weights) / weights.sum()

# Apply rolling weighted average to each query column.
# min_periods=1 means the first rows are smoothed over however many values are
# available so far. raw=True hands each window to weighted_average as a plain
# NumPy array instead of building a Series per window; the results are the same
# and the pass over all columns is considerably faster.
for index, query in enumerate(non_dup_query_columns):
    print(index, query)
    final_df[query] = (
        final_df[query]
        .rolling(window=window_size, min_periods=1)
        .apply(weighted_average, raw=True)
    )

# Persist the smoothed frame (without the row index)
final_df.to_csv('../../data/smooth_df.csv', index=False)

In [None]:
# Plot the same two example queries after smoothing, one figure each, and save
# them as PNG files under graphs/
for column, heading, out_path in (
    ('flu medicine', 'Smoothed Flu Medicine Query frequencies',
     'graphs/smoothed_flu_medicine_query_frequencies.png'),
    ('flu nhs', 'Smoothed Flu NHS Query frequencies',
     'graphs/smoothed_flu_nhs_query_frequencies.png'),
):
    plt.figure(figsize=(8, 4))
    plt.plot(final_df['Date'], final_df[column], linestyle='-', color='r')
    plt.title(heading)
    plt.xlabel('Date')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig(out_path)