In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import random
from datetime import datetime

In [None]:
# Load the dataset used by every later cell of the notebook.
df = pd.read_csv('../../../data/smooth_df.csv')

# Names of the bookkeeping / target columns; every other column is kept in
# `query_columns` (presumably one column per search query -- TODO confirm).
date_column = 'Date'
date_number_column = 'Date Number'
ili_rate_column = 'ILI Rate'
query_columns = [
    col
    for col in df.columns
    if col not in (date_column, date_number_column, ili_rate_column)
]

# Parse the ISO-formatted date strings so that `.dt` accessors and date
# comparisons work on the column.
df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d')

print(df.shape)

In [None]:
def custom_time_series_split(df, date_column, train_start_index=5, first_test_index=10,
                             trailing_years_excluded=4):
    """Build year-based (train_start_year, test_start_year) pairs for evaluation.

    The distinct calendar years found in ``df[date_column]`` are sorted in
    ascending order and addressed by position. Every split shares the same
    training start year; the test start year walks from ``first_test_index``
    up to (but excluding) the last ``trailing_years_excluded`` years.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a datetime column.
    date_column : str
        Name of the datetime column in ``df``.
    train_start_index : int, default 5
        Position (in the sorted year list) of the training start year.
    first_test_index : int, default 10
        Position of the first test start year.
    trailing_years_excluded : int, default 4
        Number of years at the end of the data that never start a test period.

    Returns
    -------
    list of tuple
        ``(train_start_year, test_start_year)`` pairs; empty when the data
        does not span enough years to form a single split.
    """
    # unique() keeps order of first appearance; sort so the positional
    # indices always refer to chronological order even for unsorted input.
    years = np.sort(df[date_column].dt.year.unique())
    last_test_index = len(years) - trailing_years_excluded
    # years[train_start_index] is only evaluated when at least one split
    # exists, so short datasets yield [] instead of raising IndexError.
    return [
        (years[train_start_index], years[i])
        for i in range(first_test_index, last_test_index)
    ]

def get_train_test_split_data(X_shifted, y_shifted, y_original, test_start_date, test_end_date):
    """Select the persistence forecast and its target for one test window.

    Rows are chosen where the date column of ``X_shifted`` (looked up through
    the notebook-level ``date_column`` name) lies between ``test_start_date``
    and ``test_end_date``, both ends inclusive.

    Returns
    -------
    tuple
        ``(y_pred, y_test)`` where ``y_pred`` is the selected slice of
        ``y_original`` and ``y_test`` the selected slice of ``y_shifted``.
    """
    dates = X_shifted[date_column]
    # between() is inclusive on both ends by default, i.e. >= start and <= end.
    in_test_window = dates.between(test_start_date, test_end_date)
    return y_original[in_test_window], y_shifted[in_test_window]

def min_max_data(X_actual_train, X_train, X_val, X_test):
    """Min-max scale the train, validation and test features.

    The scaler is fitted on ``X_actual_train`` only and then applied to the
    three other frames, so all of them share the same scaling parameters.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_val_scaled, X_test_scaled)``. Each frame keeps the
        column labels of its input; it is rebuilt from the transformed values,
        so the input's index is not carried over (assuming ``transform``
        returns a plain array -- TODO confirm no pandas output is configured).
    """
    fitted_scaler = MinMaxScaler().fit(X_actual_train)

    def _rescale(frame):
        # Wrap the transformed values back into a labelled DataFrame.
        return pd.DataFrame(fitted_scaler.transform(frame), columns=frame.columns)

    return tuple(_rescale(part) for part in (X_train, X_val, X_test))

In [None]:
def evaluate_persistence_model(y_pred, y_test):
    """Score a forecast against the observed values and print the metrics.

    Parameters
    ----------
    y_pred : array-like
        Forecast values.
    y_test : array-like
        Observed values.

    Returns
    -------
    tuple
        ``(mae, mape, pearson_corr)``: mean absolute error, mean absolute
        percentage error (in percent) and Pearson correlation coefficient.

    Notes
    -----
    The percentage error divides by ``y_test``, so zeros in ``y_test`` lead to
    infinite or undefined MAPE values.
    """
    mae = mean_absolute_error(y_test, y_pred)

    # Error relative to the observed value, averaged and expressed in percent.
    relative_errors = (y_test - y_pred) / y_test
    mape = np.mean(np.abs(relative_errors)) * 100

    # Only the correlation coefficient is kept; the p-value is discarded.
    pearson_corr, _p_value = pearsonr(y_test, y_pred)

    print("MAE: ", mae, "MAPE: ", mape, "P: ", pearson_corr)
    return mae, mape, pearson_corr

In [None]:
def add_average_row(df):
    """Append an 'Average' summary row to a performance table, in place.

    The new row is written at label ``len(df)`` and holds the column means of
    'Pearson_Correlation' and 'MAPE'. Its 'MAE' entry is a string of the form
    ``'<mean> +- <std>'``, so the 'MAE' column no longer holds only numbers
    afterwards. Nothing is returned.
    """
    mae_values = df['MAE']
    # Mean and standard deviation are rendered together as text.
    mae_summary = ' +- '.join([str(np.mean(mae_values)), str(np.std(mae_values))])

    summary_row = {'Year': 'Average', 'MAE': mae_summary}
    for metric in ('Pearson_Correlation', 'MAPE'):
        summary_row[metric] = np.mean(df[metric])

    df.loc[len(df)] = summary_row

def train_presistence_model(queries, forecasting_horizon, threshold):
    """Evaluate a persistence baseline on every yearly test split.

    The forecast for a row is simply the ILI rate observed on that row; the
    target is the ILI rate ``forecasting_horizon`` rows later. Data comes from
    the notebook-level ``df`` (columns named by ``date_column`` and
    ``ili_rate_column``). Each test period runs from 1 September of the test
    start year to 31 August of the following year.

    Parameters
    ----------
    queries : sized collection
        Only its length is reported in the progress output.
    forecasting_horizon : int
        Number of rows between the forecast origin and the target; must be at
        least 1. Prediction dates assume one row per day -- TODO confirm.
    threshold : any
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(model_performance, model_predictions)``. The first has one row per
        test season (columns 'Year', 'MAE', 'Pearson_Correlation', 'MAPE')
        plus a final 'Average' row; the second has the columns 'Date',
        'Actual_ILI_Rate' and 'Predicted_ILI_Rate' for all seasons.

    Raises
    ------
    ValueError
        If ``forecasting_horizon`` is smaller than 1.
    """
    # A horizon of 0 would turn the `[:-horizon]` slices below into empty
    # selections instead of "drop nothing", so reject it explicitly.
    if forecasting_horizon < 1:
        raise ValueError(f'forecasting_horizon must be >= 1, got {forecasting_horizon}')

    X = df[[date_column]]
    y = df[ili_rate_column]

    # Target: the value `forecasting_horizon` rows ahead. The trailing rows
    # have no target after the shift and are dropped from all three objects.
    y_shifted = y.shift(-forecasting_horizon).iloc[:-forecasting_horizon]
    y_original = y.iloc[:-forecasting_horizon]
    X_shifted = X.iloc[:-forecasting_horizon]

    splits = custom_time_series_split(df, date_column)

    performance_columns = ['Year', 'MAE', 'Pearson_Correlation', 'MAPE']
    prediction_columns = ['Date', 'Actual_ILI_Rate', 'Predicted_ILI_Rate']
    performance_records = []
    prediction_frames = []

    for train_start_year, test_start_year in splits:
        print(f'TRAIN START YEAR: {train_start_year}, TEST YEAR: {test_start_year}-{test_start_year+1}')
        print('number of features: ', len(queries))

        # Get the forecast / target pairs for the current test season
        test_start_date = f'{test_start_year}-09-01'
        test_end_date = f'{test_start_year + 1}-08-31'
        print("y_original: ", y_original.shape, "y_shifted: ", y_shifted.shape)
        y_pred, y_test = get_train_test_split_data(X_shifted, y_shifted, y_original, test_start_date, test_end_date)
        print("y_pred: ", y_pred.shape, "y_test: ", y_test.shape)
        mae, mape, pearson_corr = evaluate_persistence_model(y_pred, y_test)

        # Record the performance for this split
        performance_records.append({
            'Year': f'{test_start_year}-{test_start_year + 1}',
            'MAE': mae,
            'Pearson_Correlation': pearson_corr,
            'MAPE': mape,
        })

        # Dates of the forecast targets: the test start shifted by the horizon.
        # Computed with a Timedelta rather than by formatting the day into the
        # date string, which yields an invalid date (e.g. 31 September) for
        # horizons of 30 days or more.
        forecast_start = pd.Timestamp(test_start_date) + pd.Timedelta(days=forecasting_horizon)
        date_range = pd.date_range(start=forecast_start, periods=len(y_pred))
        prediction_frames.append(pd.DataFrame(
            {
                'Date': date_range,
                'Actual_ILI_Rate': y_test,
                'Predicted_ILI_Rate': y_pred,
            },
            columns=prediction_columns,
        ))
        print('\n\n')

    model_performance = pd.DataFrame(performance_records, columns=performance_columns)

    # Concatenate once instead of growing a frame inside the loop.
    if prediction_frames:
        model_predictions = pd.concat(prediction_frames)
    else:
        model_predictions = pd.DataFrame(columns=prediction_columns)

    add_average_row(model_performance)
    return model_performance, model_predictions

In [None]:
# First 1000 entries of the 'Query' column of the feature-selection results.
query_similarities = pd.read_csv('../../sentence_embedding_feature_selection/results/average.csv').iloc[:1000]['Query'].to_list()

# Produce results for every horizon whose prediction file is read back by the
# period analysis later in the notebook, so that Restart & Run All does not
# depend on files left behind by earlier manual runs. The loop ends on 28, so
# the variables keep the values of the t+28 run afterwards.
for forecasting_horizon in [7, 14, 21, 28]:
    model_performance, model_predictions = train_presistence_model(query_similarities, forecasting_horizon, threshold=0.3)
    model_performance.to_csv(f'../../../model_results/forecasting/persistence/forecasting_t+{forecasting_horizon}_performance.csv')
    model_predictions.to_csv(f'../../../model_results/forecasting/persistence/forecasting_t+{forecasting_horizon}_predictions.csv')
    print(f'Forecasting horizon: t+{forecasting_horizon}')
    print(model_performance)

In [None]:
def define_periods(year):
    """Return the onset, peak and tail windows of one flu season.

    Parameters
    ----------
    year : int
        Calendar year in which the season starts (2014 to 2018 are defined).

    Returns
    -------
    list of tuple
        Three ``(start, end)`` pairs of ``datetime`` objects, in the order
        onset, peak, tail.

    Raises
    ------
    KeyError
        If no periods are defined for ``year``.
    """
    # Season start year -> (start, end) of onset, peak and tail, each date
    # written as (year, month, day).
    boundaries = {
        2014: (
            ((2014, 9, 1), (2014, 12, 1)),    # Onset
            ((2014, 12, 2), (2015, 1, 20)),   # Peak
            ((2015, 1, 21), (2015, 8, 31)),   # Tail
        ),
        2015: (
            ((2015, 9, 1), (2015, 12, 30)),   # Onset
            ((2015, 12, 31), (2016, 3, 24)),  # Peak
            ((2016, 3, 25), (2016, 8, 31)),   # Tail
        ),
        2016: (
            ((2016, 9, 1), (2016, 12, 1)),    # Onset
            ((2016, 12, 2), (2017, 1, 30)),   # Peak
            ((2017, 1, 31), (2017, 8, 31)),   # Tail
        ),
        2017: (
            ((2017, 9, 1), (2017, 12, 30)),   # Onset
            ((2017, 12, 31), (2018, 2, 8)),   # Peak
            ((2018, 2, 9), (2018, 8, 31)),    # Tail
        ),
        2018: (
            ((2018, 9, 1), (2018, 12, 28)),   # Onset
            ((2018, 12, 29), (2019, 2, 18)),  # Peak
            ((2019, 2, 19), (2019, 8, 31)),   # Tail
        ),
    }

    return [(datetime(*start), datetime(*end)) for start, end in boundaries[year]]

def analyse_flu_seasons(data, start_year, end_year, forecasting_horizon):
    """Compute the MAE of the predictions for each period of each flu season.

    Parameters
    ----------
    data : pandas.DataFrame
        Predictions with the columns 'Date' (datetime), 'Actual_ILI_Rate' and
        'Predicted_ILI_Rate'.
    start_year, end_year : int
        Season start years to analyse; ``end_year`` itself is excluded.
    forecasting_horizon : int
        Copied into the 'Forecasting Horizon' column of every result row.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty period with the columns 'Forecasting Horizon',
        'Flu Season', 'Period' and 'MAE'.
    """
    rows = []

    for season_year in range(start_year, end_year):
        # A season runs from 1 September to 31 August of the following year.
        season_start = datetime(season_year, 9, 1)
        season_end = datetime(season_year + 1, 8, 31)
        in_season = (data['Date'] >= season_start) & (data['Date'] <= season_end)
        season_data = data[in_season]

        for period_start, period_end in define_periods(season_year):
            in_period = (season_data['Date'] >= period_start) & (season_data['Date'] <= period_end)
            period_data = season_data[in_period]

            # Periods without any prediction are left out of the result.
            if period_data.empty:
                continue

            mae = mean_absolute_error(
                period_data['Actual_ILI_Rate'],
                period_data['Predicted_ILI_Rate'],
            )
            rows.append({
                'Forecasting Horizon': forecasting_horizon,
                'Flu Season': f'{season_year}-{season_year+1}',
                'Period': f"{period_start.strftime('%Y-%m-%d')} - {period_end.strftime('%Y-%m-%d')}",
                'MAE': mae,
            })

    return pd.DataFrame(rows)

# Horizons whose saved persistence predictions are analysed per period
forecasting_horizons = [7, 14, 21, 28]

# Collects one per-period MAE table per horizon
all_results = []

for forecasting_horizon in forecasting_horizons:
    # Load the predictions saved for this horizon; the CSV stores dates as
    # text, so convert the column back to datetimes before filtering on it.
    model_predictions = pd.read_csv(
        f'../../../model_results/forecasting/persistence/forecasting_t+{forecasting_horizon}_predictions.csv'
    )
    model_predictions['Date'] = pd.to_datetime(model_predictions['Date'])

    # Per-period MAE for the seasons starting in 2014 through 2018
    flu_season_results = analyse_flu_seasons(model_predictions, 2014, 2019, forecasting_horizon)
    all_results.append(flu_season_results)

# Stack the per-horizon tables and write them to a single CSV file
final_results = pd.concat(all_results)
final_results.to_csv(
    '../../../model_results/forecasting/persistence/forecasting_period_performance.csv',
    index=False,
)