In [None]:
import torch
# Print the installed PyTorch version so a run can be tied to a specific release.
print(torch.__version__)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from torch import nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import random
from datetime import datetime

In [None]:
# Names of the columns that are not search-query features.
date_column = 'Date'
date_number_column = 'Date Number'
ili_rate_column = 'ILI Rate'

df = pd.read_csv('../../../data/smooth_df.csv')

# Every remaining column is treated as a query feature.
non_query_columns = (date_column, date_number_column, ili_rate_column)
query_columns = [name for name in df.columns if name not in non_query_columns]

# Parse the 'YYYY-MM-DD' date strings so year/date comparisons work later on.
df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d')

print(df.shape)

In [None]:
def custom_time_series_split(df, date_column):
    """Build the (train_start_year, test_start_year) pairs used for evaluation.

    The distinct calendar years of ``df[date_column]`` are taken in order of
    first appearance. Every pair uses the year at position 5 as the training
    start, and the test start walks over positions 10 up to (but excluding)
    ``len(years) - 4``.

    Returns:
        list of ``(train_start_year, test_start_year)`` tuples; empty when the
        data does not span enough distinct years.
    """
    distinct_years = df[date_column].dt.year.unique()

    year_pairs = []
    for test_position in range(10, len(distinct_years) - 4):
        year_pairs.append((distinct_years[5], distinct_years[test_position]))
    return year_pairs

def get_train_test_split_data(X, y, train_start_date, test_start_date, test_end_date, corr_start_date):
    """Slice features and targets into train, test and correlation windows by date.

    Windows (all compared against ``X[date_column]``):
        train: ``train_start_date <= date < test_start_date``
        test:  ``test_start_date <= date <= test_end_date``
        corr:  ``corr_start_date <= date < test_start_date``

    The first column of ``X`` is dropped from every returned feature frame
    (``run_neural_network`` places the date column there).

    Returns:
        ``(X_train, y_train, X_test, y_test, X_corr, y_corr)``
    """
    dates = X[date_column]

    train_mask = (dates >= train_start_date) & (dates < test_start_date)
    test_mask = (dates >= test_start_date) & (dates <= test_end_date)
    corr_mask = (dates >= corr_start_date) & (dates < test_start_date)

    def select(mask):
        # Features lose their leading column; targets are filtered as-is.
        return X[mask].iloc[:, 1:], y[mask]

    X_train, y_train = select(train_mask)
    X_test, y_test = select(test_mask)
    X_corr, y_corr = select(corr_mask)

    return (X_train, y_train, X_test, y_test, X_corr, y_corr)

def min_max_data(X_train, X_test):
    """Min-max scale both feature frames using statistics from the training frame only.

    The scaler is fitted on ``X_train`` and then applied to both inputs, so no
    information from ``X_test`` influences the scaling.

    Returns:
        ``(X_train_scaled, X_test_scaled)`` as new DataFrames that keep the
        input column names but carry a fresh default index.
    """
    scaler = MinMaxScaler().fit(X_train)

    def rescale(frame):
        # transform() returns a bare array, so re-attach the column labels.
        return pd.DataFrame(scaler.transform(frame), columns=frame.columns)

    return rescale(X_train), rescale(X_test)

def create_lagged_features(df, n_lags):
    """Append ``n_lags`` shifted copies of every column and drop incomplete rows.

    For each lag ``k`` in ``1..n_lags`` the frame is shifted down by ``k`` rows
    and its columns are suffixed with ``_t-k``. The original and shifted
    frames are joined side by side, and any row containing a NaN (which
    includes the first ``n_lags`` rows) is removed.

    Returns:
        DataFrame with the original columns followed by the lagged columns.
    """
    shifted_frames = [
        df.shift(step).add_suffix(f'_t-{step}')
        for step in range(1, n_lags + 1)
    ]
    combined = pd.concat([df, *shifted_frames], axis=1)
    return combined.dropna()

def create_lagged_features_with_overlap(X_train, y_train, X_test, n_lags=6):
    """Build lagged feature matrices for train and test without losing test rows.

    The training frame is lagged directly, which removes its first ``n_lags``
    rows (their lag window is incomplete), so the same number of leading
    targets is dropped from ``y_train``. The test frame is prefixed with the
    last ``n_lags`` training rows before lagging, so every original test row
    has a full lag window and survives.

    Args:
        X_train: training feature DataFrame.
        y_train: training target Series, positionally aligned with ``X_train``.
        X_test: test feature DataFrame with the same columns as ``X_train``.
        n_lags: number of lagged copies to add; 0 adds none.

    Returns:
        ``(X_train_lagged_values, y_train_trimmed, X_test_lagged_values)``
        where the feature matrices are NumPy arrays.
    """
    X_train_lagged = create_lagged_features(X_train, n_lags)
    y_train = y_train.iloc[n_lags:]

    # tail(0) is empty, whereas iloc[-0:] would select the *whole* training
    # frame and leak every training row into the test matrix when n_lags == 0.
    history = X_train.tail(n_lags)
    X_test_with_history = pd.concat([history, X_test], ignore_index=True)
    X_test_lagged = create_lagged_features(X_test_with_history, n_lags)

    return X_train_lagged.values, y_train, X_test_lagged.values

def convert_to_tensor(X_train, y_train, X_test, y_test):
    """Convert feature arrays and target Series into ``torch.FloatTensor`` objects.

    ``X_train`` / ``X_test`` are handed to ``torch.FloatTensor`` as they are;
    ``y_train`` / ``y_test`` are unwrapped through ``.values`` first.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as float tensors.
    """
    X_train_tensor = torch.FloatTensor(X_train)
    y_train_tensor = torch.FloatTensor(y_train.values)
    X_test_tensor = torch.FloatTensor(X_test)
    y_test_tensor = torch.FloatTensor(y_test.values)
    return X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor

In [None]:
def get_correlation_df(X_corr, y_corr):
    """Correlate every feature column with the target.

    Uses ``Series.corr`` of ``y_corr`` against each column of ``X_corr``.

    Returns:
        DataFrame with one row per column of ``X_corr`` (in column order) and
        the columns ``'Query'`` (column name) and ``'Correlation'``.
    """
    scores = [(name, y_corr.corr(X_corr[name])) for name in X_corr.columns]
    return pd.DataFrame(scores, columns=['Query', 'Correlation'])

def correlation_based_feature_selection(X_corr, y_corr, X_train, X_test, threshold):
    """Keep only the features whose correlation with the target reaches ``threshold``.

    Correlations are computed on the ``X_corr`` / ``y_corr`` window, and the
    resulting column subset (ordered by descending correlation) is applied to
    both ``X_train`` and ``X_test``. Columns with a NaN correlation never pass
    the ``>=`` comparison and are therefore dropped.

    Returns:
        ``(X_train, X_test)`` restricted to the selected columns.
    """
    ranked = get_correlation_df(X_corr, y_corr)
    ranked = ranked.sort_values(by='Correlation', ascending=False).reset_index(drop=True)

    passes_threshold = ranked['Correlation'] >= threshold
    relevant_queries = ranked.loc[passes_threshold, 'Query'].to_list()

    X_train, X_test = X_train[relevant_queries], X_test[relevant_queries]

    print("number of features after correlation based fs: ", X_train.shape[1])
    print("X_train: ", X_train.shape, "X_test: ", X_test.shape)

    return X_train, X_test

In [None]:
class MFFNN(nn.Module):
    """Feed-forward regressor with one hidden layer of 4 ReLU units and a single linear output."""

    def __init__(self, input_size):
        super().__init__()
        hidden_units = 4
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Map a batch of feature rows to one output value per row."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

class DFFNN(nn.Module):
    """Feed-forward regressor with two hidden layers of 25 ReLU units each and a single linear output."""

    def __init__(self, input_size):
        super().__init__()
        hidden_units = 25
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Map a batch of feature rows to one output value per row."""
        first_hidden = torch.relu(self.fc1(x))
        second_hidden = torch.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)

def create_ffnn(X_train, seed, model_key):
    """Create a freshly initialised network on the configured device.

    The torch RNG is seeded first so the weight initialisation is reproducible
    for a given ``seed``. ``model_key == 'MFFNN'`` selects ``MFFNN``; any other
    value selects ``DFFNN``. The input width is ``X_train.shape[1]``.
    """
    torch.manual_seed(seed)
    n_features = X_train.shape[1]
    network_cls = MFFNN if model_key == 'MFFNN' else DFFNN
    return network_cls(n_features).to(device)

In [None]:
def train_neural_network(model, X_train, y_train, X_test, y_test, learning_rate, epochs=200, batch_size=14):
    """Train ``model`` with Adam on an L1 (mean absolute error) loss.

    After every epoch the loss on the test tensors is also recorded. The test
    data is only evaluated under ``torch.no_grad()`` and never used for a
    gradient step. Batches are drawn in order (no shuffling).

    Args:
        model: network whose output has shape ``(batch, 1)``; it is trained in place.
        X_train, y_train: training features and targets as tensors.
        X_test, y_test: held-out features and targets as tensors.
        learning_rate: Adam learning rate.
        epochs: number of passes over the training data.
        batch_size: mini-batch size used for both loaders.

    Returns:
        ``(model, epochs_run, train_losses, test_losses)`` where the loss lists
        hold the mean per-batch loss of every epoch.

    Raises:
        ZeroDivisionError: if a dataset is empty while ``epochs > 0``.
    """
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Follow the device the model already lives on instead of relying on a
    # notebook-level global, so batches always match the model's parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    train_losses = []
    test_losses = []

    for _ in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(target_device), targets.to(target_device)
            optimizer.zero_grad()
            # squeeze(1) turns the (batch, 1) output into (batch,) to match the targets.
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, targets)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_losses.append(train_loss / len(train_loader))

        # Test phase (monitoring only, no parameter updates)
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(target_device), targets.to(target_device)
                outputs = model(inputs).squeeze(1)
                test_loss += criterion(outputs, targets).item()
        test_losses.append(test_loss / len(test_loader))

    # Report the epochs actually run; the loop variable is undefined when
    # epochs == 0, so it must not be used here.
    return (model, len(train_losses), train_losses, test_losses)


def evaluate_neural_network(model, X_test, y_test):
    """Predict on the test tensors and report MAE, MAPE and Pearson correlation.

    The model is switched to eval mode and run without gradient tracking.
    MAPE is expressed in percent and divides by the targets, so a zero target
    produces inf/nan in that metric. The three metrics are also printed.

    Returns:
        ``(y_pred, mae, mape, pearson_corr)`` with ``y_pred`` as a flat NumPy array.
    """
    model.eval()
    with torch.no_grad():
        features = X_test.to(device)
        targets = y_test.to(device)

        predictions = model(features)

        # Bring both sides back to flat CPU arrays for the NumPy/scikit-learn metrics.
        y_pred_cpu = predictions.detach().cpu().numpy().flatten()
        y_test_cpu = targets.cpu().numpy().flatten()

        mae = mean_absolute_error(y_test_cpu, y_pred_cpu)
        relative_errors = np.abs((y_test_cpu - y_pred_cpu) / y_test_cpu)
        mape = np.mean(relative_errors) * 100
        pearson_corr, _p_value = pearsonr(y_test_cpu, y_pred_cpu)
        print("MAE: ", mae, "MAPE: ", mape, "P: ", pearson_corr)

    return y_pred_cpu, mae, mape, pearson_corr

In [None]:
def add_average_row(df):
    """Append an ``'Average'`` row holding the column means, modifying ``df`` in place.

    The means of ``'MAE'``, ``'Pearson_Correlation'`` and ``'MAPE'`` are
    rounded to 5 decimals, printed, and written as a new row labelled
    ``len(df)`` whose ``'Year'`` is the string ``'Average'``. Returns ``None``.
    """
    mae_avg, pearson_corr_avg, mape_avg = (
        round(np.mean(df[metric]), 5)
        for metric in ('MAE', 'Pearson_Correlation', 'MAPE')
    )
    print("MAE AVG: ", mae_avg, "MAPE AVG: ", mape_avg, "P AVG: ", pearson_corr_avg, "\n\n")

    average_row = {
        'Year': 'Average',
        'MAE': mae_avg,
        'MAPE': mape_avg,
        'Pearson_Correlation': pearson_corr_avg,
    }
    df.loc[len(df)] = average_row

def run_neural_network(queries, threshold, model_key, lagged=None):
    """Train and evaluate one network per seed and test season, then aggregate over seeds.

    For every seed and every ``(train_start_year, test_start_year)`` pair from
    ``custom_time_series_split`` the function:
      1. slices train / test / correlation windows (seasons run 1 Sep - 31 Aug,
         the correlation window is the 5 years before the test season),
      2. keeps the features whose correlation reaches ``threshold``,
      3. min-max scales them (fitted on train only),
      4. optionally adds ``lagged`` lagged copies of every feature,
      5. trains the network selected by ``model_key`` and evaluates it on the
         test season.
    Finally the mean train/test loss curves over the seeds are plotted per season.

    Args:
        queries: feature column names of the notebook-level ``df`` to consider.
        threshold: minimum correlation a feature needs to be kept.
        model_key: ``'MFFNN'`` (learning rate 0.001) or anything else for
            ``DFFNN`` (learning rate 0.0001).
        lagged: number of lags to add, or ``None`` for no lagged features.

    Returns:
        ``(avg_performance, avg_predictions)``: mean and std over the seeds of
        MAE / Pearson correlation / MAPE per ``'Year'`` label (including the
        ``'Average'`` rows), and the per-date mean of actual and predicted
        rates over the seeds.
    """
    X = df[[date_column] + queries]
    y = df[ili_rate_column]

    splits = custom_time_series_split(df, date_column)

    seed_train_losses = {}
    seed_test_losses = {}

    seed_performances = []
    seed_predictions = []

    prediction_columns = ['Date', 'Actual_ILI_Rate', 'Predicted_ILI_Rate']

    seeds = [9904, 5727, 4644, 2955, 3021, 3094, 8349, 5566, 1564, 1723]

    for seed in seeds:
        print("SEED: ", seed)

        model_performance = pd.DataFrame(columns=['Year', 'MAE', 'MAPE', 'Pearson_Correlation'])
        # Collect one frame per season and concatenate once per seed, instead
        # of repeatedly concatenating onto an (initially empty) DataFrame.
        season_predictions = []

        for train_start_year, test_start_year in splits:
            season_key = (test_start_year, test_start_year+1)
            seed_train_losses.setdefault(season_key, [])
            seed_test_losses.setdefault(season_key, [])

            train_start_date = f'{train_start_year}-09-01'
            test_start_date = f'{test_start_year}-09-01'
            test_end_date = f'{test_start_year+1}-08-31'
            corr_start_date = f'{test_start_year-5}-09-01'

            print("train_start_date: ", train_start_date, "test_start_date: ", test_start_date, "test_end_date: ", test_end_date)

            # Remember the real dates of the test rows (same window that
            # get_train_test_split_data uses) so predictions are labelled with
            # them rather than with a synthetic daily range.
            test_mask = (X[date_column] >= test_start_date) & (X[date_column] <= test_end_date)
            test_dates = X.loc[test_mask, date_column].to_numpy()

            X_train, y_train, X_test, y_test, X_corr, y_corr = get_train_test_split_data(X, y, train_start_date, test_start_date, test_end_date, corr_start_date)
            X_train, X_test = correlation_based_feature_selection(X_corr, y_corr, X_train, X_test, threshold)
            X_train, X_test = min_max_data(X_train, X_test)

            if lagged is not None:
                X_train, y_train, X_test = create_lagged_features_with_overlap(X_train, y_train, X_test, n_lags=lagged)
                X_train, y_train, X_test, y_test = convert_to_tensor(X_train, y_train, X_test, y_test)
            else:
                X_train, y_train, X_test, y_test = convert_to_tensor(X_train.values, y_train, X_test.values, y_test)

            learning_rate = 0.001 if model_key == 'MFFNN' else 0.0001
            model, epochs, train_losses, test_losses = train_neural_network(create_ffnn(X_train, seed, model_key), X_train, y_train, X_test, y_test, learning_rate)
            seed_train_losses[season_key].append(train_losses)
            seed_test_losses[season_key].append(test_losses)
            y_pred, mae, mape, pearson_corr = evaluate_neural_network(model, X_test, y_test)

            model_performance.loc[len(model_performance)] = {
                'Year': f'{test_start_year}-{test_start_year+1}',
                'MAE': round(mae, 5),
                'Pearson_Correlation': round(pearson_corr, 5),
                'MAPE': round(mape, 5),
            }

            # All three arrays have one entry per test row; pandas raises a
            # ValueError here if their lengths ever disagree.
            season_predictions.append(pd.DataFrame({
                'Date': test_dates,
                'Actual_ILI_Rate': y_test.cpu().numpy().flatten(),
                'Predicted_ILI_Rate': y_pred,
            }, columns=prediction_columns))

        if season_predictions:
            model_predictions = pd.concat(season_predictions)
        else:
            model_predictions = pd.DataFrame(columns=prediction_columns)

        add_average_row(model_performance)

        seed_performances.append(model_performance)
        seed_predictions.append(model_predictions)

    avg_performance = pd.concat(seed_performances).groupby(['Year']).agg({'MAE': ['mean', 'std'], 'Pearson_Correlation': ['mean', 'std'], 'MAPE': ['mean', 'std']}).reset_index().round(5)
    avg_predictions = pd.concat(seed_predictions).groupby(['Date']).mean().reset_index()

    # One figure per test season: loss curves averaged over all seeds.
    for test_start_year, test_end_year in seed_train_losses.keys():
        mean_train_losses = np.mean(np.array(seed_train_losses[(test_start_year, test_end_year)]), axis=0)
        mean_test_losses = np.mean(np.array(seed_test_losses[(test_start_year, test_end_year)]), axis=0)

        plt.figure(figsize=(10, 6))
        plt.plot(mean_train_losses, label='Training Loss')
        plt.plot(mean_test_losses, label='Test Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title(f'Training and Validation Losses {test_start_year}-{test_start_year+1}')
        plt.legend()
        plt.show()

    return avg_performance, avg_predictions

In [None]:
# Candidate features: the 'Query' values of the first 1000 rows of average.csv.
filtered_queries = (
    pd.read_csv('../../sentence_embedding_feature_selection/results/average.csv')
    .head(1000)['Query']
    .tolist()
)

# Nowcasting run with the MFFNN model and a correlation threshold of 0.3.
model_performance, model_predictions = run_neural_network(filtered_queries, 0.3, 'MFFNN')
print(model_performance)

# Persist the aggregated metrics and predictions for later analysis.
model_performance.to_csv('../../../model_results/nowcasting/neural_network/mffnn/nowcasting_performance.csv')
model_predictions.to_csv('../../../model_results/nowcasting/neural_network/mffnn/nowcasting_predictions.csv')

In [None]:
# Hand-specified (start, end) date windows per flu season, keyed by the year in
# which the season starts. Each season lists its onset, peak and tail window.
_FLU_SEASON_PERIODS = {
    2014: (
        (datetime(2014, 9, 1), datetime(2014, 12, 1)),  # Onset
        (datetime(2014, 12, 2), datetime(2015, 1, 20)),  # Peak
        (datetime(2015, 1, 21), datetime(2015, 8, 31)),  # Tail
    ),
    2015: (
        (datetime(2015, 9, 1), datetime(2015, 12, 30)),  # Onset
        (datetime(2015, 12, 31), datetime(2016, 3, 24)),  # Peak
        (datetime(2016, 3, 25), datetime(2016, 8, 31)),  # Tail
    ),
    2016: (
        (datetime(2016, 9, 1), datetime(2016, 12, 1)),  # Onset
        (datetime(2016, 12, 2), datetime(2017, 1, 30)),  # Peak
        (datetime(2017, 1, 31), datetime(2017, 8, 31)),  # Tail
    ),
    2017: (
        (datetime(2017, 9, 1), datetime(2017, 12, 30)),  # Onset
        (datetime(2017, 12, 31), datetime(2018, 2, 8)),  # Peak
        (datetime(2018, 2, 9), datetime(2018, 8, 31)),  # Tail
    ),
    2018: (
        (datetime(2018, 9, 1), datetime(2018, 12, 28)),  # Onset
        (datetime(2018, 12, 29), datetime(2019, 2, 18)),  # Peak
        (datetime(2019, 2, 19), datetime(2019, 8, 31)),  # Tail
    ),
}


def define_periods(year):
    """Return the onset, peak and tail windows of the flu season starting in ``year``.

    Returns:
        A new list of three ``(start, end)`` datetime tuples.

    Raises:
        KeyError: if ``year`` is not one of 2014-2018.
    """
    return list(_FLU_SEASON_PERIODS[year])

def analyse_flu_seasons(data, start_year, end_year):
    """Compute the MAE of the predictions within every period of every flu season.

    Seasons starting in ``start_year`` up to (but excluding) ``end_year`` are
    processed; each runs from 1 Sep to 31 Aug of the following year and is
    split into the windows returned by ``define_periods``. Windows without any
    rows are skipped.

    Args:
        data: DataFrame with ``'Date'``, ``'Actual_ILI_Rate'`` and
            ``'Predicted_ILI_Rate'`` columns.

    Returns:
        DataFrame with one row per non-empty window and the columns
        ``'Flu Season'``, ``'Period'`` and ``'MAE'``.
    """
    rows = []

    for year in range(start_year, end_year):
        season_first_day = datetime(year, 9, 1)
        season_last_day = datetime(year + 1, 8, 31)
        in_season = (data['Date'] >= season_first_day) & (data['Date'] <= season_last_day)
        season_data = data[in_season]

        for start, end in define_periods(year):
            # between() is inclusive on both ends by default.
            period_data = season_data[season_data['Date'].between(start, end)]
            if period_data.empty:
                continue

            mae = mean_absolute_error(period_data['Actual_ILI_Rate'], period_data['Predicted_ILI_Rate'])
            period_name = f"{start.strftime('%Y-%m-%d')} - {end.strftime('%Y-%m-%d')}"
            rows.append({'Flu Season': f'{year}-{year+1}', 'Period': period_name, 'MAE': mae})

    return pd.DataFrame(rows)

# Reload the saved predictions and restore the 'Date' column to datetimes.
model_predictions = (
    pd.read_csv('../../../model_results/nowcasting/neural_network/mffnn/nowcasting_predictions.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# Per-period MAE for the seasons starting in 2014 through 2018.
period_performances = analyse_flu_seasons(model_predictions, 2014, 2019)
period_performances.to_csv('../../../model_results/nowcasting/neural_network/mffnn/nowcasting_period_performance.csv', index=False)