In [1]:
# %cd code
# insert your desired path to work on
import os
from os.path import join
project_path = os.path.dirname(os.getcwd())
# os.chdir(join('..','data'))
os.getcwd()

'C:\\Users\\amanp\\Desktop\\MINOR\\projj\\code'

In [2]:
import sys
sys.path.append(join(project_path, 'code'))

In [3]:
%load_ext autoreload
%autoreload 2

**Plots settings.**

In [4]:
import matplotlib
font = {'family':'Arial', 'size':'15', 'weight':'normal'}

matplotlib.rc('font', **font)

**Set folder structure.**

In [5]:
config = {
    'main_brazil': 'Brazil',
    'main_peru': 'Peru',
    'baseline': join(project_path, "baseline_models"),
    'output': join(project_path, "code", "saved_models"),
    'metrics': join(project_path, "code", "metrics")
}
project_path

# List comprehension for the folder structure code
[os.makedirs(val, exist_ok=True) for key, val in config.items()]

[None, None, None, None, None]

# **AI4Dengue forecasting**
![](https://drive.google.com/uc?export=view&id=1J5Bt5Cks-e2IV-dEJLHJkuwXFJNFAZgr)

In [7]:
import utils
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
from glob import glob
from config import DEP_NAMES, GROUPED_VARS, DATA_REDUCER_SETTINGS, DATA_PROCESSING_SETTINGS


In [7]:
dir(utils)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'clean',
 'cx',
 'geopandas',
 'pd',
 'plist',
 'plotShape']

# Data

## Load the dataframe
**This dataframe comprises all the variables (climatic, epidemiological etc.) acquired for each Department during a defined number of years.**

In [8]:
dataframe = pd.read_csv(join('dataset', "Brazil_UF_dengue_monthly.csv"))
dataframe.head()
dataframe.iloc[1000]

Date                2004-02-01
Year                      2004
Month                        2
CD_UF                       12
area_km2            164173.431
                       ...    
rdpc_def_vulner         119.68
t_analf_18m              17.79
t_formal_18m             46.45
t_fundc_ocup18m           55.2
t_medioc_ocup18m         39.61
Name: 1000, Length: 62, dtype: object

**Load CNN results as columns to dataframe.**

In [9]:
cnn = pd.read_csv(join('saved_models', "cnn_dataframe.csv")).drop('Unnamed: 0', axis=1)
cnn['CD_UF'] = cnn['CD_UF'].astype(np.int64)

assert dataframe.shape[0] == cnn.shape[0]
assert all(dataframe['CD_UF'].unique() == cnn['CD_UF'].unique())
cnn

Unnamed: 0,CD_UF,CNN_all,CNN_0-19
0,11,1.000000,1.000000
1,11,1.000000,1.000000
2,11,1.000000,1.000000
3,11,1.000000,1.000000
4,11,32.159859,17.186546
...,...,...,...
6151,53,29.022461,16.795465
6152,53,20.277210,5.658783
6153,53,7.219064,16.862005
6154,53,17.866333,28.619926


In [10]:
dataframe.sort_values(['CD_UF', 'Date'], inplace=True, ignore_index=True)
dataframe

Unnamed: 0,Date,Year,Month,CD_UF,area_km2,NDVI_d,dewpoint_temperature_2m_d,humidity_d,max_temperature_2m_d,min_temperature_2m_d,...,pea10a14,pea15a17,pea18m,t_eletrica,t_densidadem2,rdpc_def_vulner,t_analf_18m,t_formal_18m,t_fundc_ocup18m,t_medioc_ocup18m
0,2001-01-01,2001,1,11,237765.347,0.154301,295.674980,88.460308,303.987216,294.155015,...,18698,34904,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93
1,2001-02-01,2001,2,11,237765.347,0.216873,295.944060,88.856948,304.738755,294.332566,...,18698,34904,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93
2,2001-03-01,2001,3,11,237765.347,0.239112,296.092747,89.305463,304.620829,294.304126,...,18698,34904,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93
3,2001-04-01,2001,4,11,237765.347,0.334660,296.186143,88.590375,304.168669,293.921815,...,18698,34904,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93
4,2001-05-01,2001,5,11,237765.347,0.378931,295.562972,86.939606,303.903043,293.395959,...,18698,34904,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,2019-08-01,2019,8,53,5760.784,0.362744,282.150351,42.202163,304.210083,287.271135,...,10706,36652,1361053,99.91,23.48,171.62,3.66,71.62,76.39,61.00
6152,2019-09-01,2019,9,53,5760.784,0.317748,281.820936,34.023500,307.566780,290.719267,...,10706,36652,1361053,99.91,23.48,171.62,3.66,71.62,76.39,61.00
6153,2019-10-01,2019,10,53,5760.784,0.271795,286.196146,45.486547,307.716003,291.720099,...,10706,36652,1361053,99.91,23.48,171.62,3.66,71.62,76.39,61.00
6154,2019-11-01,2019,11,53,5760.784,0.235493,290.445969,64.916154,306.706715,291.496597,...,10706,36652,1361053,99.91,23.48,171.62,3.66,71.62,76.39,61.00


In [11]:
dataframe = pd.concat([dataframe, cnn[['CNN_all', 'CNN_0-19']]], axis=1)
dataframe

Unnamed: 0,Date,Year,Month,CD_UF,area_km2,NDVI_d,dewpoint_temperature_2m_d,humidity_d,max_temperature_2m_d,min_temperature_2m_d,...,pea18m,t_eletrica,t_densidadem2,rdpc_def_vulner,t_analf_18m,t_formal_18m,t_fundc_ocup18m,t_medioc_ocup18m,CNN_all,CNN_0-19
0,2001-01-01,2001,1,11,237765.347,0.154301,295.674980,88.460308,303.987216,294.155015,...,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93,1.000000,1.000000
1,2001-02-01,2001,2,11,237765.347,0.216873,295.944060,88.856948,304.738755,294.332566,...,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93,1.000000,1.000000
2,2001-03-01,2001,3,11,237765.347,0.239112,296.092747,89.305463,304.620829,294.304126,...,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93,1.000000,1.000000
3,2001-04-01,2001,4,11,237765.347,0.334660,296.186143,88.590375,304.168669,293.921815,...,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93,1.000000,1.000000
4,2001-05-01,2001,5,11,237765.347,0.378931,295.562972,86.939606,303.903043,293.395959,...,723839,97.26,27.15,144.93,9.42,51.72,53.83,36.93,32.159859,17.186546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,2019-08-01,2019,8,53,5760.784,0.362744,282.150351,42.202163,304.210083,287.271135,...,1361053,99.91,23.48,171.62,3.66,71.62,76.39,61.00,29.022461,16.795465
6152,2019-09-01,2019,9,53,5760.784,0.317748,281.820936,34.023500,307.566780,290.719267,...,1361053,99.91,23.48,171.62,3.66,71.62,76.39,61.00,20.277210,5.658783
6153,2019-10-01,2019,10,53,5760.784,0.271795,286.196146,45.486547,307.716003,291.720099,...,1361053,99.91,23.48,171.62,3.66,71.62,76.39,61.00,7.219064,16.862005
6154,2019-11-01,2019,11,53,5760.784,0.235493,290.445969,64.916154,306.706715,291.496597,...,1361053,99.91,23.48,171.62,3.66,71.62,76.39,61.00,17.866333,28.619926


**'Clean' the dataset (e.g. remove NaN values)**

In [12]:
dataframe = utils.clean(dataframe)
dataframe.head()

Cleaning dataframe...


Unnamed: 0,Date,Year,Month,CD_UF,area_km2,NDVI_d,dewpoint_temperature_2m_d,humidity_d,max_temperature_2m_d,min_temperature_2m_d,...,t_densidadem2,rdpc_def_vulner,t_analf_18m,t_formal_18m,t_fundc_ocup18m,t_medioc_ocup18m,CNN_all,CNN_0-19,rate_total,rate_019
0,2001-01-01,2001,1,11,237765.347,0.154301,295.67498,88.460308,303.987216,294.155015,...,27.15,144.93,9.42,51.72,53.83,36.93,1.0,1.0,42.75449,29.124122
1,2001-02-01,2001,2,11,237765.347,0.216873,295.94406,88.856948,304.738755,294.332566,...,27.15,144.93,9.42,51.72,53.83,36.93,1.0,1.0,17.601025,11.718582
2,2001-03-01,2001,3,11,237765.347,0.239112,296.092747,89.305463,304.620829,294.304126,...,27.15,144.93,9.42,51.72,53.83,36.93,1.0,1.0,11.072645,6.376287
3,2001-04-01,2001,4,11,237765.347,0.33466,296.186143,88.590375,304.168669,293.921815,...,27.15,144.93,9.42,51.72,53.83,36.93,1.0,1.0,5.120298,3.791306
4,2001-05-01,2001,5,11,237765.347,0.378931,295.562972,86.939606,303.903043,293.395959,...,27.15,144.93,9.42,51.72,53.83,36.93,32.159859,17.186546,6.976406,4.652966


In [13]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6156 entries, 0 to 6155
Data columns (total 61 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       6156 non-null   object 
 1   Year                       6156 non-null   int64  
 2   Month                      6156 non-null   int64  
 3   CD_UF                      6156 non-null   int64  
 4   area_km2                   6156 non-null   float64
 5   NDVI_d                     6156 non-null   float64
 6   dewpoint_temperature_2m_d  6156 non-null   float64
 7   humidity_d                 6156 non-null   float64
 8   max_temperature_2m_d       6156 non-null   float64
 9   min_temperature_2m_d       6156 non-null   float64
 10  surface_pressure_d         6156 non-null   float64
 11  temperature_2m_d           6156 non-null   float64
 12  total_precipitation_d      6156 non-null   float64
 13  u_component_of_wind_10m_d  6156 non-null   float

## Apply Data Reduction
**Data reduction is applied to three macro groups in order to reduce the number of variables on which the AI framework will be trained. The variables belonging to each group are set with the *PCAgroups* dictionary. The groups are:**
1. ***CLIMATIC VARIABLES***,
2. ***GEO VARIABLES***,
3. ***SOCIO VARIABLES***

In [14]:
print('\033[1m PCA Excluded Variables \033[0m')
utils.plist(GROUPED_VARS['EXCLUDED'])

print('\033[1m Climatic variables \033[0m')
utils.plist(GROUPED_VARS['CLIMATIC VARIABLES'])

print('\033[1m Geo variables \033[0m')
utils.plist(GROUPED_VARS['GEO VARIABLES'])

print('\033[1m Socio variables \033[0m')
utils.plist(GROUPED_VARS['SOCIO VARIABLES'])

print('\033[1m Additional variables \033[0m')
utils.plist(GROUPED_VARS['AUXILIAR'])

print('\033[1m Dengue variables \033[0m')
utils.plist(GROUPED_VARS['DENGUE'])

[1m PCA Excluded Variables [0m
----- 1 t_fundc_ocup18m
----- 2 t_medioc_ocup18m
----- 3 PopTotal_Urban_UF
----- 4 PopTotal_Rural_UF
----- 5 total_precipitation_d
----- 6 surface_pressure_d
----- 7 area_km2
----- 8 humidity_d
----- 9 temperature_2m_d
----- 10 min_temperature_2m_d
----- 11 CNN_all
----- 12 CNN_0-19
[1m Climatic variables [0m
----- 1 dewpoint_temperature_2m_d
----- 2 max_temperature_2m_d
----- 3 u_component_of_wind_10m_d
----- 4 v_component_of_wind_10m_d
[1m Geo variables [0m
----- 1 NDVI_d
----- 2 max_elevation_d
----- 3 mean_elevation_d
----- 4 min_elevation_d
----- 5 stdDev_elevation_d
----- 6 variance_elevation_d
----- 7 Forest_Cover_Percent
----- 8 Urban_Cover_Percent
[1m Socio variables [0m
----- 1 Urban_Cover_Percent
----- 2 ivs
----- 3 ivs_infraestrutura_urbana
----- 4 ivs_capital_humano
----- 5 ivs_renda_e_trabalho
----- 6 t_sem_agua_esgoto
----- 7 t_sem_lixo
----- 8 t_vulner_mais1h
----- 9 t_analf_15m
----- 10 t_cdom_fundin
----- 11 t_p15a24_nada
----- 1

**We selected two types of data reduction methods: PCA (Principal Component Analysis) and PLS (Principal Least Square). The second one is the default solution because it reduces the input data by considering also a second variable that in our case is the Dengue Incidence Rates.**

In [15]:
from data_reduction import pca_reducer, pls_reducer

In [None]:
scaler = MinMaxScaler()

**Extract climatic, geophysical and socio-economic variables from the dataframe**

In [16]:
X_climatic = scaler.fit_transform(dataframe[GROUPED_VARS['CLIMATIC VARIABLES']].values)
X_geo = scaler.fit_transform(dataframe[GROUPED_VARS['GEO VARIABLES']].values)
X_socio = scaler.fit_transform(dataframe[GROUPED_VARS['SOCIO VARIABLES']].values)


**Extract Dengue variables from the dataframe, apply a root scaling and normalization**

In [17]:
y_dengue = dataframe[GROUPED_VARS['DENGUE']].values
scaler = MinMaxScaler()
y_dengue = scaler.fit_transform(y_dengue)

**Apply data reduction technique**

In [18]:
if DATA_REDUCER_SETTINGS['TYPE'] == 'PLS':
    climatic_vars_reduced = pls_reducer(
        X_climatic, y_dengue, 
        DATA_REDUCER_SETTINGS['NUMBER OF COMPONENTS']['CLIMATIC VARIABLES']
    )
    geo_vars_reduced = pls_reducer(
        X_geo, y_dengue, 
        DATA_REDUCER_SETTINGS['NUMBER OF COMPONENTS']['GEO VARIABLES']
    )
    socio_vars_reduced = pls_reducer(
        X_socio, y_dengue, 
        DATA_REDUCER_SETTINGS['NUMBER OF COMPONENTS']['SOCIO VARIABLES']
    )
    print(socio_vars_reduced.shape)
elif DATA_REDUCER_SETTINGS['TYPE'] == 'PCA':
    climatic_vars_reduced = pca_reducer(
        X_climatic, 
        DATA_REDUCER_SETTINGS['NUMBER OF COMPONENTS']['CLIMATIC VARIABLES']
    )
    geo_vars_reduced = pca_reducer(
        X_geo, 
        DATA_REDUCER_SETTINGS['NUMBER OF COMPONENTS']['GEO VARIABLES']
    )
    socio_vars_reduced = pca_reducer(
        X_socio, 
        DATA_REDUCER_SETTINGS['NUMBER OF COMPONENTS']['SOCIO VARIABLES']
    )
else:
    print('No data reduction.')
    climatic_vars_reduced, geo_vars_reduced, socio_vars_reduced = X_climatic, X_geo, X_socio

(6156, 10)


## Order reduced data in a new dataframe


**Normalize remaining variables**

In [19]:
x_excluded = dataframe[GROUPED_VARS['EXCLUDED']].values
x_excluded = MinMaxScaler().fit_transform(x_excluded)

In [20]:
X_auxiliar = dataframe[GROUPED_VARS['AUXILIAR']].values
X_auxiliar = MinMaxScaler().fit_transform(X_auxiliar)

**Create a new database with the reduced, the auxiliar and Dengue variables**

In [21]:
independent = {'Year':dataframe.Year.values, 'dep_id':dataframe.CD_UF.values, 't_fundc_ocup18m':x_excluded[:, 0], 't_medioc_ocup18m':x_excluded[:, 1],
               'PopTotal_Urban_UF':x_excluded[:, 2], 'PopTotal_Rural_UF':x_excluded[:, 3], 'total_precipitation_d':x_excluded[:, 4],
               'surface_pressure_d':x_excluded[:, 5], 'area_km2':x_excluded[:, 6], 'humidity_d':x_excluded[:, 7], 'temperature_2m_d':x_excluded[:, 8],
               'min_temperature_2m_d':x_excluded[:, 9], 'CNN_all':x_excluded[:, 10], 'CNN_0-19':x_excluded[:, 11]}

auxiliar    = {'Month': X_auxiliar[:, 0],
               'cases20_99': X_auxiliar[:, 1], 'cases0_19': X_auxiliar[:, 2],
               'RandEffects1':  MinMaxScaler().fit_transform(np.reshape(dataframe.CD_UF.values*dataframe.Month.values, (dataframe.CD_UF.values.shape[0], 1)))[:,0],
               'RandEffects2':  MinMaxScaler().fit_transform(np.reshape(dataframe.CD_UF.values*dataframe.Year.values, (dataframe.CD_UF.values.shape[0], 1)))[:,0],
               'RandEffects3':  MinMaxScaler().fit_transform(np.reshape(dataframe.CD_UF.values*dataframe.Month.values*dataframe.Year.values, (dataframe.CD_UF.values.shape[0], 1)))[:,0]}

climatic    = {'PCA0-Climatic':climatic_vars_reduced[:,0], 'PCA1-Climatic':climatic_vars_reduced[:,1], 'PCA2-Climatic':climatic_vars_reduced[:,2],
               'PCA3-Climatic':climatic_vars_reduced[:,3]}

geo         = {'PCA0-Geo':geo_vars_reduced[:,0], 'PCA1-Geo':geo_vars_reduced[:,1], 'PCA2-Geo':geo_vars_reduced[:,2],
               'PCA3-Geo':geo_vars_reduced[:,3], 'PCA4-Geo':geo_vars_reduced[:,4], 'PCA5-Geo':geo_vars_reduced[:,5]}

socio       = {'PCA0-Socio':socio_vars_reduced[:,0], 'PCA1-Socio':socio_vars_reduced[:,1], 'PCA2-Socio':socio_vars_reduced[:,2],
               'PCA3-Socio':socio_vars_reduced[:,3], 'PCA4-Socio':socio_vars_reduced[:,4], 'PCA5-Socio':socio_vars_reduced[:,5]}

dengue      = {'DengRate_all': y_dengue[:,0], 'DengRate_019': y_dengue[:,1]}

columns     = {**independent, **auxiliar, **climatic, **geo, **socio, **dengue}

reduced_dataframe = pd.DataFrame(columns)
reduced_dataframe.head()
reduced_dataframe.columns

# reduced_dataframe

Index(['Year', 'dep_id', 't_fundc_ocup18m', 't_medioc_ocup18m',
       'PopTotal_Urban_UF', 'PopTotal_Rural_UF', 'total_precipitation_d',
       'surface_pressure_d', 'area_km2', 'humidity_d', 'temperature_2m_d',
       'min_temperature_2m_d', 'CNN_all', 'CNN_0-19', 'Month', 'cases20_99',
       'cases0_19', 'RandEffects1', 'RandEffects2', 'RandEffects3',
       'PCA0-Climatic', 'PCA1-Climatic', 'PCA2-Climatic', 'PCA3-Climatic',
       'PCA0-Geo', 'PCA1-Geo', 'PCA2-Geo', 'PCA3-Geo', 'PCA4-Geo', 'PCA5-Geo',
       'PCA0-Socio', 'PCA1-Socio', 'PCA2-Socio', 'PCA3-Socio', 'PCA4-Socio',
       'PCA5-Socio', 'DengRate_all', 'DengRate_019'],
      dtype='object')

In [22]:
reduced_dataframe=reduced_dataframe[reduced_dataframe['Year']>=2004]
reduced_dataframe.to_csv('reduced_2004_2019.csv', index=False)
reduced_dataframe.shape

(5184, 38)

## Create training and validation data
**First of all, the dataframe is divided in two sub-dataframes (training and validation) by using the variable *Year***

In [23]:
dataset_path = os.path.join("..", "google trends", "merged_dataset.csv")
reduced_dataframe = pd.read_csv(dataset_path)
# Print shape and column names
print("Dataset Shape:", reduced_dataframe.shape)
print("Columns:", reduced_dataframe.columns)

Dataset Shape: (5184, 41)
Columns: Index(['Year', 'dep_id', 't_fundc_ocup18m', 't_medioc_ocup18m',
       'PopTotal_Urban_UF', 'PopTotal_Rural_UF', 'total_precipitation_d',
       'surface_pressure_d', 'area_km2', 'humidity_d', 'temperature_2m_d',
       'min_temperature_2m_d', 'CNN_all', 'CNN_0-19', 'Month', 'cases20_99',
       'cases0_19', 'RandEffects1', 'RandEffects2', 'RandEffects3',
       'PCA0-Climatic', 'PCA1-Climatic', 'PCA2-Climatic', 'PCA3-Climatic',
       'PCA0-Geo', 'PCA1-Geo', 'PCA2-Geo', 'PCA3-Geo', 'PCA4-Geo', 'PCA5-Geo',
       'PCA0-Socio', 'PCA1-Socio', 'PCA2-Socio', 'PCA3-Socio', 'PCA4-Socio',
       'PCA5-Socio', 'mosquito_interest', 'sintomas_dengue_interest',
       'dengue_interest', 'DengRate_all', 'DengRate_019'],
      dtype='object')


In [24]:
training_dataframe = reduced_dataframe[reduced_dataframe.Year < 2017].copy()
validation_dataframe = reduced_dataframe[reduced_dataframe.Year >= 2017].copy()
print(f"Training Size: {len(training_dataframe)}, Validation Size: {len(validation_dataframe)}")

# Check for missing values
print("Missing values in training set:", training_dataframe.isnull().sum().sum())
print("Missing values in validation set:", validation_dataframe.isnull().sum().sum())
training_dataframe.head()
validation_dataframe.head()

Training Size: 4212, Validation Size: 972
Missing values in training set: 0
Missing values in validation set: 0


Unnamed: 0,Year,dep_id,t_fundc_ocup18m,t_medioc_ocup18m,PopTotal_Urban_UF,PopTotal_Rural_UF,total_precipitation_d,surface_pressure_d,area_km2,humidity_d,...,PCA1-Socio,PCA2-Socio,PCA3-Socio,PCA4-Socio,PCA5-Socio,mosquito_interest,sintomas_dengue_interest,dengue_interest,DengRate_all,DengRate_019
156,2017,11,0.206751,0.12853,0.020497,0.089637,0.470017,0.816201,0.149352,0.93308,...,0.096634,0.179967,0.325246,0.604686,0.076292,0.11,0.24,0.14,0.032628,0.031062
157,2017,11,0.206751,0.12853,0.020497,0.089637,0.455807,0.813839,0.149352,0.957511,...,0.096634,0.179967,0.325246,0.604686,0.076292,0.14,0.35,0.14,0.028203,0.036438
158,2017,11,0.206751,0.12853,0.020497,0.089637,0.409449,0.81587,0.149352,0.945,...,0.096634,0.179967,0.325246,0.604686,0.076292,0.26,0.22,0.19,0.020461,0.02031
159,2017,11,0.206751,0.12853,0.020497,0.089637,0.239287,0.822995,0.149352,0.947705,...,0.096634,0.179967,0.325246,0.604686,0.076292,0.15,0.15,0.18,0.008185,0.007766
160,2017,11,0.206751,0.12853,0.020497,0.089637,0.123361,0.827288,0.149352,0.935646,...,0.096634,0.179967,0.325246,0.604686,0.076292,0.12,0.1,0.1,0.005032,0.006372


In [25]:
# training_dataframe["DengRate_all_log"] = np.log1p(training_dataframe["DengRate_all"])
# validation_dataframe["DengRate_all_log"] = np.log1p(validation_dataframe["DengRate_all"])

In [26]:
# import matplotlib.pyplot as plt

# # Replace 'rate_total' with 'DengRate_all' (or 'DengRate_019' if needed)
# plt.hist(training_dataframe["DengRate_all"], bins=30, alpha=0.5, label="Train")
# plt.hist(validation_dataframe["DengRate_all"], bins=30, alpha=0.5, label="Validation")
# plt.legend()
# plt.show()

**Then the dataset handler is initialized. This object will handle all the operations needed to create, reshape and augment the training and validation dataset to fit the requirements of each Deep Learning or Machine Learning model.**

In [27]:
from datasetHandler import datasetHandler


In [28]:
# # Drop excluded columns before feeding into DatasetHandler
# excluded_columns = GROUPED_VARS["EXCLUDED"]
# training_dataframe_filtered = training_dataframe.drop(columns=excluded_columns, errors="ignore")
# validation_dataframe_filtered = validation_dataframe.drop(columns=excluded_columns, errors="ignore")

In [29]:
# Initialize dataset handler with filtered data
dataset_handler = datasetHandler(training_dataframe, validation_dataframe)

In [30]:
# Fetch training/validation data & indices
x_train, y_train, x_val, y_val, train_indices, val_indices = dataset_handler.get_data(
    DATA_PROCESSING_SETTINGS["T LEARNING"], DATA_PROCESSING_SETTINGS["T PREDICTION"]
)

# Print shapes
print("\nX Training shape:", x_train.shape)
print("Y Training shape:", y_train.shape)
print("X Validation shape:", x_val.shape)
print("Y Validation shape:", y_val.shape)

X Training shape: (3888, 12, 41)
Y Training shape: (3888, 2)
X Validation shape: (648, 12, 41)
Y Validation shape: (648, 2)

X Training shape: (3888, 12, 41)
Y Training shape: (3888, 2)
X Validation shape: (648, 12, 41)
Y Validation shape: (648, 2)


**Apply data augmention**

In [31]:
# Apply augmentation with dynamic noise scaling
x_train_a, y_train_a, x_val_a, y_val_a = dataset_handler.augment(
    x_train, y_train, x_val, y_val, DATA_PROCESSING_SETTINGS["AUGMENTATION"]
)

# Print shapes
print("X Training Augmented shape:", x_train_a.shape)
print("Y Training Augmented shape:", y_train_a.shape)
print("X Validation Augmented shape:", x_val_a.shape)
print("Y Validation Augmented shape:", y_val_a.shape)


X Training Augmented shape: (11664, 12, 41)
Y Training Augmented shape: (11664, 2)
X Validation Augmented shape: (1944, 12, 41)
Y Validation Augmented shape: (1944, 2)


# TCN

In [32]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, BatchNormalization, Dropout, Input, Activation
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from sklearn.metrics import root_mean_squared_error
from keras.metrics import MeanSquaredError, MeanAbsoluteError
from keras_tuner import HyperModel, RandomSearch
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from datetime import datetime
from glob import glob
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc, average_precision_score

In [39]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Even though ExponentialDecay is imported, it is not actually used in this code.
from tensorflow.keras.optimizers.schedules import ExponentialDecay  
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K



LSTM_SETTINGS = {
    'EPOCHS': 200,
    'LEARNING RATE': 0.0001,
    'BATCH SIZE': 16,
    'EARLY STOPPING': 24,
    'DROPOUT_RATE': 0.3,
    'L2_REGULARIZATION': 1e-4,
    'NUM_RESIDUAL_BLOCKS': 4,
    'NUM_FILTERS': 64
}

def build_tcn_model_v2(input_shape, output_units, num_filters, dropout_rate, l2_reg, num_residual_blocks):
    def residual_block(x, filters, dilation_rate):
        shortcut = x
        x = layers.Conv1D(filters, 3, padding='causal', activation='relu',
                         dilation_rate=dilation_rate, kernel_regularizer=regularizers.l2(l2_reg))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(dropout_rate)(x)
        x = layers.add([x, shortcut])
        return layers.ReLU()(x)

    inputs = layers.Input(shape=input_shape)
    x = layers.Conv1D(num_filters, 3, padding='causal', activation='relu')(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout_rate * 0.5)(x)
    
    for dilation in [1, 2, 4, 8][:num_residual_blocks]:
        x = residual_block(x, num_filters, dilation)
        
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(output_units, activation='linear')(x)
    return models.Model(inputs, outputs)

    
class ImprovedTCNNet:
    def __init__(self, shape, output_units=2, num_filters=LSTM_SETTINGS['NUM_FILTERS'],
                 dropout_rate=LSTM_SETTINGS['DROPOUT_RATE'], l2_regularization=LSTM_SETTINGS['L2_REGULARIZATION'],
                 num_residual_blocks=LSTM_SETTINGS['NUM_RESIDUAL_BLOCKS']):
        self.shape = shape
        self.epochs = LSTM_SETTINGS['EPOCHS']
        self.batch_size = LSTM_SETTINGS['BATCH SIZE']
        self.lr = LSTM_SETTINGS['LEARNING RATE']
        self.early_stopping_rounds = LSTM_SETTINGS['EARLY STOPPING']
        self.dropout_rate = dropout_rate
        self.l2_regularization = l2_regularization
        self.num_filters = num_filters
        self.num_residual_blocks = num_residual_blocks
        
        # Build the uncompiled model
        self.model = build_tcn_model_v2(
            self.shape, output_units, self.num_filters,
            self.dropout_rate, self.l2_regularization, self.num_residual_blocks
        )
        
        # Compile the model
        self.model.compile(
            optimizer=Adam(learning_rate=self.lr),
            loss='Huber',
            metrics=['mae']
        )
    
    def load(self, model_path):
        """
        Load a saved model from the specified path.
        """
        self.model = tf.keras.models.load_model(model_path)
        print(f"Model loaded successfully from {model_path}")



    
    def train(self, training, validation, output_path):
        """Train the TCN model with early stopping and learning rate reduction."""
        # Early Stopping
        es = EarlyStopping(
            monitor='val_loss',
            patience=self.early_stopping_rounds,
            restore_best_weights=True
        )
        
        # Reduce LR on Plateau
        lr_scheduler = ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            verbose=1
        )
        
        # Training
        history = self.model.fit(
            x=training[0],
            y=training[1],
            validation_data=(validation[0], validation[1]),
            epochs=self.epochs,
            batch_size=self.batch_size,
            callbacks=[es, lr_scheduler],
            shuffle=True
        )

        # Plot the training history
        plt.figure(figsize=(8, 6))
        plt.plot(history.history['loss'], label='Train Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.legend()
        plt.title('Improved TCN Model Loss Curve')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.show()

        # Save the model
        today = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
        model_filename = f"TCN-new-search-{today}.keras"
        os.makedirs(output_path, exist_ok=True)
        save_path = os.path.join(output_path, model_filename)
        self.model.save(save_path)
        print(f"Model saved to {save_path}")

        return history

In [40]:
import shap
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [44]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from glob import glob  # Needed if you use glob(...) below

# Sklearn metrics
from sklearn.metrics import (mean_squared_error, mean_absolute_error,
                             mean_absolute_percentage_error, r2_score,
                             roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay)

output_path = os.path.join(config['output'], "Brazil")
os.makedirs(output_path, exist_ok=True)

# Define training flag
TRAINING = True  # Set to False to skip training and load an existing model

# Utility functions
def calculate_nrmse(true_values, predicted_values):
    # Replaces the undefined root_mean_squared_error
    # with a direct sqrt(mean_squared_error(...))
    rmse = np.sqrt(mean_squared_error(true_values, predicted_values))
    return rmse / (true_values.max() - true_values.min())

def calculate_mae(true_values, predicted_values):
    return mean_absolute_error(true_values, predicted_values)

def calculate_mse(true_values, predicted_values):
    return mean_squared_error(true_values, predicted_values)

def calculate_rmse(mse):
    return mse ** 0.5

def calculate_mape(true_values, predicted_values):
    return mean_absolute_percentage_error(true_values, predicted_values)

def calculate_r2(true_values, predicted_values):
    return r2_score(true_values, predicted_values)

# Discretize continuous values into binary for confusion matrix
def discretize_to_binary(values, threshold=0.5):
    return np.where(values > threshold, 1, 0)

# Function to plot a confusion matrix
def plot_confusion_matrix(true_values, predicted_values, department_name, metric_type="All"):
    cm = confusion_matrix(true_values, predicted_values)
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f'Confusion Matrix: {department_name} - {metric_type}')
    plt.show()

if TRAINING:
    print("Training the model...")
    # Make sure you specify output_path with os.path.join
    print("Preparing and Training the model...")
    trainingT, validationT = dataset_handler.prepare_data_LSTM(x_train, y_train, x_val, y_val)
    # **Dynamically determine input shape from the prepared data:**
    input_shape_dynamic = trainingT[0].shape[1:]
    print(f"Dynamically determined input shape from data: {input_shape_dynamic}")

    hyperparameter_settings = {
        'DROPOUT_RATE': 0.4,       # Example: Increased dropout
        'L2_REGULARIZATION': 1e-4,  # Example: Increased L2 regularization
        'NUM_RESIDUAL_BLOCKS': 3,   # Example: Reduced residual blocks
        'NUM_FILTERS': 64          # Example: Reduced filters
    }
    
    tcn = ImprovedTCNNet(
        shape=input_shape_dynamic,
        dropout_rate=hyperparameter_settings.get('DROPOUT_RATE', LSTM_SETTINGS['DROPOUT_RATE']),
        l2_regularization=hyperparameter_settings.get('L2_REGULARIZATION', LSTM_SETTINGS['L2_REGULARIZATION']),
        num_filters=hyperparameter_settings.get('NUM_FILTERS', LSTM_SETTINGS['NUM_FILTERS']),
        num_residual_blocks=hyperparameter_settings.get('NUM_RESIDUAL_BLOCKS', LSTM_SETTINGS['NUM_RESIDUAL_BLOCKS'])
    )
    history = tcn.train(training=trainingT,
                        validation=validationT,
                        output_path=os.path.join(config['output'], "Brazil"))
else:
    print("Checking for saved models...")
    tcn_models = glob(os.path.join(output_path, "TCN-new-search-*.keras"))
    if not tcn_models:
        print('No file with such pattern was found in the directory. Run TRAINING = True first.')
        exit()
    else:
        # Load the most recent TCN model
        tcn = ImprovedTCNNet((None,))  # Dummy shape, will be replaced by loaded model
        tcn.load(tcn_models[-1])
        
        print(f"Loading model from: {tcn_models[-1]}")

    # Re-prepare the data for inference
    (trainingT_X, trainingT_Y), (validationT_X, validationT_Y) = dataset_handler.prepare_data_LSTM(x_train, y_train, x_val, y_val)

    trainT, valT = dataset_handler.prepare_data_LSTM(x_train[:,:,2:], y_train,
                                                     x_val[:,:,2:], y_val)

    # Suppose val_indices, train_indices, scaler, etc. are also defined
    y_val_indices_df = pd.DataFrame(val_indices, columns=['actual_index'])
    y_train_indices_df = pd.DataFrame(train_indices, columns=['actual_index'])

    # Make predictions
    preds_tra = tcn.model.predict(trainT[0])
    preds_tra[preds_tra < 0] = 0

    preds_val = tcn.model.predict(valT[0])
    preds_val[preds_val < 0] = 0

    # Inverse transform predictions and ground truth if using scaling
    preds_val_original = scaler.inverse_transform(preds_val)
    y_val_original = scaler.inverse_transform(valT[1])
    preds_tra_original = scaler.inverse_transform(preds_tra)
    y_train_original = scaler.inverse_transform(trainT[1])

    # Collect results by department
    results = []

    for department_idx, department_name in DEP_NAMES.items():
        department_rows_val = validation_dataframe[validation_dataframe['dep_id'] == department_idx]
        department_rows_train = training_dataframe[training_dataframe['dep_id'] == department_idx]

        if department_rows_val.empty or department_rows_train.empty:
            continue

        department_indices_val = department_rows_val.index.tolist()
        department_indices_train = department_rows_train.index.tolist()

        matching_indices_val = y_val_indices_df[y_val_indices_df['actual_index'].isin(department_indices_val)].index
        matching_indices_train = y_train_indices_df[y_train_indices_df['actual_index'].isin(department_indices_train)].index

        if matching_indices_val.empty or matching_indices_train.empty:
            continue

        # Split into DengRate_all (col 0) and DengRate_019 (col 1)
        true_dengrate_all_val = y_val_original[matching_indices_val, 0]
        true_dengrate_019_val = y_val_original[matching_indices_val, 1]
        predicted_dengrate_all_val = preds_val_original[matching_indices_val, 0]
        predicted_dengrate_019_val = preds_val_original[matching_indices_val, 1]

        true_dengrate_all_train = y_train_original[matching_indices_train, 0]
        true_dengrate_019_train = y_train_original[matching_indices_train, 1]
        predicted_dengrate_all_train = preds_tra_original[matching_indices_train, 0]
        predicted_dengrate_019_train = preds_tra_original[matching_indices_train, 1]

        # Calculate metrics for DengRate_all
        mse_dengrate_all_val = calculate_mse(true_dengrate_all_val, predicted_dengrate_all_val)
        mse_dengrate_all_train = calculate_mse(true_dengrate_all_train, predicted_dengrate_all_train)

        rmse_dengrate_all_val = calculate_rmse(mse_dengrate_all_val)
        rmse_dengrate_all_train = calculate_rmse(mse_dengrate_all_train)

        mae_dengrate_all_val = calculate_mae(true_dengrate_all_val, predicted_dengrate_all_val)
        mae_dengrate_all_train = calculate_mae(true_dengrate_all_train, predicted_dengrate_all_train)

        mape_dengrate_all_val = calculate_mape(true_dengrate_all_val, predicted_dengrate_all_val)
        mape_dengrate_all_train = calculate_mape(true_dengrate_all_train, predicted_dengrate_all_train)

        r2_dengrate_all_val = calculate_r2(true_dengrate_all_val, predicted_dengrate_all_val)
        r2_dengrate_all_train = calculate_r2(true_dengrate_all_train, predicted_dengrate_all_train)

        # Calculate metrics for DengRate_019
        mse_dengrate_019_val = calculate_mse(true_dengrate_019_val, predicted_dengrate_019_val)
        mse_dengrate_019_train = calculate_mse(true_dengrate_019_train, predicted_dengrate_019_train)

        rmse_dengrate_019_val = calculate_rmse(mse_dengrate_019_val)
        rmse_dengrate_019_train = calculate_rmse(mse_dengrate_019_train)

        mae_dengrate_019_val = calculate_mae(true_dengrate_019_val, predicted_dengrate_019_val)
        mae_dengrate_019_train = calculate_mae(true_dengrate_019_train, predicted_dengrate_019_train)

        mape_dengrate_019_val = calculate_mape(true_dengrate_019_val, predicted_dengrate_019_val)
        mape_dengrate_019_train = calculate_mape(true_dengrate_019_train, predicted_dengrate_019_train)

        r2_dengrate_019_val = calculate_r2(true_dengrate_019_val, predicted_dengrate_019_val)
        r2_dengrate_019_train = calculate_r2(true_dengrate_019_train, predicted_dengrate_019_train)

        # Discretize predictions for confusion matrix (if you want binary classification)
        true_dengrate_all_val_bin = discretize_to_binary(true_dengrate_all_val)
        predicted_dengrate_all_val_bin = discretize_to_binary(predicted_dengrate_all_val)

        true_dengrate_all_train_bin = discretize_to_binary(true_dengrate_all_train)
        predicted_dengrate_all_train_bin = discretize_to_binary(predicted_dengrate_all_train)

        # Append results for this department
        results.append({
            'Department': department_name,
            'MAE (DengRate_all) Val': mae_dengrate_all_val,
            'RMSE (DengRate_all) Val': rmse_dengrate_all_val,
            'MAPE (DengRate_all) Val': mape_dengrate_all_val,
            'R2 (DengRate_all) Val': r2_dengrate_all_val,
            'MSE (DengRate_all) Val': mse_dengrate_all_val,

            'MAE (DengRate_all) Train': mae_dengrate_all_train,
            'RMSE (DengRate_all) Train': rmse_dengrate_all_train,
            'MAPE (DengRate_all) Train': mape_dengrate_all_train,
            'R2 (DengRate_all) Train': r2_dengrate_all_train,
            'MSE (DengRate_all) Train': mse_dengrate_all_train,

            'MAE (DengRate_019) Val': mae_dengrate_019_val,
            'RMSE (DengRate_019) Val': rmse_dengrate_019_val,
            'MAPE (DengRate_019) Val': mape_dengrate_019_val,
            'R2 (DengRate_019) Val': r2_dengrate_019_val,
            'MSE (DengRate_019) Val': mse_dengrate_019_val,

            'MAE (DengRate_019) Train': mae_dengrate_019_train,
            'RMSE (DengRate_019) Train': rmse_dengrate_019_train,
            'MAPE (DengRate_019) Train': mape_dengrate_019_train,
            'R2 (DengRate_019) Train': r2_dengrate_019_train,
            'MSE (DengRate_019) Train': mse_dengrate_019_train,
        })

        # Create or update DataFrame for metrics
        results_df = pd.DataFrame(results)
        today = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
        out_csv = os.path.join(config['metrics'], "Brazil",
                               f'TCN_new_model_search_{today}.csv')
        os.makedirs(os.path.dirname(out_csv), exist_ok=True)
        results_df.to_csv(out_csv, index=False)
        print(f"Results saved to {out_csv}")

      

Checking for saved models...


ValueError: Input 0 of layer "conv1d_5" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (None, None)

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from glob import glob  # Needed if you use glob(...) below

# Sklearn metrics
from sklearn.metrics import (mean_squared_error, mean_absolute_error,
                             mean_absolute_percentage_error, r2_score,
                             roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay)

# Example config usage
# config = {
#     'output': './some_output_dir',
#     'metrics': './some_metrics_dir'
# }
# DEP_NAMES, scaler, training_dataframe, validation_dataframe, etc. assumed defined

# Define output path
output_path = os.path.join(config['output'], "Brazil")
os.makedirs(output_path, exist_ok=True)

# Define training flag
TRAINING = True  # Set to False to skip training and load an existing model

# Utility functions
def calculate_nrmse(true_values, predicted_values):
    # Replaces the undefined root_mean_squared_error
    # with a direct sqrt(mean_squared_error(...))
    rmse = np.sqrt(mean_squared_error(true_values, predicted_values))
    return rmse / (true_values.max() - true_values.min())

def calculate_mae(true_values, predicted_values):
    return mean_absolute_error(true_values, predicted_values)

def calculate_mse(true_values, predicted_values):
    return mean_squared_error(true_values, predicted_values)

def calculate_rmse(mse):
    return mse ** 0.5

def calculate_mape(true_values, predicted_values):
    return mean_absolute_percentage_error(true_values, predicted_values)

def calculate_r2(true_values, predicted_values):
    return r2_score(true_values, predicted_values)

# Discretize continuous values into binary for confusion matrix
def discretize_to_binary(values, threshold=0.5):
    return np.where(values > threshold, 1, 0)

# Function to plot a confusion matrix
def plot_confusion_matrix(true_values, predicted_values, department_name, metric_type="All"):
    cm = confusion_matrix(true_values, predicted_values)
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f'Confusion Matrix: {department_name} - {metric_type}')
    plt.show()

# ------------------------------------------------------------------------------
# TCN object creation, data prep, etc. assumed done above (for example):
#    trainingT, validationT = dataset_handler.prepare_data_LSTM(...)
#    tcn = ImprovedTCNNet(trainingT[0].shape[1:])
# ------------------------------------------------------------------------------
# print(f"Value of train_model: {train_model}") # Add this line
if TRAINING:
    print("Training the model...")
    # Make sure you specify output_path with os.path.join
    print("Preparing and Training the model...")
    trainingT, validationT = dataset_handler.prepare_data_LSTM(x_train, y_train, x_val, y_val)
    # **Dynamically determine input shape from the prepared data:**
    input_shape_dynamic = trainingT[0].shape[1:]
    print(f"Dynamically determined input shape from data: {input_shape_dynamic}")

    hyperparameter_settings = {
        'DROPOUT_RATE': 0.4,       # Example: Increased dropout
        'L2_REGULARIZATION': 1e-4,  # Example: Increased L2 regularization
        'NUM_RESIDUAL_BLOCKS': 3,   # Example: Reduced residual blocks
        'NUM_FILTERS': 64          # Example: Reduced filters
    }
    
    tcn = ImprovedTCNNet(
        shape=input_shape_dynamic,
        dropout_rate=hyperparameter_settings.get('DROPOUT_RATE', LSTM_SETTINGS['DROPOUT_RATE']),
        l2_regularization=hyperparameter_settings.get('L2_REGULARIZATION', LSTM_SETTINGS['L2_REGULARIZATION']),
        num_filters=hyperparameter_settings.get('NUM_FILTERS', LSTM_SETTINGS['NUM_FILTERS']),
        num_residual_blocks=hyperparameter_settings.get('NUM_RESIDUAL_BLOCKS', LSTM_SETTINGS['NUM_RESIDUAL_BLOCKS'])
    )
    history = tcn.train(training=trainingT,
                        validation=validationT,
                        output_path=os.path.join(config['output'], "Brazil"))
else:
    print("Checking for saved models...")
    tcn_models = glob(os.path.join(output_path, "TCN-new-search-*.keras"))
    if not tcn_models:
        print('No file with such pattern was found in the directory. Run TRAINING = True first.')
        exit()
    else:
        # Load the most recent TCN model
        tcn = ImprovedTCNNet((None,))  # Dummy shape, will be replaced by loaded model
        tcn.load(tcn_models[-1])
        
        print(f"Loading model from: {tcn_models[-1]}")

    # Re-prepare the data for inference
    (trainingT_X, trainingT_Y), (validationT_X, validationT_Y) = dataset_handler.prepare_data_LSTM(x_train, y_train, x_val, y_val)

    trainT, valT = dataset_handler.prepare_data_LSTM(x_train[:,:,2:], y_train,
                                                     x_val[:,:,2:], y_val)

    # Suppose val_indices, train_indices, scaler, etc. are also defined
    y_val_indices_df = pd.DataFrame(val_indices, columns=['actual_index'])
    y_train_indices_df = pd.DataFrame(train_indices, columns=['actual_index'])

    # Make predictions
    preds_tra = tcn.model.predict(trainT[0])
    preds_tra[preds_tra < 0] = 0

    preds_val = tcn.model.predict(valT[0])
    preds_val[preds_val < 0] = 0

    # Inverse transform predictions and ground truth if using scaling
    preds_val_original = scaler.inverse_transform(preds_val)
    y_val_original = scaler.inverse_transform(valT[1])
    preds_tra_original = scaler.inverse_transform(preds_tra)
    y_train_original = scaler.inverse_transform(trainT[1])

    # Collect results by department
    results = []

    for department_idx, department_name in DEP_NAMES.items():
        department_rows_val = validation_dataframe[validation_dataframe['dep_id'] == department_idx]
        department_rows_train = training_dataframe[training_dataframe['dep_id'] == department_idx]

        if department_rows_val.empty or department_rows_train.empty:
            continue

        department_indices_val = department_rows_val.index.tolist()
        department_indices_train = department_rows_train.index.tolist()

        matching_indices_val = y_val_indices_df[y_val_indices_df['actual_index'].isin(department_indices_val)].index
        matching_indices_train = y_train_indices_df[y_train_indices_df['actual_index'].isin(department_indices_train)].index

        if matching_indices_val.empty or matching_indices_train.empty:
            continue

        # Split into DengRate_all (col 0) and DengRate_019 (col 1)
        true_dengrate_all_val = y_val_original[matching_indices_val, 0]
        true_dengrate_019_val = y_val_original[matching_indices_val, 1]
        predicted_dengrate_all_val = preds_val_original[matching_indices_val, 0]
        predicted_dengrate_019_val = preds_val_original[matching_indices_val, 1]

        true_dengrate_all_train = y_train_original[matching_indices_train, 0]
        true_dengrate_019_train = y_train_original[matching_indices_train, 1]
        predicted_dengrate_all_train = preds_tra_original[matching_indices_train, 0]
        predicted_dengrate_019_train = preds_tra_original[matching_indices_train, 1]

        # Calculate metrics for DengRate_all
        mse_dengrate_all_val = calculate_mse(true_dengrate_all_val, predicted_dengrate_all_val)
        mse_dengrate_all_train = calculate_mse(true_dengrate_all_train, predicted_dengrate_all_train)

        rmse_dengrate_all_val = calculate_rmse(mse_dengrate_all_val)
        rmse_dengrate_all_train = calculate_rmse(mse_dengrate_all_train)

        mae_dengrate_all_val = calculate_mae(true_dengrate_all_val, predicted_dengrate_all_val)
        mae_dengrate_all_train = calculate_mae(true_dengrate_all_train, predicted_dengrate_all_train)

        mape_dengrate_all_val = calculate_mape(true_dengrate_all_val, predicted_dengrate_all_val)
        mape_dengrate_all_train = calculate_mape(true_dengrate_all_train, predicted_dengrate_all_train)

        r2_dengrate_all_val = calculate_r2(true_dengrate_all_val, predicted_dengrate_all_val)
        r2_dengrate_all_train = calculate_r2(true_dengrate_all_train, predicted_dengrate_all_train)

        # Calculate metrics for DengRate_019
        mse_dengrate_019_val = calculate_mse(true_dengrate_019_val, predicted_dengrate_019_val)
        mse_dengrate_019_train = calculate_mse(true_dengrate_019_train, predicted_dengrate_019_train)

        rmse_dengrate_019_val = calculate_rmse(mse_dengrate_019_val)
        rmse_dengrate_019_train = calculate_rmse(mse_dengrate_019_train)

        mae_dengrate_019_val = calculate_mae(true_dengrate_019_val, predicted_dengrate_019_val)
        mae_dengrate_019_train = calculate_mae(true_dengrate_019_train, predicted_dengrate_019_train)

        mape_dengrate_019_val = calculate_mape(true_dengrate_019_val, predicted_dengrate_019_val)
        mape_dengrate_019_train = calculate_mape(true_dengrate_019_train, predicted_dengrate_019_train)

        r2_dengrate_019_val = calculate_r2(true_dengrate_019_val, predicted_dengrate_019_val)
        r2_dengrate_019_train = calculate_r2(true_dengrate_019_train, predicted_dengrate_019_train)

        # Discretize predictions for confusion matrix (if you want binary classification)
        true_dengrate_all_val_bin = discretize_to_binary(true_dengrate_all_val)
        predicted_dengrate_all_val_bin = discretize_to_binary(predicted_dengrate_all_val)

        true_dengrate_all_train_bin = discretize_to_binary(true_dengrate_all_train)
        predicted_dengrate_all_train_bin = discretize_to_binary(predicted_dengrate_all_train)

        # Append results for this department
        results.append({
            'Department': department_name,
            'MAE (DengRate_all) Val': mae_dengrate_all_val,
            'RMSE (DengRate_all) Val': rmse_dengrate_all_val,
            'MAPE (DengRate_all) Val': mape_dengrate_all_val,
            'R2 (DengRate_all) Val': r2_dengrate_all_val,
            'MSE (DengRate_all) Val': mse_dengrate_all_val,

            'MAE (DengRate_all) Train': mae_dengrate_all_train,
            'RMSE (DengRate_all) Train': rmse_dengrate_all_train,
            'MAPE (DengRate_all) Train': mape_dengrate_all_train,
            'R2 (DengRate_all) Train': r2_dengrate_all_train,
            'MSE (DengRate_all) Train': mse_dengrate_all_train,

            'MAE (DengRate_019) Val': mae_dengrate_019_val,
            'RMSE (DengRate_019) Val': rmse_dengrate_019_val,
            'MAPE (DengRate_019) Val': mape_dengrate_019_val,
            'R2 (DengRate_019) Val': r2_dengrate_019_val,
            'MSE (DengRate_019) Val': mse_dengrate_019_val,

            'MAE (DengRate_019) Train': mae_dengrate_019_train,
            'RMSE (DengRate_019) Train': rmse_dengrate_019_train,
            'MAPE (DengRate_019) Train': mape_dengrate_019_train,
            'R2 (DengRate_019) Train': r2_dengrate_019_train,
            'MSE (DengRate_019) Train': mse_dengrate_019_train,
        })

        # Create or update DataFrame for metrics
        results_df = pd.DataFrame(results)
        today = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
        out_csv = os.path.join(config['metrics'], "Brazil",
                               f'TCN_new_model_search_{today}.csv')
        os.makedirs(os.path.dirname(out_csv), exist_ok=True)
        results_df.to_csv(out_csv, index=False)
        print(f"Results saved to {out_csv}")

        # Optional: Plot confusion matrices or ROC curves
        # plot_confusion_matrix(true_dengrate_all_val_bin, predicted_dengrate_all_val_bin, department_name, "Validation")
        # plot_confusion_matrix(true_dengrate_all_train_bin, predicted_dengrate_all_train_bin, department_name, "Training")
        #
        # fpr, tpr, _ = roc_curve(true_dengrate_all_val_bin, predicted_dengrate_all_val_bin)
        # roc_auc = auc(fpr, tpr)
        # plt.figure(figsize=(8, 6))
        # plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        # plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        # plt.xlim([0.0, 1.0])
        # plt.ylim([0.0, 1.05])
        # plt.xlabel('False Positive Rate')
        # plt.ylabel('True Positive Rate')
        # plt.title('Receiver Operating Characteristic (ROC) Curve')
        # plt.legend(loc='lower right')
        # plt.show()


In [None]:
print(f"y_val_original shape: {y_val_original.shape}")
print(f"matching_indices_val: {matching_indices_val}")
print(f"y_train_original shape: {y_train_original.shape}")
print(f"matching_indices_train: {matching_indices_train}")

In [None]:
mse_dengrate_all = mean_squared_error(y_val_original[:, 0], preds_val_original[:, 0])
rmse_dengrate_all = np.sqrt(mse_dengrate_all)
mae_dengrate_all = mean_absolute_error(y_val_original[:, 0], preds_val_original[:, 0])
mape_dengrate_all = mean_absolute_percentage_error(y_val_original[:, 0], preds_val_original[:, 0])
r2_dengrate_all = r2_score(y_val_original[:, 0], preds_val_original[:, 0])

# For the second output (DengRate_019):
mse_dengrate_019 = mean_squared_error(y_val_original[:, 1], preds_val_original[:, 1])
rmse_dengrate_019 = np.sqrt(mse_dengrate_019)
mae_dengrate_019 = mean_absolute_error(y_val_original[:, 1], preds_val_original[:, 1])
mape_dengrate_019 = mean_absolute_percentage_error(y_val_original[:, 1], preds_val_original[:, 1])
r2_dengrate_019 = r2_score(y_val_original[:, 1], preds_val_original[:, 1])

# Print the results
print(f"MSE DengRate_all: {mse_dengrate_all}, RMSE DengRate_all: {rmse_dengrate_all}, MAE DengRate_all: {mae_dengrate_all}, MAPE DengRate_all: {mape_dengrate_all}, R2 DengRate_all: {r2_dengrate_all}")
print(f"MSE DengRate_019: {mse_dengrate_019}, RMSE DengRate_019: {rmse_dengrate_019}, MAE DengRate_019: {mae_dengrate_019}, MAPE DengRate_019: {mape_dengrate_019}, R2 DengRate_019: {r2_dengrate_019}")

In [None]:
# import numpy as np
# import pandas as pd
# import shap
# import matplotlib.pyplot as plt
# import seaborn as sns
# import os
# import glob
# from datetime import datetime
# from sklearn.metrics import (
#     mean_absolute_percentage_error, 
#     mean_squared_error, 
#     r2_score, 
#     mean_absolute_error, 
#     confusion_matrix
# )


# def calculate_metrics(y_true, y_pred):
#     """Calculate regression metrics."""
#     return {
#         'MAE': mean_absolute_error(y_true, y_pred),
#         'MSE': mean_squared_error(y_true, y_pred),
#         'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
#         'MAPE': mean_absolute_percentage_error(y_true, y_pred),
#         'R2': r2_score(y_true, y_pred)
#     }


    
# # Define Paths
# output_path = os.path.join(config['output'], "Brazil")
# os.makedirs(output_path, exist_ok=True)

# #  Load Latest Model
# print("🔄 Checking for saved models...")
# tcn_models = sorted(glob.glob(os.path.join(output_path, "TCN-*.keras")), key=os.path.getmtime, reverse=True)

# if tcn_models:
#     latest_model = tcn_models[0]  # Get the most recent model
#     print(f"✅ Latest model found: {latest_model}")
    
#     # Initialize TCN model with correct input shape before loading weights
#     tcn = ImprovedTCNNet(trainingT[0].shape[1:])
    
#     try:
#         tcn.load(latest_model)
#         print(f"✅ Model successfully loaded from: {latest_model}")
#     except Exception as e:
#         print(f"❌ Error loading model: {e}")
#         exit()
# else:
#     print("❌ No saved model found in path:", output_path)
#     exit()

# # Prepare Data
# trainT, valT = dataset_handler.prepare_data_LSTM(x_train[:,:,2:], y_train, x_val[:,:,2:], y_val)

# # Make Predictions
# print("🔄 Making Predictions...")
# preds_train = np.maximum(tcn.model.predict(trainT[0]), 0)
# preds_val = np.maximum(tcn.model.predict(valT[0]), 0)

# # Reverse Scaling
# preds_train_original = scaler.inverse_transform(preds_train)
# preds_val_original = scaler.inverse_transform(preds_val)
# y_train_original = scaler.inverse_transform(trainT[1])
# y_val_original = scaler.inverse_transform(valT[1])

# # Store Results
# results = []

# for department_idx, department_name in DEP_NAMES.items():
#     print(f"🔍 Processing Department: {department_name}")

#     department_rows_val = validation_dataframe[validation_dataframe['dep_id'] == department_idx]
#     department_rows_train = training_dataframe[training_dataframe['dep_id'] == department_idx]

#     if department_rows_val.empty or department_rows_train.empty:
#         print(f"⚠️ Skipping {department_name}: No data found in validation or training sets")
#         continue

#     department_indices_val = department_rows_val.index.tolist()
#     department_indices_train = department_rows_train.index.tolist()

#     matching_indices_val = np.intersect1d(val_indices, department_indices_val)
#     matching_indices_train = np.intersect1d(train_indices, department_indices_train)

#     if matching_indices_val.size == 0 or matching_indices_train.size == 0:
#         print(f"⚠️ Skipping {department_name}: No valid indices found after filtering")
#         continue

    
    
#     # Filter out invalid indices for validation set
#     valid_indices_val = [idx for idx in matching_indices_val if idx < y_val_original.shape[0]]
#     print(f"Filtered validation indices: {valid_indices_val}")
#     if not valid_indices_val:
#         print(f"⚠️ Skipping {department_name}: No valid indices found for validation set")
#         continue
    
#     # Extract true and predicted values for validation set
#     true_dengrate_all_val = y_val_original[valid_indices_val, 0]
#     predicted_dengrate_all_val = preds_val_original[valid_indices_val, 0]
    
#     # Filter out invalid indices for training set
#     valid_indices_train = [idx for idx in matching_indices_train if idx < y_train_original.shape[0]]
#     print(f"Filtered training indices: {valid_indices_train}")
#     if not valid_indices_train:
#         print(f"⚠️ Skipping {department_name}: No valid indices found for training set")
#         continue
    
#     # Extract true and predicted values for training set
#     true_dengrate_all_train = y_train_original[valid_indices_train, 0]
#     predicted_dengrate_all_train = preds_train_original[valid_indices_train, 0]
    
#     # Calculate and store metrics
#     metrics_all_val = calculate_metrics(true_dengrate_all_val, predicted_dengrate_all_val)
#     metrics_all_train = calculate_metrics(true_dengrate_all_train, predicted_dengrate_all_train)
    
#     results.append({
#         'Department': department_name,
#         **{f'{key} (DengRate_all) Val': value for key, value in metrics_all_val.items()},
#         **{f'{key} (DengRate_all) Train': value for key, value in metrics_all_train.items()},
#     })

# # Save Results
# results_df = pd.DataFrame(results)
# metrics_path = os.path.join(config['metrics'], "Brazil", f'TCN_metrics_{datetime.now().strftime('%d-%m-%Y-%H-%M-%S')}.csv')
# os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
# results_df.to_csv(metrics_path, index=False)
# print(f"✅ Results saved to {metrics_path}")

# # Apply SHAP Explanation
# feature_names = list(training_dataframe.columns[2:])
# plot_shap_explanation(tcn.model, trainT[0], feature_names)


In [None]:
# import numpy as np
# import pandas as pd
# import shap
# import matplotlib.pyplot as plt
# import seaborn as sns
# import os
# import glob
# from datetime import datetime
# from sklearn.metrics import (
#     mean_absolute_percentage_error, 
#     mean_squared_error, 
#     r2_score, 
#     mean_absolute_error, 
#     confusion_matrix
# )


# def calculate_metrics(y_true, y_pred):
#     """Calculate regression metrics."""
#     return {
#         'MAE': mean_absolute_error(y_true, y_pred),
#         'MSE': mean_squared_error(y_true, y_pred),
#         'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
#         'MAPE': mean_absolute_percentage_error(y_true, y_pred),
#         'R2': r2_score(y_true, y_pred)
#     }


    
# # Define Paths
# output_path = os.path.join(config['output'], "Brazil")
# os.makedirs(output_path, exist_ok=True)

# #  Load Latest Model
# print("🔄 Checking for saved models...")
# tcn_models = sorted(glob.glob(os.path.join(output_path, "TCN-*.keras")), key=os.path.getmtime, reverse=True)

# if tcn_models:
#     latest_model = tcn_models[0]  # Get the most recent model
#     print(f"✅ Latest model found: {latest_model}")
    
#     # Initialize TCN model with correct input shape before loading weights
#     tcn = ImprovedTCNNet(trainingT[0].shape[1:])
    
#     try:
#         tcn.load(latest_model)
#         print(f"✅ Model successfully loaded from: {latest_model}")
#     except Exception as e:
#         print(f"❌ Error loading model: {e}")
#         exit()
# else:
#     print("❌ No saved model found in path:", output_path)
#     exit()

# # Prepare Data
# trainT, valT = dataset_handler.prepare_data_LSTM(x_train[:,:,2:], y_train, x_val[:,:,2:], y_val)

# # Make Predictions
# print("🔄 Making Predictions...")
# preds_train = np.maximum(tcn.model.predict(trainT[0]), 0)
# preds_val = np.maximum(tcn.model.predict(valT[0]), 0)

# # Reverse Scaling
# preds_train_original = scaler.inverse_transform(preds_train)
# preds_val_original = scaler.inverse_transform(preds_val)
# y_train_original = scaler.inverse_transform(trainT[1])
# y_val_original = scaler.inverse_transform(valT[1])

# # Store Results
# results = []

# for department_idx, department_name in DEP_NAMES.items():
#     print(f"🔍 Processing Department: {department_name}")

#     department_rows_val = validation_dataframe[validation_dataframe['dep_id'] == department_idx]
#     department_rows_train = training_dataframe[training_dataframe['dep_id'] == department_idx]

#     if department_rows_val.empty or department_rows_train.empty:
#         print(f"⚠️ Skipping {department_name}: No data found in validation or training sets")
#         continue

#     department_indices_val = department_rows_val.index.tolist()
#     department_indices_train = department_rows_train.index.tolist()

#     matching_indices_val = np.intersect1d(val_indices, department_indices_val)
#     matching_indices_train = np.intersect1d(train_indices, department_indices_train)

#     if matching_indices_val.size == 0 or matching_indices_train.size == 0:
#         print(f"⚠️ Skipping {department_name}: No valid indices found after filtering")
#         continue


    
#     # Extract true and predicted values
#     true_dengrate_all_val = y_val_original[matching_indices_val, 0]
#     predicted_dengrate_all_val = preds_val_original[matching_indices_val, 0]

#     true_dengrate_all_train = y_train_original[matching_indices_train, 0]
#     predicted_dengrate_all_train = preds_train_original[matching_indices_train, 0]

#     # Calculate and store metrics
#     metrics_all_val = calculate_metrics(true_dengrate_all_val, predicted_dengrate_all_val)
#     metrics_all_train = calculate_metrics(true_dengrate_all_train, predicted_dengrate_all_train)

#     results.append({
#         'Department': department_name,
#         **{f'{key} (DengRate_all) Val': value for key, value in metrics_all_val.items()},
#         **{f'{key} (DengRate_all) Train': value for key, value in metrics_all_train.items()},
#     })

# # Save Results
# results_df = pd.DataFrame(results)
# metrics_path = os.path.join(config['metrics'], "Brazil", f'TCN_metrics_{datetime.now().strftime('%d-%m-%Y-%H-%M-%S')}.csv')
# os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
# results_df.to_csv(metrics_path, index=False)
# print(f"✅ Results saved to {metrics_path}")

# # Apply SHAP Explanation
# feature_names = list(training_dataframe.columns[2:])
# plot_shap_explanation(tcn.model, trainT[0], feature_names)


In [None]:
# print(f"y_val_original shape: {y_val_original.shape}")
# print(f"matching_indices_val: {matching_indices_val}")
# print(f"y_train_original shape: {y_train_original.shape}")
# print(f"matching_indices_train: {matching_indices_train}")

In [None]:
# import glob
# import os

# # Get all saved model files
# tcn_models = glob.glob(os.path.join(output_path, "TCN-new-search-*.keras"))

# if not tcn_models:
#     print('❌ No saved model found. Run TRAINING = True first.')
#     exit()
# # else:
#     # Sort models by modification time (newest first)
#     tcn_models.sort(key=os.path.getmtime, reverse=True)
    
#     latest_model = tcn_models[0]  # Pick the latest model
#     tcn.load(latest_model)
#     print(f"✅ Model loaded from: {latest_model}")


In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers, models, regularizers, Input
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.optimizers.schedules import ExponentialDecay
# import matplotlib.pyplot as plt
# import os
# import glob
# import numpy as np
# import pandas as pd
# import seaborn as sns
# from datetime import datetime
# from sklearn.metrics import (
#     mean_absolute_percentage_error, 
#     mean_squared_error, 
#     r2_score, 
#     mean_absolute_error, 
#     confusion_matrix
# )

# # Training Configuration
# LSTM_SETTINGS = {
#     'EPOCHS': 150,
#     'LEARNING RATE': 0.0001,
#     'BATCH SIZE': 32,
#     'EARLY STOPPING': 20
# }

# # Define paths
# output_path = os.path.join(config['output'], "Brazil")
# os.makedirs(output_path, exist_ok=True)

# # Training Flag
# TRAINING = True  # Set to False for evaluation

# # Function to build TCN model
# def build_tcn_model_v2(input_shape, output_units):
#     def residual_block(x, filters, dilation_rate):
#         shortcut = layers.Conv1D(filters, kernel_size=1, padding='same')(x)
#         x = layers.Conv1D(filters, kernel_size=3, padding='causal', activation='relu', dilation_rate=dilation_rate,
#                           kernel_regularizer=regularizers.l2(1e-4))(x)
#         x = layers.BatchNormalization()(x)
#         x = layers.Dropout(0.3)(x)
#         x = layers.add([x, shortcut])
#         x = layers.ReLU()(x)
#         return x

#     inputs = Input(shape=input_shape)
#     x = layers.Conv1D(64, kernel_size=3, padding='causal', activation='relu')(inputs)
#     x = layers.BatchNormalization()(x)
#     x = layers.Dropout(0.2)(x)

#     for dilation_rate in [1, 2, 4, 8]:
#         x = residual_block(x, 64, dilation_rate)

#     x = layers.GlobalAveragePooling1D()(x)
#     outputs = layers.Dense(output_units, activation='linear')(x)

#     optimizer = tf.keras.optimizers.Adam(learning_rate=ExponentialDecay(
#         LSTM_SETTINGS['LEARNING RATE'], decay_steps=1000, decay_rate=0.96, staircase=True))
    
#     model = models.Model(inputs, outputs)
#     model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=['mae'])
#     return model

# # Improved TCN class
# class ImprovedTCNNet:
#     def __init__(self, shape, output_units=2):
#         self.shape = shape
#         self.model = build_tcn_model_v2(self.shape, output_units)
#         self.epochs = LSTM_SETTINGS['EPOCHS']
#         self.batch_size = LSTM_SETTINGS['BATCH SIZE']
#         self.early_stopping_rounds = LSTM_SETTINGS['EARLY STOPPING']
    
#     def load(self, model_path):
#         self.model = tf.keras.models.load_model(model_path)
#         print(f"✅ Model loaded from {model_path}")

#     def train(self, training, validation, output_path):
#         """Train the TCN model with early stopping."""
#         es = EarlyStopping(monitor='val_loss', patience=self.early_stopping_rounds, restore_best_weights=True)
#         history = self.model.fit(
#             x=training[0], y=training[1],
#             validation_data=(validation[0], validation[1]),
#             epochs=self.epochs, batch_size=self.batch_size,
#             callbacks=[es], shuffle=True)

#         plt.plot(history.history['loss'], label='Train Loss')
#         plt.plot(history.history['val_loss'], label='Validation Loss')
#         plt.legend()
#         plt.title("Training Loss Curve")
#         plt.show()

#         model_filename = f"TCN-{datetime.now().strftime('%d-%m-%Y-%H-%M-%S')}.keras"
#         save_path = os.path.join(output_path, model_filename)
#         self.model.save(save_path)
#         print(f"✅ Model saved at {save_path}")
#         return history

# # Prepare data for training
# trainingT, validationT = dataset_handler.prepare_data_LSTM(x_train_a[:,:,2:], y_train_a, x_val_a[:,:,2:], y_val_a)

# # Initialize Model
# tcn = ImprovedTCNNet(trainingT[0].shape[1:])

# # **TRAIN THE MODEL BEFORE MAKING PREDICTIONS**
# if TRAINING:
#     print("🔄 Training the model...")
#     tcn.train(trainingT, validationT, output_path)
# else:
#     # **Load Latest Saved Model for Evaluation**
#     print("🔄 Checking for saved models...")
#     tcn_models = sorted(glob.glob(os.path.join(output_path, "TCN-*.keras")), key=os.path.getmtime, reverse=True)
#     if tcn_models:
#         latest_model = tcn_models[0]
#         tcn.load(latest_model)
#         print(f"✅ Model loaded from: {latest_model}")
#     else:
#         print("❌ No saved model found.")
#         exit()

# # **MAKE PREDICTIONS AFTER TRAINING OR LOADING**
# print("🔄 Making Predictions...")
# preds_train = np.maximum(tcn.model.predict(trainingT[0]), 0)
# preds_val = np.maximum(tcn.model.predict(validationT[0]), 0)

# # **Reverse Scaling**
# preds_train_original = scaler.inverse_transform(preds_train)
# preds_val_original = scaler.inverse_transform(preds_val)
# y_train_original = scaler.inverse_transform(trainingT[1])
# y_val_original = scaler.inverse_transform(validationT[1])

# # **Save Model Predictions**
# results = []

# for department_idx, department_name in DEP_NAMES.items():
#     print(f"🔍 Processing Department: {department_name}")

#     department_rows_val = validation_dataframe[validation_dataframe['dep_id'] == department_idx]
#     department_rows_train = training_dataframe[training_dataframe['dep_id'] == department_idx]

#     if department_rows_val.empty or department_rows_train.empty:
#         print(f"⚠️ Skipping {department_name}: No data found in validation or training sets")
#         continue

#     department_indices_val = department_rows_val.index.tolist()
#     department_indices_train = department_rows_train.index.tolist()

#     matching_indices_val = np.intersect1d(val_indices, department_indices_val)
#     matching_indices_train = np.intersect1d(train_indices, department_indices_train)

#     if matching_indices_val.size == 0 or matching_indices_train.size == 0:
#         print(f"⚠️ Skipping {department_name}: No valid indices found after filtering")
#         continue

#     # Extract true and predicted values
#     true_dengrate_all_val = y_val_original[matching_indices_val, 0]
#     predicted_dengrate_all_val = preds_val_original[matching_indices_val, 0]

#     true_dengrate_all_train = y_train_original[matching_indices_train, 0]
#     predicted_dengrate_all_train = preds_train_original[matching_indices_train, 0]

#     # Calculate and store metrics
#     metrics_all_val = calculate_metrics(true_dengrate_all_val, predicted_dengrate_all_val)
#     metrics_all_train = calculate_metrics(true_dengrate_all_train, predicted_dengrate_all_train)

#     results.append({
#         'Department': department_name,
#         **{f'{key} (DengRate_all) Val': value for key, value in metrics_all_val.items()},
#         **{f'{key} (DengRate_all) Train': value for key, value in metrics_all_train.items()},
#     })

# # Save Results
# results_df = pd.DataFrame(results)
# metrics_path = os.path.join(config['metrics'], "Brazil", f'TCN_metrics_{datetime.now().strftime('%d-%m-%Y-%H-%M-%S')}.csv")
# os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
# results_df.to_csv(metrics_path, index=False)
# print(f"✅ Results saved to {metrics_path}")

# # Apply SHAP after training
# feature_names = list(training_dataframe.columns[2:])
# plot_shap_explanation(tcn.model, trainingT[0], feature_names)


In [None]:
# import os
# import numpy as np
# import matplotlib.pyplot as plt
# import pandas as pd
# from datetime import datetime
# from glob import glob  # Needed if you use glob(...) below

# # Sklearn metrics
# from sklearn.metrics import (mean_squared_error, mean_absolute_error,
#                             mean_absolute_percentage_error, r2_score,
#                             roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay)

# # Tensorflow imports (ensure these are present if not already imported)
# import tensorflow as tf
# from tensorflow.keras import layers, models, regularizers, Input
# from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# from tensorflow.keras.optimizers import Adam
# import tensorflow.keras.backend as K

# # Define output path - assuming config is defined elsewhere as per previous corrections
# output_path = os.path.join(config['output'], "Brazil")
# os.makedirs(output_path, exist_ok=True)

# # Define training flag
# TRAINING = True  # Set to True to train the model

# # Utility functions (same as before)
# def calculate_nrmse(true_values, predicted_values):
#     rmse = np.sqrt(mean_squared_error(true_values, predicted_values))
#     return rmse / (true_values.max() - true_values.min())

# def calculate_mae(true_values, predicted_values):
#     return mean_absolute_error(true_values, predicted_values)

# def calculate_mse(true_values, predicted_values):
#     return mean_squared_error(true_values, predicted_values)

# def calculate_rmse(mse):
#     return mse ** 0.5

# def calculate_mape(true_values, predicted_values):
#     return mean_absolute_percentage_error(true_values, predicted_values)

# def calculate_r2(true_values, predicted_values):
#     return r2_score(true_values, predicted_values)

# def discretize_to_binary(values, threshold=0.5):
#     return np.where(values > threshold, 1, 0)

# def plot_confusion_matrix(true_values, predicted_values, department_name, metric_type="All"):
#     cm = confusion_matrix(true_values, predicted_values)
#     fig, ax = plt.subplots(figsize=(6, 6))
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm)
#     disp.plot(ax=ax, cmap='Blues', values_format='d')
#     ax.set_title(f'Confusion Matrix: {department_name} - {metric_type}')
#     plt.show()

# # ------------------------------------------------------------------------------
# # Modified TCN Model Definition to accept num_filters
# # ------------------------------------------------------------------------------
# LSTM_SETTINGS = { # Keep LSTM_SETTINGS for training parameters
#     'EPOCHS': 200,
#     'LEARNING RATE': 0.0001,
#     'BATCH SIZE': 16,
#     'OPTIMZER': 'rmsprop', # Not used directly in this code, kept for reference
#     'LOSS': 'mae',
#     'EVALUATION METRIC': ['mse'],
#     'EARLY STOPPING': 12
# }

# def build_tcn_model_v2(input_shape, output_units, num_filters=64): # ADDED num_filters with default
#     """Builds the TCN model structure with adjustable number of filters."""
#     def residual_block(x, filters, dilation_rate): # filters now comes from outer function
#         shortcut = x  # Residual connection

#         x = layers.Conv1D(
#             filters=filters, # Use 'filters' argument
#             kernel_size=3,
#             padding='causal',
#             activation='relu',
#             dilation_rate=dilation_rate,
#             kernel_regularizer=regularizers.l2(1e-4)
#         )(x)
#         x = layers.BatchNormalization()(x)
#         x = layers.Dropout(0.3)(x)

#         x = layers.Conv1D(
#             filters=filters, # Use 'filters' argument
#             kernel_size=3,
#             padding='causal',
#             activation='relu',
#             dilation_rate=dilation_rate,
#             kernel_regularizer=regularizers.l2(1e-4)
#         )(x)
#         x = layers.BatchNormalization()(x)
#         x = layers.Dropout(0.3)(x)

#         x = layers.add([x, shortcut])
#         x = layers.ReLU()(x)
#         return x

#     inputs = Input(shape=input_shape)
#     x = layers.Conv1D(
#         filters=num_filters, # Use 'num_filters' for initial layer
#         kernel_size=3,
#         padding='causal',
#         activation='relu'
#     )(inputs)
#     x = layers.BatchNormalization()(x)
#     x = layers.Dropout(0.2)(x)

#     for dilation_rate in [1, 2, 4, 8]:
#         x = residual_block(x, filters=num_filters, dilation_rate=dilation_rate) # Use 'num_filters' for residual blocks

#     x = layers.Flatten()(x)
#     outputs = layers.Dense(units=output_units, activation='linear')(x)

#     model = models.Model(inputs, outputs)
#     return model

# class ImprovedTCNNet:
#     """Wrapper class for TCN model, now with num_filters as parameter."""
#     def __init__(self, shape, output_units=2, num_filters=64): # ADDED num_filters with default
#         self.shape = shape
#         self.epochs = LSTM_SETTINGS['EPOCHS']
#         self.batch_size = LSTM_SETTINGS['BATCH SIZE']
#         self.lr = LSTM_SETTINGS['LEARNING RATE']
#         self.early_stopping_rounds = LSTM_SETTINGS['EARLY STOPPING']
#         self.num_filters = num_filters # Store num_filters

#         # Build the uncompiled model, passing num_filters
#         self.model = build_tcn_model_v2(self.shape, output_units, num_filters=self.num_filters)

#         # Compile the model (same as before)
#         self.model.compile(
#             optimizer=Adam(learning_rate=self.lr),
#             loss='Huber',
#             metrics=['mae']
#         )

#     def load(self, model_path):
#         """Loads a saved model."""
#         self.model = tf.keras.models.load_model(model_path)
#         print(f"Model loaded successfully from {model_path}")

#     def train(self, training, validation, output_path):
#         """Trains the TCN model."""
#         es = EarlyStopping(monitor='val_loss', patience=self.early_stopping_rounds, restore_best_weights=True)
#         lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1)
#         print(">>> Actual LR before fit:", K.get_value(self.model.optimizer.lr))
#         history = self.model.fit(
#             x=training[0], y=training[1],
#             validation_data=(validation[0], validation[1]),
#             epochs=self.epochs, batch_size=self.batch_size,
#             callbacks=[es, lr_scheduler], shuffle=True
#         )

#         plt.figure(figsize=(8, 6))
#         plt.plot(history.history['loss'], label='Train Loss')
#         plt.plot(history.history['val_loss'], label='Validation Loss')
#         plt.legend()
#         plt.title(f'Improved TCN Model Loss Curve (Filters={self.num_filters})') # Added num_filters to title
#         plt.xlabel('Epochs')
#         plt.ylabel('Loss')
#         plt.show()

#         today = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
#         model_filename = f"TCN-new-search-filters{self.num_filters}-{today}.keras" # Filename includes num_filters
#         os.makedirs(output_path, exist_ok=True)
#         save_path = os.path.join(output_path, model_filename)
#         self.model.save(save_path)
#         print(f"Model saved to {save_path}")
#         return history

# # ------------------------------------------------------------------------------
# # Main execution block (TRAINING and EVALUATION)
# # ------------------------------------------------------------------------------

# if TRAINING:
#     print("Training the model...")
#     # --- Instantiate ImprovedTCNNet here, you can now optionally set num_filters ---
#     trainingT, validationT = dataset_handler.prepare_data_LSTM(x_train[:,:,2:], y_train,
#                                                                  x_val[:,:,2:], y_val) # Assuming these are prepared
#     tcn = ImprovedTCNNet(trainingT[0].shape[1:], num_filters=128) # Example: Using 128 filters

#     history = tcn.train(training=trainingT,
#                           validation=validationT,
#                           output_path=os.path.join(config['output'], "Brazil"))
# else:
#     print("Checking for saved models...")
#     tcn_models = glob(os.path.join(output_path, "TCN-new-search-*.keras")) # Might need to adjust glob pattern to match new filenames
#     if not tcn_models:
#         print('No file with such pattern was found. Run TRAINING = True first.')
#         exit()
#     else:
#         tcn_models.sort(key=os.path.getmtime)
#         tcn.load(tcn_models[-1]) # Load latest model
#         print(f"Loading model from: {tcn_models[-1]}")

#     # --- Data preparation for evaluation (same as in the `else` block previously) ---
#     trainT, valT = dataset_handler.prepare_data_LSTM(x_train[:,:,2:], y_train,
#                                                         x_val[:,:,2:], y_val)

#     y_val_indices_df = pd.DataFrame(val_indices, columns=['actual_index'])
#     y_train_indices_df = pd.DataFrame(train_indices, columns=['actual_index'])

#     preds_tra = tcn.model.predict(trainT[0])
#     preds_tra[preds_tra < 0] = 0
#     preds_val = tcn.model.predict(valT[0])
#     preds_val[preds_val < 0] = 0

#     preds_val_original = scaler.inverse_transform(preds_val)
#     y_val_original = scaler.inverse_transform(valT[1])
#     preds_tra_original = scaler.inverse_transform(preds_tra)
#     y_train_original = scaler.inverse_transform(trainT[1])

#     results = []

#     for department_idx, department_name in DEP_NAMES.items():
#         department_rows_val = validation_dataframe[validation_dataframe['dep_id'] == department_idx]
#         department_rows_train = training_dataframe[training_dataframe['dep_id'] == department_idx]

#         if department_rows_val.empty or department_rows_train.empty:
#             continue

#         department_indices_val = department_rows_val.index.tolist()
#         department_indices_train = department_rows_train.index.tolist()

#         matching_indices_val = y_val_indices_df[y_val_indices_df['actual_index'].isin(department_indices_val)].index
#         matching_indices_train = y_train_indices_df[y_train_indices_df['actual_index'].isin(department_indices_train)].index

#         if matching_indices_val.empty or matching_indices_train.empty:
#             continue

#         true_dengrate_all_val = y_val_original[matching_indices_val, 0]
#         true_dengrate_019_val = y_val_original[matching_indices_val, 1]
#         predicted_dengrate_all_val = preds_val_original[matching_indices_val, 0]
#         predicted_dengrate_019_val = preds_val_original[matching_indices_val, 1]

#         true_dengrate_all_train = y_train_original[matching_indices_train, 0]
#         true_dengrate_019_train = y_train_original[matching_indices_train, 1]
#         predicted_dengrate_all_train = preds_tra_original[matching_indices_train, 0]
#         predicted_dengrate_019_train = preds_tra_original[matching_indices_train, 1]

#         mse_dengrate_all_val = calculate_mse(true_dengrate_all_val, predicted_dengrate_all_val)
#         mse_dengrate_all_train = calculate_mse(true_dengrate_all_train, predicted_dengrate_all_train)
#         rmse_dengrate_all_val = calculate_rmse(mse_dengrate_all_val)
#         rmse_dengrate_all_train = calculate_rmse(mse_dengrate_all_train)
#         mae_dengrate_all_val = calculate_mae(true_dengrate_all_val, predicted_dengrate_all_val)
#         mae_dengrate_all_train = calculate_mae(true_dengrate_all_train, predicted_dengrate_all_train)
#         mape_dengrate_all_val = calculate_mape(true_dengrate_all_val, predicted_dengrate_all_val)
#         mape_dengrate_all_train = calculate_mape(true_dengrate_all_train, predicted_dengrate_all_train)
#         r2_dengrate_all_val = calculate_r2(true_dengrate_all_val, predicted_dengrate_all_val)
#         r2_dengrate_all_train = calculate_r2(true_dengrate_all_train, predicted_dengrate_all_train)

#         mse_dengrate_019_val = calculate_mse(true_dengrate_019_val, predicted_dengrate_019_val)
#         mse_dengrate_019_train = calculate_mse(true_dengrate_019_train, predicted_dengrate_019_train)
#         rmse_dengrate_019_val = calculate_rmse(mse_dengrate_019_val)
#         rmse_dengrate_019_train = calculate_rmse(mse_dengrate_019_train)
#         mae_dengrate_019_val = calculate_mae(true_dengrate_019_val, predicted_dengrate_019_val)
#         mae_dengrate_019_train = calculate_mae(true_dengrate_019_train, predicted_dengrate_019_train)
#         mape_dengrate_019_val = calculate_mape(true_dengrate_019_val, predicted_dengrate_019_val)
#         mape_dengrate_019_train = calculate_mape(true_dengrate_019_train, predicted_dengrate_019_train)
#         r2_dengrate_019_val = calculate_r2(true_dengrate_019_val, predicted_dengrate_019_val)
#         r2_dengrate_019_train = calculate_r2(true_dengrate_019_train, predicted_dengrate_019_train)

#         true_dengrate_all_val_bin = discretize_to_binary(true_dengrate_all_val)
#         predicted_dengrate_all_val_bin = discretize_to_binary(predicted_dengrate_all_val)
#         true_dengrate_all_train_bin = discretize_to_binary(true_dengrate_all_train)
#         predicted_dengrate_all_train_bin = discretize_to_binary(predicted_dengrate_all_train)

#         results.append({
#             'Department': department_name,
#             'MAE (DengRate_all) Val': mae_dengrate_all_val,
#             'RMSE (DengRate_all) Val': rmse_dengrate_all_val,
#             'MAPE (DengRate_all) Val': mape_dengrate_all_val,
#             'R2 (DengRate_all) Val': r2_dengrate_all_val,
#             'MSE (DengRate_all) Val': mse_dengrate_all_val,
#             'MAE (DengRate_all) Train': mae_dengrate_all_train,
#             'RMSE (DengRate_all) Train': rmse_dengrate_all_train,
#             'MAPE (DengRate_all) Train': mape_dengrate_all_train,
#             'R2 (DengRate_all) Train': r2_dengrate_all_train,
#             'MSE (DengRate_all) Train': mse_dengrate_all_train,
#             'MAE (DengRate_019) Val': mae_dengrate_019_val,
#             'RMSE (DengRate_019) Val': rmse_dengrate_019_val,
#             'MAPE (DengRate_019) Val': mape_dengrate_019_val,
#             'R2 (DengRate_019) Val': r2_dengrate_019_val,
#             'MSE (DengRate_019) Val': mse_dengrate_019_val,
#             'MAE (DengRate_019) Train': mae_dengrate_019_train,
#             'RMSE (DengRate_019) Train': rmse_dengrate_019_train,
#             'MAPE (DengRate_019) Train': mape_dengrate_019_train,
#             'R2 (DengRate_019) Train': r2_dengrate_019_train,
#             'MSE (DengRate_019) Train': mse_dengrate_019_train,
#         })

#         results_df = pd.DataFrame(results)
#         today = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
#         out_csv = os.path.join(config['metrics'], "Brazil",
#                                  f'TCN_new_model_search_filters{tcn.num_filters}_{today}.csv') # Filename now includes filter count
#         os.makedirs(os.path.dirname(out_csv), exist_ok=True)
#         results_df.to_csv(out_csv, index=False)
#         print(f"Results saved to {out_csv}")

#         # Optional plotting - confusion matrices or ROC curves can be added here