In [None]:
import numpy as np
import pandas as pd
import pymc as pm
import matplotlib.pyplot as plt
import os
import shutil
from tqdm import tqdm
from scipy.stats import uniform, norm, multivariate_normal
from agent_based_model import load_data, preprocess_data
from main_pool import Main

In [None]:
class ABM_tuned:
    """Agent-Based model parameter estimation class"""
    
    def __init__(self, observed_data, data_path="./chelyabinsk_10/", days=range(1, 100)): # days to predict
        """Initializion"""
        self.observed_data = observed_data
        self.data_path = data_path
        self.days = days
        self.strains_keys = ['H1N1', 'H3N2', 'B']
        
        # we read data only once
        print("Loading and preprocessing data...")
        self.data, self.households, self.dict_school_id = load_data(data_path)
        self.data, self.households, self.dict_school_id = preprocess_data(
            self.data, self.households, self.dict_school_id)
        
        # store history matching results
        self.hm_results = None
    
    def simulator_function(self, alpha, lmbd):
        """Run ABM simulation with given parameters"""
        # temporary directory for simulation results
        sim_dir = f"temp_sim_{np.random.randint(0, 100000)}/"
        if not os.path.exists(sim_dir):
            os.makedirs(sim_dir)
        
        try:
            pool = Main(
                strains_keys=self.strains_keys,
                infected_init=[10, 0, 0],
                alpha=[float(alpha), float(alpha), float(alpha)],
                lmbd=float(lmbd)
            )
             
            pool.runs_params(
                num_runs=1,
                days=[1, len(self.days)],
                data_folder=self.data_path
            )
            
            pool.age_groups_params(
                age_groups=['0-10', '11-17', '18-59', '60-150'],
                vaccined_fraction=[0, 0, 0, 0]
            )
            
            pool.start(with_seirb=True)
            
            # load results of the simulation
            results_path = os.path.join(pool.results_dir, "prevalence_seed_0.csv")
            if os.path.exists(results_path):
                sim_results = pd.read_csv(results_path, sep='\t')
                return sim_results
            else:
                return None
            
        except Exception as e:
            print(f"Simulation error: {e}")
            return None
            
        finally:
            if os.path.exists(sim_dir):
                shutil.rmtree(sim_dir)
    
    def calculate_distance(self, sim_data):
        """ distance between simulated and observed data"""
        if sim_data is None:
            return np.inf
        
        try:
            min_len = min(len(self.observed_data), len(sim_data))
            obs = self.observed_data['H1N1'].values[:min_len]
            sim = sim_data['H1N1'].values[:min_len]
            
            # mean squared error
            distance = np.mean((obs - sim)**2)
            return distance
        except Exception as e:
            print(f"Error calculating distance: {e}")
            return np.inf
    
    def history_matching(self, prior_ranges, n_samples=100, epsilon=3, adaptive=False, accept_ratio=0.2):
        """history matching to find plausible parameter regions uses uniform random sampling within prior parameter ranges"""
        print(f"Running history matching with {n_samples} samples...") # n_samples means how many times we randomly pick the parameters

        # generate samples from prior ranges
        samples = []
        for _ in range(n_samples):
            sample = {}
            for param, (min_val, max_val) in prior_ranges.items():
                sample[param] = uniform.rvs(loc=min_val, scale=max_val-min_val)
            samples.append(sample)

        # run simulations and calculate distances
        results = []
        for sample in tqdm(samples):
            sim_data = self.simulator_function(sample["alpha"], sample["lmbd"])
            distance = self.calculate_distance(sim_data)
    
            # trajectory data
            result_dict = {
            "alpha": sample["alpha"],
            "lmbd": sample["lmbd"],
            "distance": distance
            }
        
            # add trajectory to results dictionary
            if sim_data is not None:
                result_dict["trajectory"] = sim_data["H1N1"].copy()
    
            results.append(result_dict)

        results_df = pd.DataFrame(results)

        # distance statistics
        #print(f"Distance stats: min={results_df['distance'].min()}, max={results_df['distance'].max()}, mean={results_df['distance'].mean()}")

        # filter results 
        if adaptive: # by acceptance ratio
            n_accept = max(1, int(len(results_df) * accept_ratio))
            accepted = results_df.nsmallest(n_accept, "distance")
        else: # fixed threshold
            accepted = results_df[results_df["distance"] < epsilon]

        print(f"Accepted {len(accepted)} parameter sets")
    
        # results for other methods to use
        self.hm_results = accepted

        return accepted

    
    def rejection_abc(self, n_samples=100, epsilon=1e-5, adaptive=True, accept_ratio=0.01):
        """ ABC rejection sampling based on history matching results"""
        if self.hm_results is None:
            raise ValueError("One must run history_matching before rejection_abc")
            
        print(f"Running ABC rejection with {n_samples} samples from History Matching...")
        
        # we use history matching results to define parameter ranges
        alpha_min = self.hm_results['alpha'].min()
        alpha_max = self.hm_results['alpha'].max()
        lmbd_min = self.hm_results['lmbd'].min()
        lmbd_max = self.hm_results['lmbd'].max()
        
        # sample from history matching parameter space
        samples = []
        for _ in range(n_samples):
            # randomly select a parameter set from history matching results
            hm_idx = np.random.randint(0, len(self.hm_results))
            hm_sample = self.hm_results.iloc[hm_idx]
            
            # add small perturbation to create a new sample
            alpha_perturb = uniform.rvs(loc=-0.05, scale=0.1)  # +- 0.05
            lmbd_perturb = uniform.rvs(loc=-0.05, scale=0.1)   # +- 0.05
            
            alpha = np.clip(hm_sample['alpha'] + alpha_perturb, alpha_min, alpha_max)
            lmbd = np.clip(hm_sample['lmbd'] + lmbd_perturb, lmbd_min, lmbd_max)
            
            samples.append({"alpha": alpha, "lmbd": lmbd})
            
        # run simulations and calculate distances
        results = []
        for sample in tqdm(samples):
            sim_data = self.simulator_function(sample["alpha"], sample["lmbd"])
            distance = self.calculate_distance(sim_data)
            
            result_dict = {
                "alpha": sample["alpha"],
                "lmbd": sample["lmbd"],
                "distance": distance
            }
            
            if sim_data is not None:
                result_dict["trajectory"] = sim_data["H1N1"].copy()
                
            results.append(result_dict)
            
        results_df = pd.DataFrame(results)
        
        if adaptive: # by acceptance ratio
            n_accept = max(1, int(len(results_df) * accept_ratio))
            accepted = results_df.nsmallest(n_accept, "distance")
        else: # FIXED THRESHOLD: select only parameters with distance < epsilon
            if len(accepted) == 0:
                print(f"No samples accepted at epsilon = {epsilon}. Taking 10% of the best samples.")
                accepted = results_df.nsmallest(max(1, int(n_samples * 0.1)), "distance")
            else:
                accepted = results_df[results_df["distance"] < epsilon]
    
            #accepted = results_df.nsmallest(max(1, int(n_samples * 0.1)), "distance")
        
        return accepted
    
    def annealing_abc(self, n_samples=100, initial_epsilon=1e-3, final_epsilon=1e-5, cooling_steps=3, adaptive=True, accept_ratio=0.01):
        """ ABC simulated annealing using history matching results"""
        if self.hm_results is None:
            raise ValueError("One must run history_matching before annealing_abc")
            
        print(f"Running ABC annealing with {cooling_steps} cooling steps...")
        
        # epsilon values for each step
        epsilons = np.geomspace(initial_epsilon, final_epsilon, cooling_steps)
        
        # parameter bounds from history matching
        alpha_min = self.hm_results['alpha'].min()
        alpha_max = self.hm_results['alpha'].max()
        lmbd_min = self.hm_results['lmbd'].min()
        lmbd_max = self.hm_results['lmbd'].max()
        
        # initial samples from history matching results
        current_samples = []
        for _ in range(n_samples):
            # randomly select a parameter set from history matching results
            hm_idx = np.random.randint(0, len(self.hm_results))
            hm_sample = self.hm_results.iloc[hm_idx]
            
            current_samples.append({
                "alpha": hm_sample["alpha"], 
                "lmbd": hm_sample["lmbd"]
            })
    
        # annealing process
        for step, epsilon in enumerate(epsilons):
            print(f"Annealing step {step+1}/{cooling_steps}, epsilon = {epsilon:.6f}")
            
            # evaluate current samples
            results = []
            for sample in tqdm(current_samples):
                sim_data = self.simulator_function(sample["alpha"], sample["lmbd"])
                distance = self.calculate_distance(sim_data)
                
                result_dict = {
                    "alpha": sample["alpha"],
                    "lmbd": sample["lmbd"],
                    "distance": distance
                }
                
                if sim_data is not None:
                    result_dict["trajectory"] = sim_data["H1N1"].copy()
                
                results.append(result_dict)
            
            # Filter accepted samples
            results_df = pd.DataFrame(results)
            #accepted = results_df[results_df["distance"] < epsilon]
            
            #if len(accepted) == 0:
                #print(f"No samples accepted at epsilon = {epsilon}. Taking best samples.")
                #accepted = results_df.nsmallest(max(1, int(n_samples * 0.1)), "distance")

            if adaptive: # by acceptance ratio
                n_accept = max(1, int(len(results_df) * accept_ratio))
                accepted = results_df.nsmallest(n_accept, "distance")
            else: # FIXED THRESHOLD: select only parameters with distance < epsilon
                if len(accepted) == 0:
                    print(f"No samples accepted at epsilon = {epsilon}. Taking 10% of the best samples.")
                    accepted = results_df.nsmallest(max(1, int(n_samples * 0.1)), "distance")
                else:
                    accepted = results_df[results_df["distance"] < epsilon]
    
            
            # new samples for next iteration
            if step < cooling_steps - 1:
                # mean of accepted parameters
                alpha_mean = accepted["alpha"].mean()
                lmbd_mean = accepted["lmbd"].mean()
                
                # ensure the variances are strictly positive
                alpha_var = accepted["alpha"].var()
                if np.isnan(alpha_var) or alpha_var <= 1e-6:
                    alpha_var = 1e-4  # minimum value
                
                lmbd_var = accepted["lmbd"].var()
                if np.isnan(lmbd_var) or lmbd_var <= 1e-6:
                    lmbd_var = 1e-4  # minimum value
                
                # new samples from normal distribution around accepted values
                current_samples = []
                for _ in range(n_samples):
                    try:
                        alpha = norm.rvs(loc=alpha_mean, scale=np.sqrt(alpha_var))
                        lmbd = norm.rvs(loc=lmbd_mean, scale=np.sqrt(lmbd_var))
                    except ValueError:
                        # fallback if there's an error
                        perturb_scale = 0.05  # 5% perturbation
                        alpha = alpha_mean + np.random.uniform(-perturb_scale, perturb_scale) * (alpha_max - alpha_min)
                        lmbd = lmbd_mean + np.random.uniform(-perturb_scale, perturb_scale) * (lmbd_max - lmbd_min)
                    
                    # ensure parameters are within bounds from history matching
                    alpha = max(alpha_min, min(alpha, alpha_max))
                    lmbd = max(lmbd_min, min(lmbd, lmbd_max))
                    
                    current_samples.append({
                        "alpha": alpha,
                        "lmbd": lmbd
                    })
        
        return accepted
    
    def smc_abc(self, n_particles=100, n_populations=3, initial_epsilon=1e-3, final_epsilon=1e-5, adaptive=True, accept_ratio=0.01):
        """ABC Sequential Monte Carlo using history matching results"""
        if self.hm_results is None:
            raise ValueError("One must run history_matching before smc_abc")
            
        print(f"Running ABC-SMC with {n_populations} populations...")
        
        # epsilon sequence
        epsilons = np.geomspace(initial_epsilon, final_epsilon, n_populations)
        
        # parameter bounds from history matching
        alpha_min = self.hm_results['alpha'].min()
        alpha_max = self.hm_results['alpha'].max()
        lmbd_min = self.hm_results['lmbd'].min()
        lmbd_max = self.hm_results['lmbd'].max()
        
        # first population from history matching results
        particles = []
        for _ in range(n_particles):
            # randomly select a parameter set from history matching
            hm_idx = np.random.randint(0, len(self.hm_results))
            hm_sample = self.hm_results.iloc[hm_idx]
            
            particles.append({
                "alpha": hm_sample["alpha"], 
                "lmbd": hm_sample["lmbd"]
            })
        
        # equal weights for first population
        weights = np.ones(n_particles) / n_particles
        
        # SMC process
        for t in range(n_populations):
            epsilon = epsilons[t]
            print(f"SMC Population {t+1}/{n_populations}, epsilon = {epsilon:.6f}")
            
            # evaluate particles and calculate distances
            distances = []
            trajectories = []
            for particle in tqdm(particles):
                sim_data = self.simulator_function(particle["alpha"], particle["lmbd"])
                distance = self.calculate_distance(sim_data)
                distances.append(distance)
                if sim_data is not None:
                    trajectories.append(sim_data["H1N1"].copy())
                else:
                    trajectories.append(None)
            
            # update weights based on epsilon
            new_weights = np.zeros(n_particles)
            for i, distance in enumerate(distances):
                if distance < epsilon:
                    new_weights[i] = weights[i]
            
            # normalize weights
            if np.sum(new_weights) > 0:
                new_weights = new_weights / np.sum(new_weights)
            else:
                print(f"No particles accepted at epsilon = {epsilon}. Taking best particles.")
                sorted_indices = np.argsort(distances)
                for i in range(max(1, int(n_particles * 0.1))):
                    new_weights[sorted_indices[i]] = 1.0
                new_weights = new_weights / np.sum(new_weights)
            
            # calculate effective sample size
            ESS = 1.0 / np.sum(new_weights**2)
            print(f"Effective sample size: {ESS:.2f}")
            
            # resample if needed
            if ESS < n_particles / 2 or t == n_populations - 1:
                # resample based on weights
                indices = np.random.choice(n_particles, size=n_particles, p=new_weights)
                resampled_particles = [particles[i] for i in indices]
                resampled_trajectories = [trajectories[i] for i in indices]
                particles = resampled_particles
                trajectories = resampled_trajectories
                weights = np.ones(n_particles) / n_particles
            else:
                weights = new_weights
            
            # if not final iteration, perturb particles
            if t < n_populations - 1:
                # calculate kernel covariance
                alpha_values = np.array([p["alpha"] for p in particles])
                lmbd_values = np.array([p["lmbd"] for p in particles])
                
                params = np.vstack([alpha_values, lmbd_values]).T
                cov = np.cov(params.T) + np.eye(2) * 1e-6  # add small diagonal for stability
                
                # perturb particles
                new_particles = []
                for i, particle in enumerate(particles):
                    accepted = False
                    attempts = 0
                    while not accepted and attempts < 100:
                        attempts += 1
                        # multivariate normal perturbation
                        perturbation = multivariate_normal.rvs(mean=[0, 0], cov=cov)
                        new_alpha = particle["alpha"] + perturbation[0]
                        new_lmbd = particle["lmbd"] + perturbation[1]
                        
                        # ensure being within history matching bounds
                        alpha_in_bounds = alpha_min <= new_alpha <= alpha_max
                        lmbd_in_bounds = lmbd_min <= new_lmbd <= lmbd_max
                        
                        if alpha_in_bounds and lmbd_in_bounds:
                            accepted = True
                            new_particles.append({"alpha": new_alpha, "lmbd": new_lmbd})
                    
                    # if couldn't generate valid particle after max attempts, keep original
                    if not accepted:
                        new_particles.append(particle)
                
                particles = new_particles
        
        # return final particles and weights
        final_results = []
        for i, particle in enumerate(particles):
            final_results.append({
                "alpha": particle["alpha"],
                "lmbd": particle["lmbd"],
                "weight": weights[i],
                "distance": distances[i],
                "trajectory": trajectories[i]
            })
        
        return pd.DataFrame(final_results)
    
    def plot_results(self, results_df, method_name="ABC", n_trajectories=5):
        """Draw the accepted parameters and compare trajectories with the data.

        Left panel: scatter of alpha against lmbd (marker size scaled by the
        'weight' column when present). Right panel: the first
        ``n_trajectories`` stored trajectories plus the observed 'H1N1' curve.
        When ``results_df`` is empty a single figure with a notice is returned.

        Returns the matplotlib figure.
        """
        if len(results_df) == 0:
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.text(0.5, 0.5, f"No accepted parameter sets for {method_name}",
                    horizontalalignment='center', verticalalignment='center')
            return fig

        fig, (ax_params, ax_series) = plt.subplots(1, 2, figsize=(15, 6))

        # 1. Parameter scatter; SMC results carry weights that set marker size
        scatter_kwargs = {"alpha": 0.6}
        if 'weight' in results_df.columns:
            scatter_kwargs["s"] = results_df["weight"]*100
        ax_params.scatter(results_df["alpha"], results_df["lmbd"], **scatter_kwargs)

        ax_params.set_title(f"Parameter posterior - {method_name}")
        ax_params.set_xlabel("Alpha")
        ax_params.set_ylabel("Lambda")

        # 2. Simulated trajectories stored with the results
        for i in range(min(n_trajectories, len(results_df))):
            traj = results_df.iloc[i].get("trajectory")
            if traj is None:
                continue
            ax_series.plot(traj, alpha=0.6, label=f"Sim {i+1}")

        # observed data on top, dashed black
        ax_series.plot(self.observed_data["H1N1"], color="black", linestyle="--",
                       linewidth=2, label="Observed")

        ax_series.set_title("Time series comparison")
        ax_series.set_xlabel("Time")
        ax_series.set_ylabel("Infected")
        ax_series.legend()

        plt.tight_layout()
        return fig

In [None]:
def generate_synthetic_data(alpha=0.78, lmbd=0.4, days=range(1, 100), data_folder='chelyabinsk_10'):
    """
    Synthetic epidemic data with known parameters as a 'real data'

    Runs a single ``Main`` simulation with the same ``alpha`` for the three
    strains ('H1N1', 'H3N2', 'B'), four unvaccinated age groups and
    ``with_seirb=True``, then reads the resulting prevalence table.

    Parameters
    ----------
    alpha : value used for every strain.
    lmbd : value passed to ``Main`` as ``lmbd``.
    days : only its length is used; the run covers ``[1, len(days)]``.
    data_folder : folder passed to ``runs_params``; defaults to the previously
        hard-coded 'chelyabinsk_10' so existing calls behave the same.

    Returns
    -------
    pandas.DataFrame read from ``prevalence_seed_0.csv`` (tab-separated) in
    ``pool.results_dir``.
    """
    pool = Main(
        strains_keys=['H1N1', 'H3N2', 'B'],
        infected_init=[10, 0, 0],
        alpha=[alpha, alpha, alpha],
        lmbd=lmbd
    )

    pool.runs_params(
        num_runs=1,
        days=[1, len(days)],
        data_folder=data_folder
    )

    pool.age_groups_params(
        age_groups=['0-10', '11-17', '18-59', '60-150'],
        vaccined_fraction=[0, 0, 0, 0]
    )

    pool.start(with_seirb=True)

    results_path = os.path.join(pool.results_dir, "prevalence_seed_0.csv")
    data = pd.read_csv(results_path, sep='\t')

    return data

In [None]:
# Prior ranges as (min, max) per parameter; history_matching draws each
# parameter uniformly from its range.
prior_ranges = {
    "alpha": (0.1, 0.9),   # susceptibility
    "lmbd": (0.1, 0.9)     # transmissibility
}

In [None]:
# Ground truth for the calibration experiment: simulate once with known
# parameters and treat the output as the "observed" data to be recovered.
true_alpha = 0.78
true_lmbd = 0.4
observed_data = generate_synthetic_data(alpha=true_alpha, lmbd=true_lmbd)

In [None]:
# Create the calibration object; the constructor loads and preprocesses the
# input data from the default data_path ("./chelyabinsk_10/").
abm = ABM_tuned(observed_data)

In [None]:
# Run history matching: 500 uniform draws from the priors, keeping those whose
# mean squared error is below epsilon (adaptive defaults to False here).
# The accepted set is also stored on abm for the ABC methods below.
hm_results = abm.history_matching(prior_ranges, n_samples=500, epsilon=3)

In [None]:
# Number of accepted parameter sets, then a plot showing all of their
# trajectories (n_trajectories is set to the full count).
print(len(hm_results))
abm.plot_results(hm_results, "History Matching", n_trajectories=len(hm_results))

In [None]:
# Run ABC rejection on candidates perturbed around the history-matching set.
# NOTE: adaptive defaults to True, so epsilon is not used for selection here;
# the best accept_ratio (default 0.01) share of the 500 samples is kept.
rejection_results = abm.rejection_abc(n_samples=500, epsilon=1e-1)

In [None]:
# Number of accepted samples, then a plot showing all of their trajectories.
print(len(rejection_results))
abm.plot_results(rejection_results, "ABC Rejection", len(rejection_results))

In [None]:
# Run ABC annealing with 5 geometrically spaced tolerance steps.
# NOTE: adaptive defaults to True, so the epsilon schedule is only printed;
# each step keeps the best accept_ratio (default 0.01) share of the samples.
annealing_results = abm.annealing_abc(n_samples=500, initial_epsilon=1e-1, final_epsilon=1e-5, cooling_steps=5)

In [None]:
# Number of samples accepted at the last step, then a plot of all of them.
print(len(annealing_results))
abm.plot_results(annealing_results, "ABC Annealing", len(annealing_results))

In [None]:
# Run ABC-SMC with 500 particles over 5 populations. Unlike the two calls
# above, the epsilon schedule is what decides which particles keep their weight.
smc_results = abm.smc_abc(n_particles=500, n_populations=5, initial_epsilon=1e-1, final_epsilon=1e-5)

In [None]:
# Number of final particles, then a plot of all of them; the 'weight' column
# in the SMC result sets the marker size in the scatter panel.
print(len(smc_results))
abm.plot_results(smc_results, "ABC SMC", len(smc_results))