In [1]:
import sys
sys.path.append("../../src/models")

In [2]:
import numpy as np
from actions import city_restrictions, costs

In [3]:
from simulate_pandemic import init_infection, spread_infection, lambda_leak_expose, update_population
from simulate_pandemic import main

In [4]:
from numpy.random import default_rng
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from joblib import Parallel, delayed

## Generate simulation data

In [5]:
def make_individual_df(week, sim, action, data):
    """Build the per-individual frame for a single simulation snapshot.

    `data` is loaded as a two-column frame (`id`, `state`); the snapshot's
    `sim`, `week` and `action` values are then broadcast onto every row as
    the `simulation`, `week` and `action` columns (in that order).
    """
    snapshot = pd.DataFrame(data, columns=['id', 'state'])
    return snapshot.assign(simulation=sim, week=week, action=action)

In [6]:
def make_value_counts_df(week, sim, action, data):
    """Summarise one snapshot as a single row of per-state head counts.

    The second column of `data` (`data[:, 1]`) is counted per distinct value;
    the counts become columns of a one-row frame, tagged with `simulation`,
    `week` and `action`. Numeric state codes are renamed to readable labels.
    Only states that actually occur in `data` get a column.
    """
    state_labels = {
        -1: 'removed',
        0: 'susceptible',
        1: 'exposed',
        2: 'infected',
        3: 'hospitalized',
    }

    state_counts = pd.Series(data[:, 1]).value_counts()
    counts_row = pd.DataFrame(state_counts).T
    counts_row['simulation'] = sim
    counts_row['week'] = week
    counts_row['action'] = action

    return counts_row.rename(columns=state_labels)

In [7]:
def simulate_and_save(policy_number, policy, n_sims, step_size=7, folder='generated_sims/'):
    """Run `n_sims` simulations of one policy and return weekly per-individual frames.

    Parameters
    ----------
    policy_number : identifier of the policy; used as the prefix of the
        simulation name (`f'{policy_number}_{sim}'`).
    policy : sequence of actions passed to `main`. The simulation length is
        `step_size * len(policy)` days.
    n_sims : number of independent simulations to run (numbered from 1).
    step_size : number of days passed to `main` as `step_size`.
        NOTE(review): assumed to be the number of days each policy entry stays
        in force — confirm against `simulate_pandemic.main`.
    folder : currently unused (writing each simulation to disk is disabled);
        kept so existing callers keep working.

    Returns
    -------
    list of DataFrame, one per simulation, each the concatenation of the
    weekly snapshots produced by `make_individual_df`.

    Notes
    -----
    Reads the notebook-level globals `gpickle_path` and `p_r`.
    """
    total_days = step_size * len(policy)
    # Integer arithmetic: the float form (step_size / 7) * len(policy) can land
    # just below a whole number and lose a week when truncated with int().
    n_weeks = total_days // 7

    dfs = []
    for sim in range(1, n_sims + 1):
        sim_name = f'{policy_number}_{sim}'

        # The second return value (population matrix) is not needed here.
        data, _ = main(gpickle_path=gpickle_path,
                       p_r=p_r,
                       policy=policy,
                       disable_tqdm=True,
                       days=total_days,
                       step_size=step_size)

        # Week w is sampled at day 7*w; the action attached to it is the policy
        # entry covering that day, i.e. policy[(7*w) // step_size]. For
        # step_size == 7 this is simply policy[w]. Pairing snapshots and
        # actions positionally would mislabel weeks (and drop the tail of the
        # simulation) for any other step size.
        weekly_frames = [
            make_individual_df(week, sim_name, policy[(week * 7) // step_size], data[week * 7])
            for week in range(n_weeks)
        ]
        dfs.append(pd.concat(weekly_frames))
    return dfs

In [8]:
# Fixed seed so the randomly sampled policies are reproducible across
# Restart & Run All; change SEED to draw a different set of policies.
SEED = 42
rng = default_rng(SEED)

# Graph file handed to the simulator (`main(gpickle_path=...)`).
gpickle_path = "../../data/processed/SP_multiGraph_Job_Edu_Level.gpickle"

# Per-relation parameters handed to the simulator as `p_r`; every entry is
# expressed as a fraction of the household value `prhome`.
prhome = 0.06
p_r = {
    'home'    :  prhome,
    'neighbor':  .1*prhome,
    'work'    :  .1*prhome,
    'school'  :  .15*prhome,
}

In [9]:
# Split the `costs` mapping into two aligned sequences: action names and,
# in the same order, their cost values as a NumPy array.
costs_keys = list(costs)
costs_values = np.array([costs[action_name] for action_name in costs_keys])


### Cost Weighted

In [10]:
# Sampling weights proportional to (1 - cost), so cheaper actions are drawn
# more often.
p = (1 - costs_values) / (1 - costs_values).sum()

# 16 random policies of 364 // 14 = 26 actions each.
# NOTE(review): 364 / 14 suggests 14-day steps, but simulate_and_save is later
# called with its default step_size=7 — confirm the intended horizon.
policy = [
    rng.choice(costs_keys, size=364 // 14, replace=True, p=p)
    for _ in range(16)
]

# Every sampled policy must be distinct.
assert len({tuple(sampled) for sampled in policy}) == len(policy)

In [11]:
# One simulation per policy, one joblib worker per policy.
# The tqdm bar wraps the task generator, so it appears to track dispatching
# of the jobs rather than their completion.
n_sims = 1
res = Parallel(n_jobs=16)(
    delayed(simulate_and_save)(policy_number, sampled_policy, n_sims)
    for policy_number, sampled_policy in tqdm(enumerate(policy), total=len(policy))
)

100%|██████████| 16/16 [00:00<00:00, 101.28it/s]


In [12]:
# `res` holds one list of frames per policy; flatten and stack them all.
res_df = pd.concat(
    [simulation_df for policy_dfs in res for simulation_df in policy_dfs]
)

In [13]:
# Sanity check: (rows, columns) of the stacked simulation results.
res_df.shape

(23084672, 5)

In [14]:
# Persist the raw simulation results next to the notebook.
res_df.to_parquet("simulation_results_dataset.parquet")

## Add individuals data

In [15]:
# Per-individual attributes (home/work/school zones, age, clusters, ...).
individual_infos = pd.read_feather("../../data/processed/clusterized_df.feather")

In [17]:
# Preview of the individuals table (pandas truncates the display).
individual_infos

Unnamed: 0,home,school,work,id,work_x,work_y,school_x,school_y,home_x,home_y,...,idade,criteriobr,renda_fa,education,job_level,studies,private_healthcare,home_cluster,work_cluster,school_cluster
0,1.0,,3.0,1,333104.0,7394476.0,,,333743.0,7394463.0,...,59.0,4.0,2732.575910,3.0,2.0,1.0,False,1_1,3_0,
1,1.0,84.0,82.0,2,327503.0,7392159.0,329431.0,7395939.0,333743.0,7394463.0,...,21.0,4.0,2732.575910,4.0,2.0,5.0,False,1_1,82_16,84_1_5
2,1.0,,1.0,3,333453.0,7394501.0,,,333814.0,7394428.0,...,37.0,5.0,3200.000000,4.0,2.0,1.0,False,1_4,1_7,
3,1.0,,1.0,4,333539.0,7394387.0,,,333814.0,7394428.0,...,19.0,5.0,3200.000000,3.0,2.0,1.0,False,1_4,1_7,
4,1.0,,26.0,5,332344.0,7393317.0,,,333814.0,7394428.0,...,18.0,5.0,3200.000000,3.0,2.0,1.0,False,1_4,26_1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55487,342.0,,90.0,55488,327737.0,7396374.0,,,323322.0,7392038.0,...,27.0,1.0,9461.630458,5.0,4.0,1.0,True,342_10,90_0,
55488,342.0,,341.0,55489,323653.0,7391801.0,,,322857.0,7390956.0,...,52.0,1.0,9461.630458,5.0,2.0,1.0,True,342_6,341_0,
55489,342.0,,341.0,55490,323653.0,7391801.0,,,322857.0,7390956.0,...,26.0,1.0,9461.630458,5.0,2.0,1.0,True,342_6,341_0,
55490,342.0,,342.0,55491,323034.0,7390476.0,,,323034.0,7390476.0,...,28.0,3.0,3000.000000,5.0,2.0,1.0,False,342_3,342_1,


In [36]:
# Attach each individual's static attributes to every simulated row.
# The selected attribute columns (including the `id` join key) are cast to
# float before joining.
dataset_info = res_df.merge(
    individual_infos[['id', 'home', 'school', 'work', 'idade', 'home_id']].astype(float),
    on='id',
).copy()

## Creating Target (State two weeks from now)

In [37]:
# Target = the same individual's state at the next snapshot of the same
# simulation. shift(-1) only means "next week" if rows are in ascending week
# order inside each (id, simulation) group, so sort explicitly (stable, on the
# needed columns only) instead of relying on the row order left by the merge.
# The result is aligned back to dataset_info by index on assignment.
# NOTE(review): snapshots are weekly, so this is one week ahead, while the
# section header says "two weeks" — confirm which is intended.
dataset_info['target'] = (
    dataset_info[['id', 'simulation', 'week', 'state']]
        .sort_values('week', kind='stable')
        .groupby(['id', 'simulation'])['state']
        .shift(-1)
)

In [38]:
# Share of susceptible (state == 0) rows per week, averaged over all weeks.
dataset_info['state'].eq(0).groupby(dataset_info['week']).mean().mean()

0.8162256756344644

### Discarding weeks 17 onwards, due to 30% removed

In [39]:
# Keep weeks 0-16 only. Take an explicit copy: this frame is modified in later
# cells, and writing into a filtered slice triggers chained-assignment
# (SettingWithCopy) warnings.
dataset_info = dataset_info[dataset_info['week'] < 17].copy()

In [40]:
# Number of rows without a target; the recorded output after the week filter is 0.
dataset_info.target.isna().sum()

0

In [64]:
# Spot-check a single (individual, week, simulation) row of dataset_info.
dataset_info.query("id == 1936 and week == 12 and simulation == '0_1'")

Unnamed: 0,id,state,simulation,week,action,home,school,work,idade,home_id,target
804972,1936,0,0_1,12,Light Quarantine,17.0,,,37.0,170031.0,0


In [41]:
# The target has no missing values at this point, so it can be stored as int.
dataset_info = dataset_info.astype({'target': int})

In [42]:
def create_columns_map(relation_type):
    """Return the column-rename mapping for one relation's ratio table.

    `'week'` is renamed to `'actual_week'`, and each numeric state code
    (-1, 0, 1, 2, 3) is renamed to `'<relation_type>_<state label>'`.
    """
    state_labels = {
        -1: 'removed',
        0: 'susceptible',
        1: 'exposed',
        2: 'infected',
        3: 'hospitalized',
    }

    columns_map = {'week': 'actual_week'}
    for state_code, label in state_labels.items():
        columns_map[state_code] = f'{relation_type}_{label}'
    return columns_map

In [43]:
resulting_ratios = {}

# For every relation (home zone, school zone, work zone, household id) build a
# table with the share of each `target` state inside each
# (week, simulation, group), one column per state.
for relation_type in ['home', 'school', 'work', 'home_id']:
    print(relation_type)

    # Long format: one row per (week, simulation, group, target) with its share.
    value_counts_week = (
        dataset_info
            .groupby(['week', 'simulation', relation_type])['target']
            .value_counts(normalize=True)
            .reset_index(name='ratio')
    )
    print("value counts done")

    # Wide format: one column per target state; states absent from a group get 0.
    pivot = (
        value_counts_week
            .pivot_table(index=['week', 'simulation', relation_type],
                         columns='target', values='ratio', fill_value=0)
            .reset_index()
            .rename(columns=create_columns_map(relation_type))
    )
    print("Pivot Done")

    # The shares were computed from week w's target, so they are re-keyed to
    # week w + 1 before being joined back as features.
    pivot = pivot.assign(week=pivot['actual_week'] + 1)

    resulting_ratios[relation_type] = pivot

home
value counts done
Pivot Done
school
value counts done
Pivot Done
work
value counts done
Pivot Done
home_id
value counts done
Pivot Done


In [65]:
# Modelling rows: individuals that have a target and are currently susceptible.
model_dataset = dataset_info.loc[
    dataset_info['target'].notna() & dataset_info['state'].eq(0)
].copy()

In [66]:
# Fraction of dataset_info rows kept for modelling.
len(model_dataset) / len(dataset_info)

0.9156939951068729

In [67]:
# Spot-check that the same example row is still present in model_dataset.
model_dataset.query("id == 1936 and week == 12 and simulation == '0_1'")

Unnamed: 0,id,state,simulation,week,action,home,school,work,idade,home_id,target
804972,1936,0,0_1,12,Light Quarantine,17.0,,,37.0,170031.0,0


In [46]:
# Row count before the feature joins, checked again afterwards.
before_shape = len(model_dataset)

In [69]:
# Left-join the per-group state shares for each relation, in the order
# home -> work -> school -> household, so no modelling row is dropped.
for relation_type in ['home', 'work', 'school', 'home_id']:
    model_dataset = model_dataset.merge(
        resulting_ratios[relation_type].drop("actual_week", axis=1),
        on=['week', 'simulation', relation_type],
        how='left',
    )

In [48]:
# The left joins must not have duplicated or dropped any row.
assert len(model_dataset) == before_shape

In [70]:
# Encode the action name as an integer from 0 ('Unrestricted') to
# 4 ('Lockdown'), following the order of the list below.
model_dataset['numerical_actions'] = model_dataset['action'].map({
    action_name: level
    for level, action_name in enumerate([
        'Unrestricted',
        'Social Distancing',
        'Light Quarantine',
        'Hard Quarantine',
        'Lockdown',
    ])
})

Should we consider only target state 1 (exposed), or any target state other than 0 (susceptible)? For example, a person who ends up hospitalized should arguably also count as a positive (1).

In [71]:
# Positive class (1) = any target state other than 0; otherwise 0.
model_dataset['binary_target'] = model_dataset['target'].ne(0).astype(int)

In [72]:
# Time-based split: weeks before 11 are 'train', week 11 onwards is 'test'.
# Vectorised with np.where instead of a Python-level map over every row.
model_dataset['train_test'] = np.where(model_dataset['week'] < 11, 'train', 'test')

In [73]:
# Positive rate of binary_target per (split, week), averaged over those groups.
model_dataset.groupby(['train_test', 'week'])['binary_target'].mean().mean()

0.01977414714175631

In [75]:
# Persist the modelling dataset with a fresh 0..n-1 index.
model_dataset.reset_index(drop=True).to_parquet("model_dataset.parquet")

In [76]:
# Final (rows, columns) of the modelling dataset.
model_dataset.shape

(13821324, 34)