In [1]:
import sys
sys.path.append("../../src/models")

In [2]:
import numpy as np
from actions import city_restrictions, costs

In [3]:
from simulate_pandemic import init_infection, spread_infection, lambda_leak_expose, update_population
from simulate_pandemic import main

In [4]:
from numpy.random import default_rng
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from joblib import Parallel, delayed

## Generate simulation data

In [5]:
def make_individual_df(week, sim, action, data):
    """Turn one snapshot of (id, state) pairs into a labelled DataFrame.

    Parameters
    ----------
    week : value stored in the ``week`` column of every row.
    sim : value stored in the ``simulation`` column of every row.
    action : value stored in the ``action`` column of every row.
    data : two-column array-like; its columns become ``id`` and ``state``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``state``, ``simulation``, ``week``, ``action``.
    """
    frame = pd.DataFrame(data, columns=['id', 'state'])
    # Broadcast the three scalar labels onto every row, in a fixed column order.
    for column, value in (('simulation', sim), ('week', week), ('action', action)):
        frame[column] = value

    return frame

In [6]:
def make_value_counts_df(week, sim, action, data):
    """Summarise one snapshot as a single row of per-state head counts.

    Parameters
    ----------
    week : value stored in the ``week`` column.
    sim : value stored in the ``simulation`` column.
    action : value stored in the ``action`` column.
    data : 2-D array; column index 1 holds each individual's state code.

    Returns
    -------
    pandas.DataFrame
        One row with a count column for every known state (``removed``,
        ``susceptible``, ``exposed``, ``infected``, ``hospitalized``) plus
        ``simulation``, ``week`` and ``action``. A known state with nobody in
        it is reported as 0. Any state code outside the known set keeps its
        raw code as the column label.
    """
    state_names = {
        -1: 'removed',
        0: 'susceptible',
        1: 'exposed',
        2: 'infected',
        3: 'hospitalized',
    }

    counts = pd.Series(data[:, 1]).value_counts()
    # value_counts() omits states nobody is in, so different weeks would yield
    # different columns (and NaN instead of 0 once the rows are concatenated).
    # Re-add every known state with a zero count; unknown codes are preserved.
    all_states = counts.index.union(pd.Index(list(state_names)))
    counts = counts.reindex(all_states, fill_value=0)

    df = counts.to_frame().T.rename(columns=state_names)
    df['simulation'] = sim
    df['week'] = week
    df['action'] = action

    return df

In [7]:
def simulate_and_save(policy_number, policy, n_sims, step_size=7, folder='generated_sims/'):
    """Run ``n_sims`` simulations of one policy and return weekly per-individual frames.

    Relies on the notebook-level globals ``gpickle_path`` and ``p_r`` and on
    ``main`` / ``make_individual_df``; all must be defined before this is called.

    Parameters
    ----------
    policy_number : label used as the prefix of each simulation name
        (``f'{policy_number}_{sim}'``).
    policy : indexable sequence of actions, forwarded to ``main`` together with
        ``days=step_size * len(policy)``.
    n_sims : int
        Number of simulations to run; they are numbered from 1.
    step_size : int, default 7
        Forwarded to ``main``. Presumably the number of days each action in
        ``policy`` stays in force -- TODO confirm against ``main``.
    folder : str
        Currently unused: nothing is written to disk. Kept so existing callers
        that pass it keep working.

    Returns
    -------
    list of pandas.DataFrame
        One frame per simulation, stacking a snapshot taken every 7 days.

    Raises
    ------
    ValueError
        From ``pd.concat`` when the simulated horizon is shorter than one week.
    """
    n_days = step_size * len(policy)
    # Integer arithmetic: avoids float rounding when step_size is not a multiple of 7.
    n_weeks = n_days // 7

    dfs = []
    for sim in range(1, n_sims + 1):
        sim_name = f'{policy_number}_{sim}'

        data, _ = main(gpickle_path=gpickle_path,
                       p_r=p_r,
                       policy=policy,
                       disable_tqdm=True,
                       days=n_days,
                       step_size=step_size)

        weekly_frames = []
        for week in range(n_weeks):
            day = week * 7
            # Label the snapshot with the action whose step covers this day
            # (assuming policy[k] spans days [k*step_size, (k+1)*step_size)).
            # For step_size == 7 this is simply policy[week]; for other step
            # sizes it keeps week and action aligned and covers every week.
            action = policy[day // step_size]
            weekly_frames.append(make_individual_df(week, sim_name, action, data[day]))

        dfs.append(pd.concat(weekly_frames))
    return dfs

In [8]:
# Fixed seed so the sampled policies are reproducible under Restart & Run All.
# NOTE(review): this only seeds `rng`; whether `main` draws from its own
# random source is not visible in this notebook.
SEED = 42
rng = default_rng(SEED)

# Pickled graph file handed to `main` as `gpickle_path`.
gpickle_path = "../../data/processed/SP_multiGraph_Job_Edu_Level.gpickle"

# Values handed to `main` as `p_r`, keyed by relation type; every non-home
# value is expressed as a fraction of the home value.
# Presumably per-contact transmission probabilities -- TODO confirm against `main`.
prhome = 0.06
p_r = {
    'home'    :  prhome,
    'neighbor':  .1*prhome,
    'work'    :  .1*prhome,
    'school'  :  .15*prhome,
}

In [9]:
# Split the `costs` mapping into parallel sequences: action names and their
# costs (as a NumPy array so they can be used in vectorised arithmetic).
costs_keys = [*costs.keys()]
costs_values = np.array([*costs.values()])


### Cost Weighted

In [10]:
# Sampling weight of each action is proportional to (1 - cost), normalised to sum to 1.
complement_of_cost = 1 - costs_values
p = complement_of_cost / complement_of_cost.sum()

# Draw 16 policies, each a sequence of int(364/14) = 26 actions sampled with replacement.
# NOTE(review): 364/14 suggests two-week steps over one year, but
# simulate_and_save is later called with its default step_size=7 -- confirm
# which horizon is intended.
policy = [
    rng.choice(costs_keys, size=int(364/14), replace=True, p=p)
    for _draw in range(16)
]

# Every sampled policy must be distinct.
distinct_policies = {tuple(actions) for actions in policy}
assert len(distinct_policies) == len(policy)

In [11]:
# One joblib task per sampled policy; each task returns a list of DataFrames.
# NOTE: tqdm wraps the task generator, so the bar advances as tasks are handed
# to joblib rather than as simulations finish.
n_sims=1
tasks = (
    delayed(simulate_and_save)(i, pol, n_sims)
    for i, pol in tqdm(enumerate(policy), total=len(policy))
)
res = Parallel(n_jobs=16)(tasks)

100%|██████████| 16/16 [00:00<00:00, 101.29it/s]


In [12]:
# `res` holds one list of frames per policy; flatten and stack them into one frame.
res_df = pd.concat([sim_frame for policy_frames in res for sim_frame in policy_frames])

In [13]:
# Sanity check: (rows, columns) of the stacked simulation results.
res_df.shape

(23084672, 5)

In [42]:
# Persist the raw stacked simulation results.
# NOTE(review): this cell's execution count (In [42]) is out of order with its
# position in the notebook -- re-check with Restart & Run All.
res_df.to_parquet("simulation_results_dataset.parquet")

## Add individuals data

In [14]:
# Load the per-individual attribute table that is joined onto the simulation rows below.
individual_infos = pd.read_feather("../../data/interim/work_school_home_sp_esc.feather")

In [15]:
# Derive the join key from the row index, offset by one.
# NOTE(review): presumably simulation ids are 1-based row positions of this table -- TODO confirm.
individual_infos['id'] = individual_infos.index + 1

In [16]:
# Attach each individual's attributes to every simulation row (inner join on `id`;
# simulation rows without a matching individual are dropped).
individual_features = individual_infos[
    ['id', 'home', 'school', 'work', 'idade', 'home_id']
].astype(float)
dataset_info = res_df.merge(individual_features, on='id').copy()

## Creating Target (state at the next weekly snapshot)

In [17]:
# Target = the same individual's state in the following row of the same simulation.
# This relies on rows being ordered by week within each (id, simulation) group.
state_per_person_and_run = dataset_info.groupby(['id', 'simulation'])['state']
dataset_info['target'] = state_per_person_and_run.shift(-1)

In [18]:
# Share of rows in state -1 (removed) for each week.
dataset_info['state'].eq(-1).groupby(dataset_info['week']).mean()

week
0     0.000000
1     0.000091
2     0.000641
3     0.001653
4     0.003345
5     0.005817
6     0.009937
7     0.015996
8     0.024897
9     0.036642
10    0.051450
11    0.069498
12    0.093423
13    0.124035
14    0.158909
15    0.195266
16    0.231863
17    0.268399
18    0.302933
19    0.335357
20    0.363060
21    0.385925
22    0.405890
23    0.423491
24    0.438070
25    0.450042
Name: state, dtype: float64

### Discarding weeks 17 onwards, where the removed share approaches 30% (26.8% at week 17, 30.3% at week 18)

In [19]:
# Keep weeks 0-16 only. `target` was computed before this filter, so week-16
# rows still carry the week-17 state as their target.
is_early_week = dataset_info['week'].lt(17)
dataset_info = dataset_info.loc[is_early_week]

In [20]:
# Number of rows still lacking a target; must be 0 before the integer cast.
dataset_info['target'].isna().sum()

0

In [22]:
# Cast the target to integer state codes (safe only once it contains no NaN).
# `dataset_info` is a row-filtered slice at this point, so assigning a column
# in place triggers pandas' SettingWithCopyWarning; `assign` builds a fresh
# frame instead and keeps the column in its original position.
dataset_info = dataset_info.assign(target=dataset_info['target'].astype(int))

In [23]:
def create_columns_map(relation_type):
    """Build the column-rename mapping for one relation type's ratio table.

    ``'week'`` is renamed to ``'actual_week'`` and each state code (-1, 0, 1,
    2, 3) to ``'<relation_type>_<state name>'``.
    """
    state_suffixes = {
        -1: 'removed',
        0: 'susceptible',
        1: 'exposed',
        2: 'infected',
        3: 'hospitalized',
    }
    columns_map = {'week': 'actual_week'}
    columns_map.update(
        (code, f'{relation_type}_{suffix}') for code, suffix in state_suffixes.items()
    )
    return columns_map

In [24]:
# For each relation type, compute per (week, simulation, group) the share of
# members in each `target` state, with one column per state.
resulting_ratios = {}

for relation_type in ['home', 'school', 'work', 'home_id']:
    print(relation_type)

    group_keys = ['week', 'simulation', relation_type]
    target_shares = dataset_info.groupby(group_keys)['target'].value_counts(normalize=True)
    value_counts_week = target_shares.reset_index(name='ratio')
    print("value counts done")

    # Long -> wide: one row per group, one column per target state (0 where absent).
    wide = value_counts_week.pivot_table(index=group_keys, columns='target',
                                         values='ratio', fill_value=0)
    pivot = wide.reset_index().rename(columns=create_columns_map(relation_type))
    print("Pivot Done")

    # `target` at week w is the state one row later, so re-key the shares to
    # week w + 1; `actual_week` keeps the week they were computed from.
    pivot['week'] = pivot['actual_week'] + 1

    resulting_ratios[relation_type] = pivot

home
value counts done
Pivot Done
school
value counts done
Pivot Done
work
value counts done
Pivot Done
home_id
value counts done
Pivot Done


In [25]:
# Confirm a ratio table was produced for every relation type.
resulting_ratios.keys()

dict_keys(['home', 'school', 'work', 'home_id'])

In [26]:
# Modelling rows: individuals that are currently susceptible (state 0) and have a known target.
has_target = dataset_info['target'].notna()
is_susceptible = dataset_info['state'] == 0
model_dataset = dataset_info[has_target & is_susceptible].copy()

In [27]:
# Fraction of dataset_info rows retained for modelling.
len(model_dataset) / len(dataset_info)

0.9113703061596584

In [28]:
# Join the per-group ratio features, one relation type at a time, on
# (week, simulation, <relation key>). These are inner joins, so rows with no
# matching group/week in a ratio table are dropped.
# NOTE(review): in the recorded run the final frame has 431,921 rows versus
# 23,084,672 simulation rows -- confirm that this much row loss is intended.
for relation_type in ['home', 'work', 'school', 'home_id']:
    ratio_features = resulting_ratios[relation_type].drop(columns="actual_week")
    model_dataset = model_dataset.merge(
        ratio_features,
        on=['week', 'simulation', relation_type]
    )

In [29]:
# Encode each action name by its position in this list (0-4).
# Action names not listed here map to NaN.
action_levels = [
    'Unrestricted',
    'Social Distancing',
    'Light Quarantine',
    'Hard Quarantine',
    'Lockdown',
]
action_to_level = {name: level for level, name in enumerate(action_levels)}
model_dataset['numerical_actions'] = model_dataset['action'].map(action_to_level)

Open question: should the positive class be only target state 1 (exposed), or any target state other than 0? For example, a person could already be hospitalized by the next snapshot, which arguably should also count as a 1.

In [38]:
# Binary label: 1 when the target state is exactly 1 (exposed), otherwise 0.
model_dataset['binary_target'] = model_dataset['target'].eq(1).astype(int)

In [36]:
# Temporal split: weeks before 11 are 'train', week 11 onwards is 'test'.
is_train_week = model_dataset['week'] < 11
model_dataset['train_test'] = is_train_week.map({True: 'train', False: 'test'})

In [39]:
# Positive-class rate per split and week, to see how the base rate changes over time.
model_dataset.groupby(['train_test', 'week'])['binary_target'].mean()

train_test  week
test        11      0.056490
            12      0.056132
            13      0.052174
            14      0.050541
            15      0.051214
            16      0.045794
train       1       0.002479
            2       0.003297
            3       0.006110
            4       0.010690
            5       0.016711
            6       0.019256
            7       0.021885
            8       0.027184
            9       0.036520
            10      0.051222
Name: binary_target, dtype: float64

In [40]:
# Persist the modelling dataset, discarding the index first.
model_dataset.reset_index(drop=True).to_parquet("model_dataset.parquet")

In [41]:
# Final (rows, columns) of the modelling dataset.
model_dataset.shape

(431921, 34)