In [1]:
import sys
sys.path.append("../../src/models")

In [2]:
import numpy as np
from actions import city_restrictions, costs

In [3]:
from simulate_pandemic import init_infection, spread_infection, lambda_leak_expose, update_population
from simulate_pandemic import main

In [4]:
from numpy.random import default_rng
import pandas as pd
from datetime import datetime
from tqdm import tqdm

In [5]:
from joblib import Parallel, delayed

In [6]:
individual_infos = pd.read_feather("../../data/interim/work_school_home_sp_esc.feather")

In [7]:
individual_infos['id'] = individual_infos.index + 1

In [8]:
individual_infos.head(10)

Unnamed: 0,home,school,work,id,work_x,work_y,school_x,school_y,home_x,home_y,home_id,idade,criteriobr,renda_fa,education,job_level,studies,private_healthcare
0,1.0,,3.0,1,333104.0,7394476.0,,,333743.0,7394463.0,10001,59.0,4.0,2732.57591,3.0,2.0,1.0,False
1,1.0,84.0,82.0,2,327503.0,7392159.0,329431.0,7395939.0,333743.0,7394463.0,10001,21.0,4.0,2732.57591,4.0,2.0,5.0,False
2,1.0,,1.0,3,333453.0,7394501.0,,,333814.0,7394428.0,10002,37.0,5.0,3200.0,4.0,2.0,1.0,False
3,1.0,,1.0,4,333539.0,7394387.0,,,333814.0,7394428.0,10002,19.0,5.0,3200.0,3.0,2.0,1.0,False
4,1.0,,26.0,5,332344.0,7393317.0,,,333814.0,7394428.0,10002,18.0,5.0,3200.0,3.0,2.0,1.0,False
5,1.0,,,6,,,,,333814.0,7394428.0,10002,43.0,5.0,3200.0,2.0,,1.0,False
6,1.0,,2.0,7,333585.0,7394842.0,,,333745.0,7394553.0,10003,36.0,4.0,7600.0,2.0,2.0,1.0,True
7,1.0,,2.0,8,333494.0,7395038.0,,,333745.0,7394553.0,10003,26.0,4.0,7600.0,3.0,2.0,1.0,True
8,1.0,,,9,,,,,333743.0,7394463.0,10005,67.0,5.0,1250.0,5.0,,1.0,False
9,1.0,,54.0,10,333029.0,7389527.0,,,333818.0,7394428.0,10008,42.0,3.0,3579.577301,4.0,2.0,1.0,False


In [9]:
def make_individual_df(week, sim, action, data):
    df = pd.DataFrame(data, columns=['id', 'state'])
    df['simulation'] = sim
    df['week'] = week
    df['action'] = action
    
    return df

In [10]:
def make_value_counts_df(week, sim, action, data):
    df = pd.DataFrame(pd.Series(data[:, 1]).value_counts()).T
    df['simulation'] = sim
    df['week'] = week
    df['action'] = action
    df = df.rename(columns = {
                              -1 : 'removed',
                               0 : 'susceptible',
                               1 : 'exposed',
                               2 : 'infected',
                               3 : 'hospitalized'
                              }
                  )
    
    return df

In [11]:
def simulate_and_save(policy_number, policy, n_sims, step_size=7, folder='generated_sims/'):
    dfs = []
    for sim in range(1, n_sims+1):
        
        sim_name = f'{policy_number}_{sim}'
                
        data, pop_matrix = main(gpickle_path=gpickle_path,
                                p_r=p_r,
                                policy=policy,
                                disable_tqdm=True,
                                days=step_size*len(policy),
                                step_size=step_size)
        
        weeks = (step_size/7) * len(policy)
        
        weekly_data = [data[i*7] for i in range(int(weeks))]
        df = pd.concat([make_individual_df(week, sim_name, action, data)
                             for week, (data, action) in enumerate(zip(weekly_data, policy))])
        #df.to_parquet(folder + f'{sim_name}.parquet')
        dfs += [df]
    return dfs

In [12]:
rng = default_rng(None)

gpickle_path = "../../data/processed/SP_multiGraph_Job_Edu_Level.gpickle"

prhome = 0.06
p_r = {
    'home'    :  prhome,
    'neighbor':  .1*prhome,
    'work'    :  .1*prhome,
    'school'  :  .15*prhome,
}

In [13]:
costs_keys = list(costs.keys())
costs_values = np.array(list(costs.values()))


### Cost Weighted

In [14]:
p = (1 - costs_values) / (1 - costs_values).sum()
policy = [rng.choice(costs_keys, size=int(364/14),
                     replace=True, p=p)
          for i in range(16)]

assert len(set([tuple(p) for p in policy])) == len(policy)

In [None]:
n_sims=1
res = Parallel(n_jobs=16)(delayed(simulate_and_save)(i, pol, n_sims) 
                              for i, pol in tqdm(enumerate(policy),
                                                 total=len(policy)))

100%|██████████| 16/16 [00:00<00:00, 75.38it/s]


In [None]:
res_df = pd.concat([r for rr in res for r in rr])

In [None]:
res_df.shape

In [None]:
res_df.head(5)

In [None]:
individual_infos.tail(10)

In [None]:
res_df['id'].max()

In [None]:
dataset_info = pd.merge(
    res_df,
    individual_infos[['id', 'home', 'school', 'work', 'idade', 'home_id']].astype(float),
    on='id'
).copy()

In [None]:
dataset_info.head(1)

In [None]:
dataset_info['target'] = dataset_info.groupby(['id', 'simulation'])['state'].shift(-1)

In [None]:
## Removing those without target & already removed (state == -1) at start

In [None]:
dataset_info.groupby("week")['target'].apply(lambda x: x.isna().sum())

In [None]:
dataset_info.groupby(['week'])['state'].apply(lambda x: (x == -1).sum() / len(x))

In [None]:
### Discarding weeks 16 onwards, due to more than 20% removed

In [None]:
dataset_info = dataset_info[dataset_info['week'] < 16]

In [None]:
dataset_info.target.isna().sum()

In [None]:
dataset_info['target'] = dataset_info['target'].astype(int)

In [None]:
def create_columns_map(relation_type):
    return {
        'week': 'actual_week',
        -1: f'{relation_type}_removed',
        0: f'{relation_type}_susceptible',
        1: f'{relation_type}_exposed',
        2: f'{relation_type}_infected',
        3: f'{relation_type}_hospitalized'
    }

In [None]:
resulting_ratios = {}

for relation_type in ['home', 'school', 'work', 'home_id']:
    print(relation_type)
    value_counts_week = (
        dataset_info
            .groupby(['week', 'simulation', relation_type])['target']
            .value_counts(normalize=True)
            .reset_index(name='ratio')
    )
    
    print("value counts done")
    pivot = (
        pd.pivot_table(data=value_counts_week, index=['week', 'simulation', relation_type],
                       columns='target', values='ratio', fill_value=0, )
          .reset_index()
          .rename(columns=create_columns_map(relation_type))
    )
    
    print("Pivot Done")
   
    pivot['week'] = pivot['actual_week'] + 1
    
    resulting_ratios[relation_type] = pivot

In [None]:
resulting_ratios.keys()

In [None]:
model_dataset = dataset_info[(dataset_info['target'].notna()) & dataset_info['state'] == 0].copy()

In [None]:
model_dataset.shape[0] / dataset_info.shape[0]

In [None]:
model_dataset = pd.merge(
    model_dataset,
    resulting_ratios['home'],
    on=['week', 'simulation', 'home']
).merge(
    resulting_ratios['work'],
    on=['week', 'simulation', 'work']
).merge(
    resulting_ratios['school'],
    on=['week', 'simulation', 'school']
).merge(
    resulting_ratios['home_id'],
    on=['week', 'simulation', 'home_id']
)

In [59]:
model_dataset.head(2)

Unnamed: 0,id,state,simulation,week,action,home,school,work,idade,home_id,...,school_infected,school_hospitalized,actual_week_y,home_id_removed,home_id_susceptible,home_id_exposed,home_id_infected,home_id_hospitalized,numerical_actions,binary_target
0,270,0,0_1,1,Social Distancing,3.0,24.0,3.0,37.0,30753.0,...,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,1,0
1,3782,0,0_1,1,Social Distancing,28.0,24.0,82.0,25.0,283697.0,...,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,1,0


In [None]:
model_dataset.columns.tolist()

In [None]:
model_dataset['numerical_actions'] = model_dataset['action'].map({
    'Unrestricted': 0,
    'Social Distancing':1,
    'Light Quarantine': 2,
    'Hard Quarantine':3,
    'Lockdown':4
})

In [None]:
features = [
    'numerical_actions',
    'home_removed',
    'home_susceptible',
    'home_exposed',
    'home_infected',
    'home_hospitalized',
    'work_removed',
    'work_susceptible',
    'work_exposed',
    'work_infected',
    'work_hospitalized',
    'school_removed',
    'school_susceptible',
    'school_exposed',
    'school_infected',
    'school_hospitalized',
    'home_id_removed',
    'home_id_susceptible',
    'home_id_exposed',
    'home_id_infected',
    'home_id_hospitalized'
]

In [None]:
model_dataset['binary_target'] = (model_dataset['target'] == 1).astype(int)

In [None]:
train = model_dataset[model_dataset['week'] < 12].copy()
test = model_dataset[model_dataset['week'] >= 12].copy()

In [None]:
train['binary_target'].value_counts(normalize=True), test['binary_target'].value_counts(normalize=True)

In [None]:
import lightgbm as lgbm
from sklearn.metrics import roc_auc_score

default_params = {
    "n_jobs": 16,
    "n_estimators": 100, 
    "objective": 'binary',
    'verbosity': -1,
    "metric": 'binary_logloss',
    'boosting_type': 'gbdt'
}

base_lgbm = lgbm.LGBMClassifier(**default_params)

In [None]:
features_without_action = [f for f in features if f != 'numerical_actions']
clf = base_lgbm.fit(train[features_without_action], train['binary_target'])
roc_auc_score(test['binary_target'], clf.predict_proba(test[features_without_action])[:, 1])

In [68]:
clf = base_lgbm.fit(train[features], train['binary_target'])
roc_auc_score(test['binary_target'], clf.predict_proba(test[features])[:, 1])

0.7572184493867181

In [61]:
pd.DataFrame(clf.feature_importances_, index=features).sort_values(0, ascending=False)

Unnamed: 0,0
work_infected,236
school_exposed,229
school_infected,211
home_infected,204
work_exposed,195
home_exposed,192
home_removed,181
work_susceptible,178
home_susceptible,166
school_removed,165


## Loading Graph

In [None]:
import networkx as nx

In [None]:
G = nx.read_gpickle(gpickle_path)

## Topological Features

In [None]:
deg_cent = nx.degree_centrality(G)

In [None]:
eigenvector_cent = nx.centrality.eigenvector_centrality_numpy(G)

In [69]:
train['degree_cent'] = train['id'].map(deg_cent)
test['degree_cent'] = test['id'].map(deg_cent)

train['eigen_cent'] = train['id'].map(eigenvector_cent)
test['eigen_cent'] = test['id'].map(eigenvector_cent)

In [70]:
features_topological = ['degree_cent', 'eigen_cent'] + features
clf = base_lgbm.fit(train[features_topological], train['binary_target'])
roc_auc_score(test['binary_target'], clf.predict_proba(test[features_topological])[:, 1])

0.7820058111727032

In [71]:
pd.DataFrame(clf.feature_importances_, index=features_topological).sort_values(0, ascending=False)

Unnamed: 0,0
degree_cent,293
eigen_cent,213
school_infected,192
work_infected,190
school_exposed,178
work_exposed,170
work_susceptible,164
home_exposed,164
home_infected,154
work_removed,147


## Scenario where only hospitalized are seen

In [72]:
unseen = [
    f for f in features_topological 
    if 'exposed' not in f 
    and 'susceptible' not in f
    and 'removed' not in f
    and 'infected' not in f
]
unseen_no_graph = [f for f in unseen if 'cent' not in f]

In [73]:
clf = base_lgbm.fit(train[unseen], train['binary_target'])
roc_auc_score(test['binary_target'], clf.predict_proba(test[unseen])[:, 1])

0.6591599883605641

In [74]:
pd.DataFrame(clf.feature_importances_, index=unseen).sort_values(0, ascending=False)

Unnamed: 0,0
degree_cent,774
eigen_cent,730
home_hospitalized,504
work_hospitalized,470
numerical_actions,293
school_hospitalized,196
home_id_hospitalized,33


In [75]:
clf = base_lgbm.fit(train[unseen_no_graph], train['binary_target'])
roc_auc_score(test['binary_target'], clf.predict_proba(test[unseen_no_graph])[:, 1])

0.6242559057042177

In [76]:
pd.DataFrame(clf.feature_importances_, index=unseen_no_graph).sort_values(0, ascending=False)

Unnamed: 0,0
home_hospitalized,1206
work_hospitalized,979
school_hospitalized,450
numerical_actions,302
home_id_hospitalized,63


## Node to Vec

In [78]:
from node2vec import Node2Vec

In [80]:
# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(G, dimensions=8, walk_length=5, num_walks=20, workers=1)  # Use temp_folder for big graphs

# Embed nodes
model = node2vec.fit(window=4, min_count=1, )#batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

Computing transition probabilities:   0%|          | 0/55492 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 20/20 [00:52<00:00,  2.65s/it]


In [81]:
index_to_key = model.wv.index_to_key

In [82]:
#model_dataset['id'].apply()
normed_vectors = model.wv.get_normed_vectors()

In [83]:
normed_vectors.shape

(55492, 8)

In [84]:
n2v_features = [f"n2v_{i}" for i in range(normed_vectors.shape[1])]

In [85]:
n2v_embd = pd.DataFrame(normed_vectors, columns=n2v_features)
n2v_embd['id'] = pd.Series(model.wv.index_to_key).astype(int)

In [86]:
train = pd.merge(
    train, 
    n2v_embd,
    on='id'
)

test = pd.merge(
    test, 
    n2v_embd,
    on='id'
)

In [87]:
clf = base_lgbm.fit(train[unseen_no_graph], train['binary_target'])
roc_auc_score(test['binary_target'], clf.predict_proba(test[unseen_no_graph])[:, 1])

0.6252929158240018

In [88]:
clf = base_lgbm.fit(train[unseen], train['binary_target'])
roc_auc_score(test['binary_target'], clf.predict_proba(test[unseen])[:, 1])

0.6593003990117399

In [89]:
clf = base_lgbm.fit(train[unseen+n2v_features], train['binary_target'])
roc_auc_score(test['binary_target'], clf.predict_proba(test[unseen+n2v_features])[:, 1])

0.6601968679283511

In [90]:
pd.DataFrame(clf.feature_importances_, index=unseen+n2v_features).sort_values(0, ascending=False)

Unnamed: 0,0
home_hospitalized,294
degree_cent,277
numerical_actions,273
work_hospitalized,260
n2v_5,240
n2v_1,217
eigen_cent,216
n2v_0,209
n2v_2,188
n2v_4,185
