In [2]:
import json
import re
import warnings

import pandas as pd
import networkx as nx
import numpy as np

from difflib import SequenceMatcher
from tqdm.notebook import tqdm

In [3]:
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas deprecation warnings (e.g. for
# `DataFrame.append`, used further down) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Bucharest dataset

In [4]:
# Referee/referral table, plus the larger patient table restricted to rows
# that actually carry a `covid_patient` id.
bucharest = pd.read_excel('../Data/Raw/replication_data_clean_version.xlsx')
bucharest_big = (
    pd.read_excel('../Data/Raw/short database COVID_20.09.21_v3.xlsx')
    .dropna(subset=['covid_patient'])
)

In [4]:
# (rows, columns) of both raw tables.
bucharest.shape, bucharest_big.shape

((13377, 33), (46269, 40))

In [12]:
# Build the Bucharest node list: one attribute record per distinct person id.
#
# Records are collected in a plain list and turned into a DataFrame once at the
# end. Growing a DataFrame with `.append` inside the loop copies the whole frame
# on every iteration (quadratic) and `DataFrame.append` was removed in pandas 2.0.
node_columns = ['node_id', 'age', 'gender', 'medical', 'isco08_code', 'isco08_label']

# Source column for each node attribute, for the two people in every `bucharest` row.
contact_roles = (
    {'node_id': 'REFEREE_CODE_HARVARD', 'age': 'REFEREE_AGE', 'gender': 'REFEREE_SEX_MALE_1',
     'medical': 'REFEREE_medical_yes_no_not_active',
     'isco08_code': 'REFEREE_ISCO08_1_digit_code', 'isco08_label': 'REFEREE_ISCO08_1_digit_label'},
    {'node_id': 'REFERRAL_CODE_HARVARD', 'age': 'REFERRAL_AGE', 'gender': 'REFERRAL_SEX_1_MALE',
     'medical': 'REFERRAL_medical_yes_no_not_active',
     'isco08_code': 'REFERRAL_ISCO08_1_digit_code', 'isco08_label': 'REFERRAL_ISCO08_1_digit_label'},
)

node_records = []
unique_nodes = set()

for _, row in tqdm(bucharest.iterrows(), total=len(bucharest)):
    for role in contact_roles:
        ref = str(row[role['node_id']])
        if ref in unique_nodes:
            continue
        # Register the id immediately so that a row whose referee and referral
        # share the same code cannot produce two records.
        unique_nodes.add(ref)
        record = {column: row[source] for column, source in role.items()}
        record['node_id'] = ref
        node_records.append(record)

print(len(node_records), len(unique_nodes))

# Patients from the larger table that were not seen above; this table has no
# `medical` information, so the attribute is left missing.
for _, row in tqdm(bucharest_big.iterrows(), total=len(bucharest_big)):
    ref = str(int(row['covid_patient']))
    if ref in unique_nodes:
        continue
    unique_nodes.add(ref)
    node_records.append({'node_id': ref, 'age': row['age'], 'gender': row['sex'], 'medical': np.nan,
                         'isco08_code': row['ISCO08_1_digit_code'], 'isco08_label': row['ISCO08_1_digit_label']})

bucharest_nodelist = pd.DataFrame(node_records, columns=node_columns)

print(bucharest_nodelist.shape[0], len(unique_nodes))

0it [00:00, ?it/s]

19713 19713


0it [00:00, ?it/s]

57835 57835


**Sanity check**:

- out of 19713 nodes in bucharest, 6895 referees and 1811 referrals are included in bucharest_big
- that means that out of the 46269 entries in bucharest_big, 6895 + 1811 = 8706 entries should be skipped, leaving 46269 - 8706 = 37563 new nodes
- therefore, the complete nodelist should contain 19713 nodes from bucharest and 37563 from bucharest_big: 19713 + 37563 = 57276
- **Note:** the cell above actually reports 57835 nodes, i.e. 559 more than expected; this difference still needs to be explained.

In [13]:
# Cast the node ids to strings.
bucharest_nodelist['node_id'] = bucharest_nodelist['node_id'].astype('str')

In [17]:
# Inspect the Python type of the second distinct `medical` value
# (presumably the missing-value marker, which would explain the float).
type(bucharest_nodelist['medical'].unique()[1])

float

In [22]:
# Collapse `medical` to Yes / No: every present value other than 'Yes'
# becomes 'No'; missing values stay missing.
bucharest_nodelist['medical'] = bucharest_nodelist['medical'].mask(
    lambda values: values.notna() & (values != 'Yes'),
    'No',
)
bucharest_nodelist['medical'].unique()

array(['No', nan, 'Yes'], dtype=object)

In [23]:
# Preview the assembled node list.
bucharest_nodelist

Unnamed: 0,node_id,age,gender,medical,isco08_code,isco08_label
0,179373,65.0,1,No,NAP,Not Active - pensioner
1,1S179373,63.0,,,,
2,146179,40.0,1,No,EMP,Employee - unknown group
3,235990,32.0,2.0,,,
4,3S146179,3.0,,No,NAC,Not Active - Child
...,...,...,...,...,...,...
57830,19,32.0,male,,,
57831,18,30.0,female,,,
57832,16,60.0,male,,,
57833,14,42.0,female,,,


In [5]:
# Start the edge list from the observed referee -> referral pairs: one row per
# `bucharest` record, with the columns renamed to the common edge-list schema
# (No, Date, Referee, Referral).
#
# The sanity check and the saved outputs further down (13377 edges before the
# isolated patients are added, a `No` column in the result) only hold for this
# construction; starting from an empty frame would yield an edge list without
# any edges. The unfinished row-by-row scratch loop that followed this cell has
# been removed: it appended a meaningless `pd.Series({'R'})` once per row.
#
# NOTE(review): `Date` is later formatted with `.strftime`, so
# REFEREE_CONFIRMATION_DAY is assumed to be parsed as datetimes by
# `read_excel` — confirm.
bucharest_edgelist = (
    bucharest[['NR', 'REFEREE_CONFIRMATION_DAY', 'REFEREE_CODE_HARVARD', 'REFERRAL_CODE_HARVARD']]
    .rename(columns={'NR': 'No', 'REFEREE_CONFIRMATION_DAY': 'Date',
                     'REFEREE_CODE_HARVARD': 'Referee', 'REFERRAL_CODE_HARVARD': 'Referral'})
    .copy()
)

In [37]:
# Add one "non-edge" row (Referral = NaN) for every patient in `bucharest_big`
# that never appears as referee or referral in `bucharest`, so that isolated
# nodes are represented in the edge list as well.
#
# The rows are selected with a vectorised filter and concatenated once instead
# of calling `DataFrame.append` per row (quadratic, and removed in pandas 2.0).
referees = set(bucharest['REFEREE_CODE_HARVARD'].astype('str'))
referrals = set(bucharest['REFERRAL_CODE_HARVARD'].astype('str'))

# Same id normalisation as elsewhere in the notebook: integer-valued id -> str.
patient_ids = bucharest_big['covid_patient'].map(lambda value: str(int(value)))
isolated_ids = patient_ids[~patient_ids.isin(referees | referrals)]

isolated_rows = pd.DataFrame({
    'No': np.nan,
    # Placeholder date used for patients without a recorded referral.
    'Date': pd.to_datetime('2020-01-01'),
    'Referee': isolated_ids.to_numpy(),
    'Referral': np.nan,
})

bucharest_edgelist = pd.concat([bucharest_edgelist, isolated_rows], ignore_index=True)

0it [00:00, ?it/s]

In [38]:
# (rows, columns) of the edge list after adding the isolated patients.
bucharest_edgelist.shape

(51499, 4)

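**Sanity check**:

- bucharest_edgelist has 13377 entries before the isolated patients are added
- 8706 entries from bucharest_big are already included in bucharest_edgelist, so no non-edge rows need to be added for them
- that leaves 46269 - 8706 = 37563 rows to add to the edgelist: 13377 + 37563 = 50940
- **Note:** the observed length is 51499, i.e. 559 more than expected; the nodelist shows the same gap (57835 observed vs. 57276 expected).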

In [39]:
# Order chronologically, then store each date as a compact 'yymmdd' string.
# Not re-runnable: after this cell `Date` holds strings, which have no `.strftime`.
bucharest_edgelist = bucharest_edgelist.sort_values(by='Date')
bucharest_edgelist['Date'] = [stamp.strftime('%y%m%d') for stamp in bucharest_edgelist['Date']]
bucharest_edgelist

Unnamed: 0,No,Date,Referee,Referral
25749,,200101,222208,
34326,,200101,143830,
34327,,200101,143824,
34328,,200101,143823,
34329,,200101,143821,
...,...,...,...,...
12815,13983.0,201031,251363,13983S251363
10440,11400.0,201031,252010,11400S252010
8904,9721.0,201031,252092,9721S252092
5358,5800.0,201031,243980,5800S243980


In [40]:
# Persist the edge list; the DataFrame index is written too (no `index=False`).
bucharest_edgelist.to_csv('../Data/Preprocessed/bucharest_edgelist.csv')

In [24]:
# Persist the node list; the DataFrame index is written too (no `index=False`).
bucharest_nodelist.to_csv('../Data/Preprocessed/bucharest_nodelist.csv')

# Yunnan dataset

In [3]:
# The Stata file holds both provinces; `tmp` keeps the full table because the
# Hainan section further down filters it again.
tmp = pd.read_stata('../Data/Raw/Yunnan & Hainan/Contact Network of Hainan and Yunnan.dta')
yunnan = tmp[tmp['Province'].eq('Yunnan')]

In [4]:
# Keep the case attributes plus the nine tie columns (ties1 … ties9).
yunnan = yunnan.loc[:, ['No', 'Date', 'Age', 'Gender', 'relatives',
                        *(f'ties{x}' for x in range(1, 10))]]

In [5]:
# Empty node-list skeleton defining the output columns.
yunnan_nodelist = pd.DataFrame(columns=['node_id', 'age', 'gender', 'relatives'])

In [7]:
# One node record per distinct case number (`No`), in first-seen order.
# Records are collected in a list and the frame is built once: per-row
# `DataFrame.append` is quadratic and no longer exists in pandas >= 2.0.
# Rebuilding the frame from scratch also makes this cell safe to re-run.
unique_nodes = set()
node_records = []

for _, row in yunnan.iterrows():
    if row['No'] in unique_nodes:
        continue
    unique_nodes.add(row['No'])
    node_records.append({'node_id': row['No'], 'age': row['Age'],
                         'gender': row['Gender'], 'relatives': row['relatives']})

yunnan_nodelist = pd.DataFrame(node_records, columns=['node_id', 'age', 'gender', 'relatives'])

In [8]:
# Number of distinct case numbers vs. shape of the Yunnan table.
len(unique_nodes), yunnan.shape

(171, (171, 14))

In [6]:
# Empty edge-list skeleton with the three output columns.
yunnan_edgelist = pd.DataFrame(columns=['Referee', 'Referral', 'Date'], dtype='int')

In [9]:
# For every case emit a "non-edge" row (Referral = NaN) so that isolated cases
# stay in the edge list, followed by one row per recorded tie (ties1 … ties9).
# Rows are collected in a list and the frame is built once instead of calling
# `DataFrame.append` per row (quadratic, removed in pandas 2.0).
edge_records = []

for _, row in yunnan.iterrows():
    edge_records.append({'Referee': row['No'], 'Referral': np.nan, 'Date': row['Date']})
    for t in range(1, 10):
        tie = row[f'ties{t}']
        if pd.notna(tie):
            edge_records.append({'Referee': row['No'], 'Referral': tie, 'Date': row['Date']})

yunnan_edgelist = pd.DataFrame(edge_records, columns=['Referee', 'Referral', 'Date'])

In [10]:
# Cast every column to the nullable integer dtype so ids print without '.0'
# while missing referrals stay missing; columns that cannot be cast are left as is.
yunnan_edgelist = yunnan_edgelist.assign(**{
    name: yunnan_edgelist[name].astype('Int64', errors='ignore')
    for name in yunnan_edgelist.columns
})

In [11]:
# Preview the edge list after the integer cast.
yunnan_edgelist

Unnamed: 0,Referee,Referral,Date
0,1,,20200117
1,1,10,20200117
2,2,,20200122
3,3,,20200124
4,4,,20200124
...,...,...,...
270,170,156,20200216
271,171,,20200216
272,171,155,20200216
273,171,156,20200216


In [12]:
# Parse the yyyymmdd values into real timestamps and order the rows chronologically.
dt = pd.to_datetime(yunnan_edgelist['Date'], format='%Y%m%d')
yunnan_edgelist = yunnan_edgelist.assign(Date=dt).sort_values(by='Date')

In [13]:
# Preview the chronologically sorted edge list.
yunnan_edgelist

Unnamed: 0,Referee,Referral,Date
0,1,,2020-01-17
1,1,10,2020-01-17
2,2,,2020-01-22
3,3,,2020-01-24
4,4,,2020-01-24
...,...,...,...
273,171,156,2020-02-16
269,170,155,2020-02-16
268,170,,2020-02-16
270,170,156,2020-02-16


In [14]:
# Store each date as a compact 'yymmdd' string.
yunnan_edgelist['Date'] = [stamp.strftime('%y%m%d') for stamp in yunnan_edgelist['Date']]

In [15]:
# Persist the edge list; the DataFrame index is written too (no `index=False`).
yunnan_edgelist.to_csv('../Data/Preprocessed/yunnan_edgelist.csv')

In [16]:
# Persist the node list; the DataFrame index is written too (no `index=False`).
yunnan_nodelist.to_csv('../Data/Preprocessed/yunnan_nodelist.csv')

# Hainan dataset

In [17]:
# Hainan rows of the combined table loaded in the Yunnan section (`tmp`).
hainan = tmp[tmp['Province'].eq('Hainan')]

In [18]:
# Keep the case attributes plus the nine tie columns (ties1 … ties9).
hainan = hainan.loc[:, ['No', 'Date', 'Age', 'Gender', 'relatives',
                        *(f'ties{x}' for x in range(1, 10))]]

In [20]:
# Empty node-list skeleton defining the output columns.
hainan_nodelist = pd.DataFrame(columns=['node_id', 'age', 'gender', 'relatives'])

In [21]:
# One node record per distinct case number (`No`), in first-seen order.
# Records are collected in a list and the frame is built once: per-row
# `DataFrame.append` is quadratic and no longer exists in pandas >= 2.0.
# Rebuilding the frame from scratch also makes this cell safe to re-run.
unique_nodes = set()
node_records = []

for _, row in hainan.iterrows():
    if row['No'] in unique_nodes:
        continue
    unique_nodes.add(row['No'])
    node_records.append({'node_id': row['No'], 'age': row['Age'],
                         'gender': row['Gender'], 'relatives': row['relatives']})

hainan_nodelist = pd.DataFrame(node_records, columns=['node_id', 'age', 'gender', 'relatives'])

In [22]:
# Number of distinct case numbers vs. shape of the Hainan table.
len(unique_nodes), hainan.shape

(162, (162, 14))

In [19]:
# Empty edge-list skeleton with the three output columns.
hainan_edgelist = pd.DataFrame(columns=['Referee', 'Referral', 'Date'], dtype='int')

In [23]:
# For every case emit a "non-edge" row (Referral = NaN) so that isolated cases
# stay in the edge list, followed by one row per recorded tie (ties1 … ties9).
# Rows are collected in a list and the frame is built once instead of calling
# `DataFrame.append` per row (quadratic, removed in pandas 2.0).
edge_records = []

for _, row in hainan.iterrows():
    edge_records.append({'Referee': row['No'], 'Referral': np.nan, 'Date': row['Date']})
    for t in range(1, 10):
        tie = row[f'ties{t}']
        if pd.notna(tie):
            edge_records.append({'Referee': row['No'], 'Referral': tie, 'Date': row['Date']})

hainan_edgelist = pd.DataFrame(edge_records, columns=['Referee', 'Referral', 'Date'])

In [24]:
# Cast every column to the nullable integer dtype so ids print without '.0'
# while missing referrals stay missing; columns that cannot be cast are left as is.
hainan_edgelist = hainan_edgelist.assign(**{
    name: hainan_edgelist[name].astype('Int64', errors='ignore')
    for name in hainan_edgelist.columns
})

In [25]:
# Preview the edge list after the integer cast.
hainan_edgelist

Unnamed: 0,Referee,Referral,Date
0,3,,20200122
1,1,,20200122
2,4,,20200122
3,2,,20200122
4,7,,20200123
...,...,...,...
279,160,158,20200214
280,159,,20200214
281,159,37,20200214
282,159,76,20200214


In [26]:
# Parse the yyyymmdd values, order chronologically, then store each date as a
# compact 'yymmdd' string.
hainan_edgelist = (
    hainan_edgelist
    .assign(Date=pd.to_datetime(hainan_edgelist['Date'], format='%Y%m%d'))
    .sort_values(by='Date')
)
hainan_edgelist['Date'] = [stamp.strftime('%y%m%d') for stamp in hainan_edgelist['Date']]
hainan_edgelist['Date']

0      200122
1      200122
2      200122
3      200122
7      200123
        ...  
264    200214
263    200214
282    200214
272    200214
283    200214
Name: Date, Length: 284, dtype: object

In [27]:
# Persist the edge list; the DataFrame index is written too (no `index=False`).
hainan_edgelist.to_csv('../Data/Preprocessed/hainan_edgelist.csv')

In [28]:
# Persist the node list; the DataFrame index is written too (no `index=False`).
hainan_nodelist.to_csv('../Data/Preprocessed/hainan_nodelist.csv')

# Shanxi dataset

In [29]:
# Load the Shanxi contact data (Stata format).
shanxi = pd.read_stata('../Data/Raw/Shanxi/shannxi.dta')

In [30]:
# Keep the case attributes plus the four tie columns (ties1 … ties4).
shanxi = shanxi.loc[:, ['No', 'Date', 'Age', 'Gender', 'Hukou', 'relatives',
                        *(f'ties{x}' for x in range(1, 5))]]

In [32]:
# Empty node-list skeleton defining the output columns.
shanxi_nodelist = pd.DataFrame(columns=['node_id', 'age', 'gender', 'hukou', 'relatives'])

In [33]:
# One node record per distinct case number (`No`), in first-seen order.
# Records are collected in a list and the frame is built once: per-row
# `DataFrame.append` is quadratic and no longer exists in pandas >= 2.0.
# Rebuilding the frame from scratch also makes this cell safe to re-run.
unique_nodes = set()
node_records = []

for _, row in shanxi.iterrows():
    if row['No'] in unique_nodes:
        continue
    unique_nodes.add(row['No'])
    node_records.append({'node_id': row['No'], 'age': row['Age'], 'gender': row['Gender'],
                         'relatives': row['relatives'], 'hukou': row['Hukou']})

shanxi_nodelist = pd.DataFrame(node_records, columns=['node_id', 'age', 'gender', 'hukou', 'relatives'])

In [35]:
# Distinct case numbers vs. shapes of the Shanxi table and its node list.
len(unique_nodes), shanxi.shape, shanxi_nodelist.shape

(237, (237, 10), (237, 5))

In [36]:
# Empty edge-list skeleton with the three output columns.
shanxi_edgelist = pd.DataFrame(columns=['Referee', 'Referral', 'Date'], dtype='int')

In [37]:
# For every case emit a "non-edge" row (Referral = NaN) so that isolated cases
# stay in the edge list, followed by one row per recorded tie (ties1 … ties4).
# Rows are collected in a list and the frame is built once instead of calling
# `DataFrame.append` per row (quadratic, removed in pandas 2.0).
edge_records = []

for _, row in shanxi.iterrows():
    edge_records.append({'Referee': row['No'], 'Referral': np.nan, 'Date': row['Date']})
    for t in range(1, 5):
        tie = row[f'ties{t}']
        if pd.notna(tie):
            edge_records.append({'Referee': row['No'], 'Referral': tie, 'Date': row['Date']})

shanxi_edgelist = pd.DataFrame(edge_records, columns=['Referee', 'Referral', 'Date'])

In [38]:
# Cast every column to the nullable integer dtype so ids print without '.0'
# while missing referrals stay missing; columns that cannot be cast are left as is.
shanxi_edgelist = shanxi_edgelist.assign(**{
    name: shanxi_edgelist[name].astype('Int64', errors='ignore')
    for name in shanxi_edgelist.columns
})

In [39]:
# Preview the edge list after the integer cast.
shanxi_edgelist

Unnamed: 0,Referee,Referral,Date
0,1,,20200123
1,2,,20200123
2,3,,20200123
3,4,,20200124
4,5,,20200124
...,...,...,...
357,236,235,20200216
358,237,,20200216
359,237,234,20200216
360,237,235,20200216


In [40]:
# Parse the yyyymmdd values, order chronologically, then store each date as a
# compact 'yymmdd' string.
shanxi_edgelist = (
    shanxi_edgelist
    .assign(Date=pd.to_datetime(shanxi_edgelist['Date'], format='%Y%m%d'))
    .sort_values(by='Date')
)
shanxi_edgelist['Date'] = [stamp.strftime('%y%m%d') for stamp in shanxi_edgelist['Date']]
shanxi_edgelist['Date']

0      200123
1      200123
2      200123
3      200124
4      200124
        ...  
356    200216
357    200216
358    200216
359    200216
361    200216
Name: Date, Length: 362, dtype: object

In [41]:
# Persist the edge list; the DataFrame index is written too (no `index=False`).
shanxi_edgelist.to_csv('../Data/Preprocessed/shanxi_edgelist.csv')

In [42]:
# Persist the node list; the DataFrame index is written too (no `index=False`).
shanxi_nodelist.to_csv('../Data/Preprocessed/shanxi_nodelist.csv')

# Xian dataset

In [13]:
# Xian ships as two CSV files: a node table and a Source/Target edge table.
xian_nodes = pd.read_csv('../Data/Raw/Xian/xian.csv')
xian_edges = pd.read_csv('../Data/Raw/Xian/xian_edges.csv')

In [14]:
# Bring the raw Source/Target pairs into the common Referee/Referral schema.
xian_edgelist = xian_edges.loc[:, ['Source', 'Target']]
xian_edgelist.columns = ['Referee', 'Referral']
xian_edgelist

Unnamed: 0,Referee,Referral
0,3,1
1,5,4
2,6,5
3,7,5
4,8,5
...,...,...
754,2027,1792
755,2028,172
756,2035,1792
757,2036,1792


In [15]:
# Every node id that takes part in at least one edge (on either side).
nodes_present = set(xian_edgelist['Referee']) | set(xian_edgelist['Referral'])
len(nodes_present)

1085

In [16]:
# Append one "non-edge" row (Referral = NaN) for every node in `xian_nodes`
# that does not occur in any edge, so isolated nodes stay in the edge list.
# The ids are selected with `isin` and concatenated once; calling `pd.concat`
# per row inside a loop copies the whole frame every time (quadratic).
isolated_ids = xian_nodes.loc[~xian_nodes['Id'].isin(nodes_present), 'Id']
isolated_rows = pd.DataFrame({'Referee': isolated_ids.to_numpy(), 'Referral': np.nan})

xian_edgelist = pd.concat([xian_edgelist, isolated_rows], axis='index', ignore_index=True)

xian_edgelist

Unnamed: 0,Referee,Referral
0,3.0,1.0
1,5.0,4.0
2,6.0,5.0
3,7.0,5.0
4,8.0,5.0
...,...,...
1719,2046.0,
1720,2047.0,
1721,2048.0,
1722,2049.0,


In [17]:
# Nullable integers keep the ids free of '.0' while allowing missing referrals.
xian_edgelist = xian_edgelist.astype({'Referee': 'Int64', 'Referral': 'Int64'})
xian_edgelist

Unnamed: 0,Referee,Referral
0,3,1
1,5,4
2,6,5
3,7,5
4,8,5
...,...,...
1719,2046,
1720,2047,
1721,2048,
1722,2049,


In [18]:
# Persist the edge list; the DataFrame index is written too (no `index=False`).
xian_edgelist.to_csv('../Data/Preprocessed/xian_edgelist.csv')

# China dataset

In [43]:
# Load the nationwide case table; bytes that cannot be decoded are replaced
# instead of raising (`encoding_errors='replace'`).
china = pd.read_csv('../Data/Raw/dataset_EN.csv', encoding_errors='replace')

In [44]:
# Restrict the table to the columns used for the node and edge lists.
china = china.loc[:, [
    'ID', 'Date_Disclose', 'Age', 'Gender',
    'Place of Residency', 'Place and Event', 'Venue', 'With Whom',
    'Contact_ID_Relationship', 'Symptom', 'Symptom_Severity', 'Place_Admission',
]]

In [45]:
# Preview the reduced table.
china

Unnamed: 0,ID,Date_Disclose,Age,Gender,Place of Residency,Place and Event,Venue,With Whom,Contact_ID_Relationship,Symptom,Symptom_Severity,Place_Admission
0,Anhui_Anqing-1,2020/1/27,49,Male,,Wuhan Travel,,,,,,Anhui_Anqing
1,Anhui_Anqing-10,2020/1/29,54,Male,,Hainan Work,,,anqing-14 (wife) anqing-10 (husband),Somatosensory Related-Sign Description,Stable,Anhui_Anqing
2,Anhui_Anqing-11,2020/1/29,50,Female,Hubei_Wuhan,Wuhan Travel,,,,Respiratory System Related-Symptom Description,Stable,Anhui_Anqing
3,Anhui_Anqing-12,2020/1/29,46,Male,,Yellowstone Work,,,,Somatosensory Related-Sign Description,Stable,Anhui_Anqing
4,Anhui_Anqing-13,2020/1/29,58,Male,Hubei_Wuhan,Work In Wuhan,,,,,Stable,Anhui_Anqing
...,...,...,...,...,...,...,...,...,...,...,...,...
27199,Zhejiang_Ningbo-256,1/7/2022,,Female,Zhejiang-Ningbo,,,,,,,Zhejiang-Ningbo
27200,Zhejiang_Ningbo-257,1/7/2022,,Female,Zhejiang-Ningbo,,,,,,,Zhejiang-Ningbo
27201,Zhejiang_Ningbo-258,1/7/2022,,Female,Zhejiang-Ningbo,,,,,,,Zhejiang-Ningbo
27202,Chongqing-22,1/8/2022,,,,,,,,,,Chongqing


## Drop entries with invalid date

In [46]:
# Parse the disclosure dates (unparseable values become NaT) and keep only rows
# dated on or before 2022-12-01. Comparisons with NaT are False, so the single
# mask below removes both the unparseable and the too-late entries.
china = china.assign(
    Date_Disclose=pd.to_datetime(china['Date_Disclose'], infer_datetime_format=True, errors='coerce')
)
china = china[china['Date_Disclose'] <= pd.to_datetime('2022/12/01')]

## Build edgelist

In [47]:
# Collect the distinct place names found in `Place_Admission`.
# Each value is normalised (spaces -> '_', runs of '_' collapsed, one leading and
# one trailing '_' stripped, commas removed). A value that splits into exactly two
# parts on '_' contributes both parts; anything else is added whole.
places = set()
for place in china['Place_Admission'].unique():
    if not isinstance(place, str):
        # Missing values (NaN) carry no place name.
        continue

    string = re.sub('_+', '_', place.replace(' ', '_'))
    string = re.sub('^_', '', string)
    string = re.sub('_$', '', string)
    string = re.sub(',', '', string)

    parts = string.split('_')
    if len(parts) == 2:
        places.update(parts)
    else:
        places.add(string)

In [48]:
# Empty node-list skeleton defining the output columns.
china_nodelist = pd.DataFrame(columns=['node_id', 'age', 'gender', 'residency', 'place_event', 'possible_source', 'symptom', 'symptom_severity', 'place_admission'])

In [49]:
# Empty edge-list skeleton with the three output columns.
china_edgelist = pd.DataFrame(columns=['Referee', 'Referral', 'Date'])

In [50]:
# Build the China node list and edge list in a single pass over `china`.
#
# * Node ids are the trailing "<place>-<number>" part of `ID`, lower-cased and
#   with '_' replaced by '-'. Rows whose ID does not match are printed and skipped.
# * Every row yields a "non-edge" row (Referral = NaN) plus one edge per contact id
#   found in `Contact_ID_Relationship` that can be resolved to a known ID.
# * Contacts that cannot be resolved are collected in `failed`.
#
# Changes from the previous version, with results unchanged:
# * records are gathered in lists and the frames are built once (per-row
#   `DataFrame.append` is quadratic and was removed in pandas 2.0);
# * the lower-cased ID list is computed once instead of once per contact, and each
#   distinct contact is resolved only once;
# * the unused `referral` lookup, a full scan per contact, is gone;
# * the pattern is a raw string, which avoids invalid-escape warnings.
#
# NOTE(review): `[-|_]` also accepts a literal '|' as separator — probably unintended.
exp = re.compile(r'(?:\w+\_)?(\w+[-|_]\d+)', re.IGNORECASE) # match abc_def[-|_]012 -> capture def[-|_]012
failed = set()
unique_nodes = set()

node_records = []
edge_records = []

lowered_ids = [raw_id.lower() for raw_id in china['ID']]
resolved = {}  # normalised contact -> matched node id, or None when nothing matched

for _, row in tqdm(china.iterrows(), total=len(china)):
    id_parts = exp.findall(row['ID'])
    if not id_parts:
        print(row['ID'])
        continue
    row_id = id_parts[0].lower().replace('_', '-') # strip first part before _ from ID

    if row_id not in unique_nodes:
        unique_nodes.add(row_id)
        node_records.append({'node_id': row_id, 'age': row['Age'], 'gender': row['Gender'],
                             'residency': row['Place of Residency'], 'place_event': row['Place and Event'],
                             'possible_source': row['With Whom'], 'symptom': row['Symptom'],
                             'symptom_severity': row['Symptom_Severity'], 'place_admission': row['Place_Admission']})

    edge_records.append({'Referee': row_id, 'Referral': np.nan, 'Date': row['Date_Disclose']})

    relationship = row['Contact_ID_Relationship']
    if not isinstance(relationship, str):
        # Missing contact information (NaN): nothing to extract.
        continue

    for contact in exp.findall(relationship): # all IDs mentioned in the contact column
        contact_key = contact.lower().replace('_', '-')

        if contact_key not in resolved:
            # First ID (in table order) that contains the contact as a substring.
            # NOTE(review): substring matching can pick a longer ID, e.g. 'x-1'
            # is contained in 'x-10' — kept as is to preserve the results.
            match = next((candidate for candidate in lowered_ids if contact_key in candidate), None)
            resolved[contact_key] = None if match is None else exp.findall(match)[0].replace('_', '-')

        match_id = resolved[contact_key]
        if match_id is None:
            failed.add((row_id, contact.lower()))
        elif row_id != match_id: # no self edges
            edge_records.append({'Referee': row_id, 'Referral': match_id, 'Date': row['Date_Disclose']})

china_nodelist = pd.DataFrame(node_records, columns=['node_id', 'age', 'gender', 'residency', 'place_event',
                                                     'possible_source', 'symptom', 'symptom_severity', 'place_admission'])
china_edgelist = pd.DataFrame(edge_records, columns=['Referee', 'Referral', 'Date'])

0it [00:00, ?it/s]

Guangdong_Shenzhen
Shandong_Rizhao - 100
Shandong_Rizhao - 101
Shandong_Wingdao - 100


In [51]:
# Preview the assembled edge list.
china_edgelist

Unnamed: 0,Referee,Referral,Date
0,anqing-1,,2020-01-27
1,anqing-10,,2020-01-29
2,anqing-10,anqing-14,2020-01-29
3,anqing-11,,2020-01-29
4,anqing-12,,2020-01-29
...,...,...,...
38388,ningbo-256,,2022-01-07
38389,ningbo-257,,2022-01-07
38390,ningbo-258,,2022-01-07
38391,chongqing-22,,2022-01-08


In [53]:
# Distinct node ids, shape of `china`, shape of the node list, and the number of
# (referee, contact) pairs whose contact could not be resolved.
len(unique_nodes), china.shape, china_nodelist.shape, len(failed)

(25877, (26961, 12), (25877, 9), 588)

In [54]:
# Order chronologically, then store each date as a compact 'yymmdd' string.
# Not re-runnable: after this cell `Date` holds strings, which have no `.strftime`.
china_edgelist = china_edgelist.sort_values(by='Date')
china_edgelist['Date'] = [stamp.strftime('%y%m%d') for stamp in china_edgelist['Date']]
china_edgelist

Unnamed: 0,Referee,Referral,Date
3267,beijing-362,,200101
3269,beijing-364,,200101
3270,beijing-365,,200101
3266,beijing-361,,200101
3268,beijing-363,,200101
...,...,...,...
20478,huaian-71,,220728
20477,huaian-70,,220728
20475,huaian-69,,220728
20474,huaian-68,,220728


In [55]:
# Persist the edge list; the DataFrame index is written too (no `index=False`).
china_edgelist.to_csv('../Data/Preprocessed/china_edgelist.csv')

In [56]:
# Persist the node list; the DataFrame index is written too (no `index=False`).
china_nodelist.to_csv('../Data/Preprocessed/china_nodelist.csv')