In [3]:
import pandas as pd
import networkx as nx
import numpy as np
from difflib import SequenceMatcher
import json
import re

# Case contact networks

In [2]:
yunnan_hainan = pd.read_stata('./Data/Yunnan & Hainan/Contact Network of Hainan and Yunnan.dta')
yunnan_hainan

Unnamed: 0,Date,No,Noingroup,Gender,Age,Arrivedate,Feverdate,Diagnosedate,Strangers,weakties,...,arrivediagtime,source,mage,mfever,mdiag,WeightedDegree,clustering,triangles,eigencentrality,Province
0,20200117,1,1,1.0,,,,,0.0,0.0,...,,,,,,,,,,Yunnan
1,20200122,2,1,1.0,53.0,20200116.0,20200116.0,20200122.0,,,...,,1.0,53.000000,,6.0,,,,,Yunnan
2,20200124,3,1,0.0,39.0,20200115.0,,20200122.0,1.0,1.0,...,,1.0,40.666668,,,,,,,Yunnan
3,20200124,4,2,0.0,34.0,20200120.0,,20200123.0,,,...,,1.0,40.666668,,,,,,,Yunnan
4,20200124,5,3,1.0,49.0,20200121.0,,20200123.0,,,...,,1.0,40.666668,,,,,,,Yunnan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,20200214,161,4,0.0,58.0,20200125.0,,20200125.0,1.0,1.0,...,0.0,1.0,49.000000,66.0,12.0,3.0,1.0,3.0,0.056348,Hainan
329,20200214,158,1,1.0,62.0,,,20200208.0,0.0,0.0,...,,0.0,49.000000,66.0,12.0,6.0,1.0,15.0,1.000000,Hainan
330,20200214,160,3,0.0,25.0,,,20200208.0,0.0,0.0,...,,0.0,49.000000,66.0,12.0,6.0,1.0,15.0,1.000000,Hainan
331,20200214,159,2,0.0,27.0,20200125.0,,20200125.0,1.0,1.0,...,0.0,1.0,49.000000,66.0,12.0,3.0,1.0,3.0,0.056348,Hainan


In [3]:
yunnan = yunnan_hainan.loc[yunnan_hainan['Province'] == 'Yunnan']
hainan = yunnan_hainan.loc[yunnan_hainan['Province'] == 'Hainan']

## Yunnan Dataset

In [4]:
yunnan.columns

Index(['Date', 'No', 'Noingroup', 'Gender', 'Age', 'Arrivedate', 'Feverdate',
       'Diagnosedate', 'Strangers', 'weakties', 'strongties', 'relatives',
       'ties1', 'ties2', 'ties3', 'ties4', 'ties5', 'ties6', 'ties7', 'ties8',
       'ties9', 'Degree', 'Weighteddegree', 'modularity_class', 'pageranks',
       'componentnumber', 'Eccentricity', 'closnesscentrality',
       'harmonicclosnesscentrality', 'betweenesscentrality', 'year0', 'month0',
       'day0', 'date1', 'year', 'month', 'day', 'feverdate1', 'year1',
       'month1', 'day1', 'arrivedate1', 'year2', 'month2', 'day2', 'diagdate',
       'fevertime', 'diagtime', 'arrivediagtime', 'source', 'mage', 'mfever',
       'mdiag', 'WeightedDegree', 'clustering', 'triangles', 'eigencentrality',
       'Province'],
      dtype='object')

In [5]:
yunnan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171 entries, 0 to 170
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Date                        171 non-null    int32         
 1   No                          171 non-null    int16         
 2   Noingroup                   171 non-null    int8          
 3   Gender                      120 non-null    float64       
 4   Age                         119 non-null    float64       
 5   Arrivedate                  90 non-null     float64       
 6   Feverdate                   31 non-null     float64       
 7   Diagnosedate                119 non-null    float64       
 8   Strangers                   60 non-null     float64       
 9   weakties                    60 non-null     float64       
 10  strongties                  60 non-null     float64       
 11  relatives                   60 non-null     float64       

In [6]:
yunnan['Date'].min(), yunnan['Date'].max()

(20200117, 20200216)

In [7]:
yunnan['Degree'].value_counts(), yunnan['Degree'].describe()

(0     114
 1      25
 12     13
 2      11
 3       6
 5       1
 4       1
 Name: Degree, dtype: int64,
 count    171.000000
 mean       1.345029
 std        3.186962
 min        0.000000
 25%        0.000000
 50%        0.000000
 75%        1.000000
 max       12.000000
 Name: Degree, dtype: float64)

In [8]:
yunnan['modularity_class'].value_counts().loc[yunnan['modularity_class'].value_counts() > 1]

125    13
86      6
131     4
115     3
129     3
109     3
80      3
13      2
103     2
113     2
107     2
110     2
78      2
0       2
72      2
127     2
73      2
120     2
Name: modularity_class, dtype: int64

In [14]:
yunnan[~yunnan['relatives'].isnull()]['relatives'].value_counts()

1.0    31
0.0    29
Name: relatives, dtype: int64

In [19]:
yunnan[~yunnan['Strangers'].isnull()][['relatives', 'Strangers', 'Degree']]

Unnamed: 0,relatives,Strangers,Degree
0,1.0,0.0,1
2,0.0,1.0,0
14,1.0,1.0,1
15,1.0,1.0,1
70,0.0,1.0,0
75,0.0,1.0,1
81,1.0,1.0,1
82,1.0,1.0,1
84,0.0,1.0,2
85,0.0,1.0,2


In [10]:
yunnan['Gender'].value_counts()

1.0    62
0.0    58
Name: Gender, dtype: int64

In [11]:
yunnan['Age'].describe()

count    119.000000
mean      41.092437
std       18.316424
min        3.000000
25%       26.000000
50%       40.000000
75%       54.000000
max       79.000000
Name: Age, dtype: float64

In [23]:
yunnan['Feverdate'].value_counts()

20200205.0    3
20200130.0    3
20200131.0    2
20200204.0    2
20200127.0    2
20200126.0    2
20200215.0    2
20200129.0    2
20200128.0    2
20200125.0    2
20200201.0    1
20200117.0    1
20200121.0    1
20200206.0    1
20200209.0    1
20200210.0    1
20200208.0    1
20200211.0    1
20200116.0    1
Name: Feverdate, dtype: int64

## Hainan Dataset

In [34]:
hainan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 171 to 332
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Date                        162 non-null    int32         
 1   No                          162 non-null    int16         
 2   Noingroup                   162 non-null    int8          
 3   Gender                      162 non-null    float64       
 4   Age                         162 non-null    float64       
 5   Arrivedate                  135 non-null    float64       
 6   Feverdate                   32 non-null     float64       
 7   Diagnosedate                147 non-null    float64       
 8   Strangers                   162 non-null    float64       
 9   weakties                    162 non-null    float64       
 10  strongties                  162 non-null    float64       
 11  relatives                   162 non-null    float64     

In [24]:
hainan['Date'].min(), hainan['Date'].max()

(20200122, 20200214)

In [25]:
hainan['Degree'].value_counts(), hainan['Degree'].describe()

(0    71
 1    27
 2    21
 3    21
 5     8
 6     8
 4     6
 Name: Degree, dtype: int64,
 count    162.000000
 mean       1.506173
 std        1.791099
 min        0.000000
 25%        0.000000
 50%        1.000000
 75%        3.000000
 max        6.000000
 Name: Degree, dtype: float64)

In [32]:
114/171, 71/162

(0.6666666666666666, 0.4382716049382716)

In [26]:
hainan['modularity_class'].value_counts().loc[hainan['modularity_class'].value_counts() > 1]

43    9
83    7
48    6
96    6
61    4
31    4
75    4
47    4
53    4
76    4
52    3
46    3
28    3
45    3
68    3
27    2
37    2
38    2
51    2
58    2
65    2
24    2
72    2
63    2
3     2
69    2
8     2
Name: modularity_class, dtype: int64

In [27]:
hainan[~hainan['relatives'].isnull()]['relatives'].value_counts()

0.0    87
1.0    75
Name: relatives, dtype: int64

In [28]:
hainan['Gender'].value_counts()

0.0    84
1.0    78
Name: Gender, dtype: int64

In [29]:
hainan['Age'].describe()

count    162.000000
mean      48.442901
std       17.268708
min        0.250000
25%       36.000000
50%       51.000000
75%       62.000000
max       79.000000
Name: Age, dtype: float64

In [30]:
hainan['Feverdate'].value_counts()

20200130.0    5
20200129.0    3
20200126.0    3
20200119.0    2
20200118.0    2
20200121.0    2
20200203.0    2
20200206.0    1
20200125.0    1
20200202.0    1
20200207.0    1
20200205.0    1
20200117.0    1
20200131.0    1
20200116.0    1
20200113.0    1
20200124.0    1
20200123.0    1
20200122.0    1
20200201.0    1
Name: Feverdate, dtype: int64

## Shanxi Dataset

In [33]:
shanxi = pd.read_stata('./Data/Shanxi/shannxi.dta')
shanxi

Unnamed: 0,Date,No,Noingroup,Gender,Age,Hukou,Source,Arrivedate,Feverdate,Diagnosedate,...,year2,month2,day2,diagdate,feverdate1,fevertime,diagtime,arrivediagtime,mfever,mdiag
0,20200123,1,1,1,42,xianyang,hubei,20200115.0,20200120.0,20200120.0,...,2020.0,1.0,20.0,2020-01-20,2020-01-20,5.0,0.0,5.0,3.333333,1.666667
1,20200123,2,2,0,32,xian,hangzhou,20200114.0,20200116.0,20200120.0,...,2020.0,1.0,20.0,2020-01-20,2020-01-16,2.0,4.0,6.0,3.333333,1.666667
2,20200123,3,3,1,22,xian,hubei,20200117.0,20200120.0,20200121.0,...,2020.0,1.0,21.0,2020-01-21,2020-01-20,3.0,1.0,4.0,3.333333,1.666667
3,20200124,4,1,1,49,ankang,hubei,20200119.0,20200119.0,20200121.0,...,2020.0,1.0,21.0,2020-01-21,2020-01-19,0.0,2.0,2.0,0.000000,1.000000
4,20200124,5,2,1,23,yanan,hubei,20200122.0,20200122.0,20200122.0,...,2020.0,1.0,22.0,2020-01-22,2020-01-22,0.0,0.0,0.0,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,20200215,233,2,0,67,weinan,weinan,,20200203.0,20200203.0,...,2020.0,2.0,3.0,2020-02-03,2020-02-03,,0.0,,,1.500000
233,20200216,234,1,1,46,hanzhong,ningbo,20200118.0,20200126.0,20200126.0,...,2020.0,1.0,26.0,2020-01-26,2020-01-26,8.0,0.0,8.0,8.000000,2.000000
234,20200216,235,2,1,71,hanzhong,hanzhong,,20200131.0,20200204.0,...,2020.0,2.0,4.0,2020-02-04,2020-01-31,,4.0,,8.000000,2.000000
235,20200216,236,3,0,66,hanzhong,hanzhong,,,20200214.0,...,2020.0,2.0,14.0,2020-02-14,NaT,,,,8.000000,2.000000


In [34]:
shanxi.columns

Index(['Date', 'No', 'Noingroup', 'Gender', 'Age', 'Hukou', 'Source',
       'Arrivedate', 'Feverdate', 'Diagnosedate', 'Strangers', 'weakties',
       'strongties', 'relatives', 'ties1', 'ties2', 'ties3', 'ties4', 'age2',
       'source', 'mage', 'tstrangers', 'tweakties', 'tstrongties',
       'trelatives', 'Label', 'timeset', 'Degree', 'WeightedDegree',
       'Eccentricity', 'closnesscentrality', 'harmonicclosnesscentrality',
       'betweenesscentrality', 'Authority', 'Hub', 'modularity_class',
       'pageranks', 'componentnumber', 'clustering', 'triangles',
       'eigencentrality', '_merge', 'year0', 'month0', 'day0', 'date1', 'year',
       'month', 'day', 'year1', 'month1', 'day1', 'arrivedate1', 'year2',
       'month2', 'day2', 'diagdate', 'feverdate1', 'fevertime', 'diagtime',
       'arrivediagtime', 'mfever', 'mdiag'],
      dtype='object')

In [29]:
shanxi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237 entries, 0 to 236
Data columns (total 63 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Date                        237 non-null    int32         
 1   No                          237 non-null    int16         
 2   Noingroup                   237 non-null    int8          
 3   Gender                      237 non-null    int8          
 4   Age                         237 non-null    int8          
 5   Hukou                       237 non-null    object        
 6   Source                      237 non-null    object        
 7   Arrivedate                  112 non-null    float64       
 8   Feverdate                   211 non-null    float64       
 9   Diagnosedate                236 non-null    float64       
 10  Strangers                   237 non-null    int8          
 11  weakties                    237 non-null    int8          

In [35]:
shanxi['Date'].min(), shanxi['Date'].max()

(20200123, 20200216)

In [36]:
shanxi['Degree'].value_counts(), shanxi['Degree'].describe()

(0     108
 1      68
 2      36
 3      20
 4       2
 6       1
 9       1
 11      1
 Name: Degree, dtype: int64,
 count    237.000000
 mean       0.987342
 std        1.351336
 min        0.000000
 25%        0.000000
 50%        1.000000
 75%        2.000000
 max       11.000000
 Name: Degree, dtype: float64)

In [42]:
108/237

0.45569620253164556

In [37]:
shanxi['modularity_class'].value_counts().loc[shanxi['modularity_class'].value_counts() > 1]

19     12
7       7
38      6
109     5
37      5
12      4
91      4
103     4
96      4
124     4
77      4
147     4
13      3
129     3
97      3
102     3
49      3
46      3
72      3
50      3
101     3
114     3
116     2
132     2
136     2
111     2
89      2
44      2
108     2
48      2
53      2
131     2
128     2
123     2
144     2
143     2
110     2
138     2
134     2
70      2
Name: modularity_class, dtype: int64

In [38]:
shanxi[~shanxi['relatives'].isnull()]['relatives'].value_counts()

0    150
1     87
Name: relatives, dtype: int64

In [39]:
shanxi['Gender'].value_counts()

1    129
0    108
Name: Gender, dtype: int64

In [44]:
108/(129+108)

0.45569620253164556

In [40]:
shanxi['Age'].describe()

count    237.000000
mean      45.898734
std       16.581025
min        3.000000
25%       35.000000
50%       45.000000
75%       59.000000
max       89.000000
Name: Age, dtype: float64

In [41]:
shanxi['Hukou'].value_counts()

xian            97
ankang          25
hanzhong        23
wuhan           21
baoji           14
xianyang        13
weinan          11
shangluo         6
yanan            5
tongchuan        4
yulin            3
suizhou          2
tianmen          1
yingcheng        1
pingdingshan     1
yichang          1
nanjing          1
xiaogan          1
lingbao          1
shanghai         1
lantian          1
hancheng         1
henan            1
dazhi            1
yanglin          1
Name: Hukou, dtype: int64

## Big China Dataset

In [51]:
big_dataset = pd.read_csv('./Data/dataset_EN.csv', encoding_errors='replace')

In [4]:
big_dataset

Unnamed: 0,ID,Virus type,Age,Gender,Occupation,Place of Residency,Place of Departure,Place of Transit,Place of Destination,Arrival Date,...,Date_Symptom_Onset,Date_Hospitalisation,Place_Hospitalisation,Date_Confirmation,Date_Disclose,Symptom,Symptom_Severity,Original_Text_CN,Original_Text_EN,Unnamed: 29
0,Anhui_Anqing-1,,49,Male,,,Hubei_Wuhan,,Anhui_Anqing,2020/1/19,...,,,,2020/1/26,2020/1/27,,,����1����/49��/��ס������/1��19����������ٳ�վ�˳�...,"Case 1: Male, 49 years old, currently living i...",
1,Anhui_Anqing-10,,54,Male,Architect,,InnerMongolia_Wuhai,,Anhui_Anqing,2020/1/9,...,2020/1/21,2020/1/25,Susong County People's Hospital-Municipal Hosp...,2020/1/28,2020/1/29,Somatosensory Related-Sign Description,Stable,����10����/54��/������/�ں���ʡ���½�����ҵ������1...,"Case 10: Male, 54 years old, Susong, working i...",
2,Anhui_Anqing-11,,50,Female,,Hubei_Wuhan,Hubei_Wuhan,,Anhui_Anqing,2020/1/17,...,2020/1/18,2020/1/25,Susong County People's Hospital-Municipal Hosp...,2020/1/28,2020/1/29,Respiratory System Related-Symptom Description,Stable,����11��Ů/50��/������/�������人�~������ס��1��17...,"Case 11: female, 50 years old, Susong, living ...",
3,Anhui_Anqing-12,,46,Male,Worker,,Hubei_Huangshi,,Anhui_Anqing,2020/1/21,...,2020/1/21,2020/1/22,Susong County People's Hospital-Municipal Hosp...,2020/1/28,2020/1/29,Somatosensory Related-Sign Description,Stable,"����12����/46��/������/�๤,2019��11�µ�����ʡ��ʯ�...","Case 12: Male, 46 years old, Susong, mason, wo...",
4,Anhui_Anqing-13,,58,Male,Worker,Hubei_Wuhan,Hubei_Wuhan,,Anhui_Anqing,2020/1/17,...,2020/1/21,2020/1/27,Susong County People's Hospital-Municipal Hosp...,2020/1/28,2020/1/29,,Stable,����13����/58��/������/�������人�л��������¸ֲļӹ�...,"Case 13: Male, 58 years old, from Susong, enga...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27199,Zhejiang_Ningbo-256,,,Female,,Zhejiang-Ningbo,,,,,...,,,Designated hospital,,1/7/2022,,,����31��Фĳĳ��Ů��ȷ�ﲡ����2022��1��1-2�գ���Ҫ��켣�...,"Case 31: Xiao Moumou, female, confirmed case. ...",
27200,Zhejiang_Ningbo-257,,,Female,,Zhejiang-Ningbo,,,,,...,,,Designated hospital,,1/7/2022,,,����32����ĳĳ��Ů��ȷ�ﲡ����2022��1��1-2�գ���Ҫ��켣...,"Case 32: Song Moumou, female, confirmed case. ...",
27201,Zhejiang_Ningbo-258,,,Female,,Zhejiang-Ningbo,,,,,...,,,Designated hospital,,1/7/2022,,,����33����ĳĳ��Ů��ȷ�ﲡ����2022��1��1-3�գ���Ҫ��켣...,"Case 33: Fu Moumou, female, confirmed case. Fr...",
27202,Chongqing-22,,,,,,Singapore,,Chongqing,,...,,,,1/7/2022,1/8/2022,,,1��7��0��24ʱ��������������������֢״��Ⱦ��1�����¼...,"From 0 to 24:00 on January 7, 1 new case of as...",


In [47]:
big_dataset.columns

Index(['ID', 'Virus type', 'Age', 'Gender', 'Occupation', 'Place of Residency',
       'Place of Departure', 'Place of Transit', 'Place of Destination',
       'Arrival Date', 'Earliest Possible Date', 'Latest Possible Date',
       'Place and Event', 'Venue', 'With Whom', 'Contact_ID_Relationship',
       'Place_Admission', 'Method_Discovery', 'Date_Quarantine',
       'Place_Quarantine', 'Date_Symptom_Onset', 'Date_Hospitalisation',
       'Place_Hospitalisation', 'Date_Confirmation', 'Date_Disclose',
       'Symptom', 'Symptom_Severity', 'Original_Text_CN', 'Original_Text_EN',
       'Unnamed: 29'],
      dtype='object')

### Filter for entries with contact information (actually, this is stupid)

In [52]:
#cases_with_contact_info = big_dataset[~big_dataset['Contact_ID_Relationship'].isnull()]
#cases_with_contact_info.describe()
cases_with_contact_info = big_dataset

In [6]:
cases_with_contact_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5716 entries, 1 to 27016
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       5716 non-null   object
 1   Virus type               230 non-null    object
 2   Age                      5374 non-null   object
 3   Gender                   5600 non-null   object
 4   Occupation               969 non-null    object
 5   Place of Residency       4370 non-null   object
 6   Place of Departure       2058 non-null   object
 7   Place of Transit         769 non-null    object
 8   Place of Destination     2178 non-null   object
 9   Arrival Date             2219 non-null   object
 10  Earliest Possible Date   1749 non-null   object
 11  Latest Possible Date     2538 non-null   object
 12  Place and Event          2572 non-null   object
 13  Venue                    2802 non-null   object
 14  With Whom                3909 non-null 

In [7]:
cases_with_contact_info[['Contact_ID_Relationship', 'Virus type', 'Occupation', 'Place of Residency', 'Place and Event', 'Venue', 'With Whom', 'Symptom', 'Symptom_Severity']]

Unnamed: 0,Contact_ID_Relationship,Virus type,Occupation,Place of Residency,Place and Event,Venue,With Whom,Symptom,Symptom_Severity
1,anqing-14 (wife) anqing-10 (husband),,Architect,,Hainan Work,,,Somatosensory Related-Sign Description,Stable
5,anqing-14 (wife) anqing-10 (husband),,,,,,,Respiratory System Related-Symptom Description,Stable
11,anqing-2 (father) & anqing-9 (son),,,,Wuhan Travel,,,,
13,anqing-21 (son) & anqing-22 (mother) & anqing-...,,Seller,Hubei_Wuhan,Work In Wuhan,,,Somatosensory Related-Sign Description,Stable
14,anqing-21 (son) & anqing-22 (mother) & anqing-...,,,,,Family,Family member,Somatosensory Related-Symptom Description/Soma...,Stable
...,...,...,...,...,...,...,...,...,...
27012,Tianjin-833 (confirmed cases) & Tianjin-854 (c...,,ѧ��,,,,confirmed cases,,
27013,Tianjin-833 (confirmed cases) & Tianjin-855 (c...,,ѧ��,,,,confirmed cases,,
27014,Tianjin-833 (confirmed cases) & Tianjin-856 (c...,,ѧ��,,,,confirmed cases,,
27015,Tianjin-833 (confirmed cases) & Tianjin-857 (c...,,ѧ��,,,,confirmed cases,,


In [54]:
cases_with_contact_info['Date_Disclose'] = pd.to_datetime(cases_with_contact_info['Date_Disclose'], infer_datetime_format=True, errors='coerce')
print(cases_with_contact_info.shape[0])
cases_with_contact_info = cases_with_contact_info[~cases_with_contact_info['Date_Disclose'].isnull()]
print(cases_with_contact_info.shape[0])

27204
27198


In [55]:
cases_with_contact_info.drop(cases_with_contact_info['Date_Disclose'][cases_with_contact_info['Date_Disclose'] > pd.to_datetime('2022/12/01')].index, inplace=True)
print(cases_with_contact_info.shape[0])

26961


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cases_with_contact_info.drop(cases_with_contact_info['Date_Disclose'][cases_with_contact_info['Date_Disclose'] > pd.to_datetime('2022/12/01')].index, inplace=True)


In [56]:
cases_with_contact_info['Date_Disclose'].max(), cases_with_contact_info['Date_Disclose'].min()

(Timestamp('2022-08-14 00:00:00'), Timestamp('2020-01-01 00:00:00'))

In [57]:
pd.to_numeric(cases_with_contact_info['Age'], errors='coerce').dropna().describe()

count    20046.000000
mean        41.745560
std         18.026487
min          0.000000
25%         30.000000
50%         41.000000
75%         54.000000
max        644.000000
Name: Age, dtype: float64

In [58]:
cases_with_contact_info['Gender'].value_counts()

Male       9581
Female     9108
male       1515
Man         277
Female       59
female       46
Male         33
NA           17
Man           6
 Female       5
��            4
Ů             1
56            1
 Male         1
 male         1
student       1
Name: Gender, dtype: int64

In [59]:
cases_with_contact_info['Occupation'].value_counts()

Student                  970
Worker                   548
NA                       471
Employee                 326
Unemployed               126
                        ... 
Fenggang Office Staff      1
Clinic To Work             1
Kindergarten Teacher       1
welders                    1
Barber                     1
Name: Occupation, Length: 361, dtype: int64

In [60]:
cases_with_contact_info['Place of Residency'].value_counts()

Shaanxi_Xi'an               1984
Hubei_Wuhan                 1715
Hebei_Shijiazhuang           888
Inner Mongolia_Hulunbuir     547
Henan_Xuchang                367
                            ... 
Hubei_Chibi                    1
Fiji                           1
Inner Mongolia-Ordos           1
Hubei_Zaoyang                  1
Shenzhen_Futian                1
Name: Place of Residency, Length: 812, dtype: int64

In [61]:
cases_with_contact_info['Virus type'].value_counts()

Delta      462
Omicron    139
Name: Virus type, dtype: int64

In [62]:
cases_with_contact_info['Place and Event'].value_counts()

NA                                                                       754
Wuhan Travel                                                             635
Dinner                                                                   359
Wuhan Residence                                                          337
Work In Wuhan                                                            266
                                                                        ... 
Zhengzhou play/school                                                      1
Zhengzhou shopping/dining/seeing a doctor/refueling/exercise/tutoring      1
Zhengzhou play/school/dining/seeing                                        1
Zhengzhou on duty                                                          1
shopping/travel/dining                                                     1
Name: Place and Event, Length: 1958, dtype: int64

In [63]:
cases_with_contact_info['Venue'].value_counts()

NA                                                   940
Family                                               938
Outdoor                                              791
Social place                                         503
Indoor                                               382
                                                    ... 
restaurant/factory                                     1
Hotel/chess room                                       1
Sanitary / Bath Center / Supermarket / Restaurant      1
Public places/shopping malls/hospitals                 1
market                                                 1
Name: Venue, Length: 110, dtype: int64

In [64]:
cases_with_contact_info['With Whom'].value_counts()

Confirmed case                                                                                         2266
Family member                                                                                          2060
confirmed cases                                                                                         780
NA                                                                                                      297
Confirmed cases                                                                                         172
Wuhan personnel                                                                                         107
family                                                                                                   95
Colleague                                                                                                94
Returnee from Wuhan                                                                                      78
Family                      

In [10]:
cases_with_contact_info['Symptom'].value_counts()

Somatosensory Related-Sign Description                                                                                                                                          447
Respiratory System Related-Symptom Description/Somatosensory Related-Sign Description                                                                                           144
NA                                                                                                                                                                              141
Respiratory System Related-Symptom Description                                                                                                                                  122
Somatosensory Related-Symptom Description/Somatosensory Related-Sign Description                                                                                                 76
                                                                                                    

### Build network

In [6]:
cases_with_contact_info[['Date_Disclose', 'Contact_ID_Relationship', 'ID']]

Unnamed: 0,Date_Disclose,Contact_ID_Relationship,ID
1,2020-01-29,anqing-14 (wife) anqing-10 (husband),Anhui_Anqing-10
5,2020-01-30,anqing-14 (wife) anqing-10 (husband),Anhui_Anqing-14
11,2020-01-27,anqing-2 (father) & anqing-9 (son),Anhui_Anqing-2
13,2020-02-01,anqing-21 (son) & anqing-22 (mother) & anqing-...,Anhui_Anqing-21
14,2020-02-01,anqing-21 (son) & anqing-22 (mother) & anqing-...,Anhui_Anqing-22
...,...,...,...
27012,2022-01-10,Tianjin-833 (confirmed cases) & Tianjin-854 (c...,Tianjin-854
27013,2022-01-10,Tianjin-833 (confirmed cases) & Tianjin-855 (c...,Tianjin-855
27014,2022-01-10,Tianjin-833 (confirmed cases) & Tianjin-856 (c...,Tianjin-856
27015,2022-01-10,Tianjin-833 (confirmed cases) & Tianjin-857 (c...,Tianjin-857


In [7]:
import re

In [8]:
s = 'anqing-14 (wife) anqing-10 (husband)'
exp = re.compile('\w+-\d+', re.IGNORECASE)
exp.findall(s)

['anqing-14', 'anqing-10']

1. Replace whitespace with _
2. Split on _
3. Extract contact with regex: search for either phrase in contact_id_relationship, e.g. search for anhui and anqing
4. Do the same for id
5. match

In [9]:
places = set()
for place in cases_with_contact_info['Place_Admission'].unique():
    try:
        string = place.replace(' ', '_')
    except AttributeError:
        continue

    string = re.sub('_+', '_', string)
    string = re.sub('^_', '', string)
    string = re.sub('_$', '', string)
    string = re.sub(',', '', string)
    try:
        a, b = string.split('_')
        places.add(a)
        places.add(b)
    except ValueError:
        places.add(string)

In [44]:
rexp = re.compile('(?:\w+\_)?(\w+[-|_]\d+)')
rexp.findall('Anhui_Hefei-175'), rexp.findall('Beijing_432'), rexp.findall('Guangdong_Shenzhen_670')

(['Hefei-175'], ['Beijing_432'], ['Shenzhen_670'])

In [41]:
rexp.findall('henan_xuchang-303')

['xuchang-303']

In [66]:
exp = re.compile('(?:\w+\_)?(\w+[-|_]\d+)', re.IGNORECASE)
edge_list = list()
node_list = set()
failed = set()
for index, row in cases_with_contact_info.iterrows():
    try:
        row_id = exp.findall(row['ID'])[0].lower().replace('_', '-')
    except IndexError:
        print(row['ID'])
        continue
    try:
        contacts = exp.findall(row['Contact_ID_Relationship'])
    except TypeError:
        node_list.add(row_id)
    for contact in contacts:
        try:
            match = next(s for s in cases_with_contact_info['ID'].apply(lambda x: x.lower()) if contact.lower().replace('_', '-') in s)
            if row_id != match:
                edge_list.append((row_id, match))
        except StopIteration:
            failed.add((row_id, contact.lower()))

Guangdong_Shenzhen
Shandong_Rizhao - 100
Shandong_Rizhao - 101
Shandong_Wingdao - 100


In [67]:
failed

{('an-68', 'xian-67'),
 ('an-88', 'xian-117'),
 ('huaian-74', 'huaian-8'),
 ('beijing-585', 'zhuhai-126'),
 ('harbin-35', 'haerbin-50'),
 ('liangjiangnewdistrict-5', 'liangjiangxinqu-15'),
 ('yancheng-6', 'yangcheng-5'),
 ('an-87', 'xian-87'),
 ('an-405', 'xian-42'),
 ('garz-76', 'ganzi-8'),
 ('an-83', 'xian-90'),
 ('jiangsu-26', 'yangcheng-26'),
 ('shanghai-1666', 'zhuhai-126'),
 ('beijing-557', 'beijing_548'),
 ('beijing-564', 'beijing_548'),
 ('beijing-554', 'beijing_554'),
 ('wuhu-12', 'toling-25'),
 ('an-352', 'xian-74'),
 ('beijing-526', 'beijing_513'),
 ('beijing-504', 'beijing_500'),
 ('xuzhoujiangsu-58', 'xuzhou-5'),
 ('xiamen-533', 'zhuhai-126'),
 ('shanghai-1580', 'zhuhai-126'),
 ('an-878', 'xian-117'),
 ('shanghai-1600', 'zhuhai-126'),
 ('liangjiangnewdistrict-15', 'liangjiangxinqu-15'),
 ('shanghai-1658', 'zhuhai-126'),
 ('shanghai-1794', 'zhuhai-126'),
 ('xuzhoujiangsu-63', 'xuzhou-1'),
 ('harbin-47', 'haerbin-49'),
 ('xuzhoujiangsu-19', 'xuzhou-5'),
 ('liangjiangnewdistr

In [50]:
cases_with_contact_info['ID'].loc[cases_with_contact_info['ID'].str.contains('Beijing')].to_list()

['Beijing_432',
 'Beijing_433',
 'Beijing_448',
 'Beijing_449',
 'Beijing_450',
 'Beijing_451',
 'Beijing_452',
 'Beijing_453',
 'Beijing_454',
 'Beijing_455',
 'Beijing_456',
 'Beijing_457',
 'Beijing_458',
 'Beijing_461',
 'Beijing_462',
 'Beijing_463',
 'Beijing_492',
 'Beijing_493',
 'Beijing_494',
 'Beijing_495',
 'Beijing_496',
 'Beijing_497',
 'Beijing_498',
 'Beijing_500',
 'Beijing_501',
 'Beijing_502',
 'Beijing_513',
 'Beijing_514',
 'Beijing_515',
 'Beijing_535',
 'Beijing_536',
 'Beijing_537',
 'Beijing_538',
 'Beijing_539',
 'Beijing_550',
 'Beijing_551',
 'Beijing_552',
 'Beijing_553',
 'Beijing_554',
 'Beijing_557',
 'Beijing-11',
 'Beijing-12',
 'Beijing-13',
 'Beijing-14',
 'Beijing-144',
 'Beijing-146',
 'Beijing-17',
 'Beijing-173',
 'Beijing-174',
 'Beijing-175',
 'Beijing-18',
 'Beijing-19',
 'Beijing-199',
 'Beijing-200',
 'Beijing-201',
 'Beijing-203',
 'Beijing-207',
 'Beijing-208',
 'Beijing-211',
 'Beijing-212',
 'Beijing-213',
 'Beijing-214',
 'Beijing-215',

In [68]:
network = nx.Graph()
network.add_edges_from(edge_list)
network.add_nodes_from(node_list)

In [69]:
len(network.nodes)

29529

In [84]:

with open('network.json', mode='w') as of:
    json.dump(nx.node_link_data(network), of)

In [4]:
with open('network.json', mode='r') as f:
    network = nx.node_link_graph(json.load(f))

In [5]:
array = np.array(nx.degree_histogram(network))
np.argsort(array)

array([660, 810, 809, ...,   3,   1,   2], dtype=int64)

In [71]:
np.sort(array)

array([    0,     0,     0, ...,  3194,  6179, 12818])

In [9]:
np.argsort(array) * np.sort(array)

array([    0,     0,     0, ...,  9582,  6179, 25636], dtype=int64)

In [81]:
from collections import OrderedDict

In [77]:
d = dict()
for i, degree in enumerate(np.argsort(array)):
    count = np.sort(array)[i]
    if count > 0:
        d[degree] = count

In [83]:
sum(d.values())

29529

In [82]:
OrderedDict(sorted(d.items()))

OrderedDict([(0, 2012),
             (1, 6179),
             (2, 12818),
             (3, 3194),
             (4, 1619),
             (5, 966),
             (6, 504),
             (7, 350),
             (8, 446),
             (9, 194),
             (10, 152),
             (11, 131),
             (12, 150),
             (13, 70),
             (14, 74),
             (15, 64),
             (16, 64),
             (17, 51),
             (18, 20),
             (19, 24),
             (20, 23),
             (21, 11),
             (22, 35),
             (23, 33),
             (24, 20),
             (25, 32),
             (26, 9),
             (27, 14),
             (28, 15),
             (29, 8),
             (30, 4),
             (31, 13),
             (32, 3),
             (33, 11),
             (34, 7),
             (35, 7),
             (37, 9),
             (38, 6),
             (39, 4),
             (40, 2),
             (41, 6),
             (42, 4),
             (43, 2),
             (4

In [72]:
connected_components = nx.connected_components(network)
connected_components = np.array(list(connected_components))

In [73]:
len(connected_components)

3573

In [74]:
from collections import Counter
Counter(np.array([len(x) for x in connected_components]) > 3)

Counter({True: 1333, False: 2240})

# Covid cases without contact information

In [None]:
covid_covariates = pd.read_csv('./Data/latestdata.csv')

  covid_covariates = pd.read_csv('./Data/latestdata.csv')


In [None]:
covid_covariates.columns

Index(['ID', 'age', 'sex', 'city', 'province', 'country', 'latitude',
       'longitude', 'geo_resolution', 'date_onset_symptoms',
       'date_admission_hospital', 'date_confirmation', 'symptoms',
       'lives_in_Wuhan', 'travel_history_dates', 'travel_history_location',
       'reported_market_exposure', 'additional_information',
       'chronic_disease_binary', 'chronic_disease', 'source',
       'sequence_available', 'outcome', 'date_death_or_discharge',
       'notes_for_discussion', 'location', 'admin3', 'admin2', 'admin1',
       'country_new', 'admin_id', 'data_moderator_initials',
       'travel_history_binary'],
      dtype='object')

In [None]:
covid_covariates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676311 entries, 0 to 2676310
Data columns (total 33 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ID                        object 
 1   age                       object 
 2   sex                       object 
 3   city                      object 
 4   province                  object 
 5   country                   object 
 6   latitude                  float64
 7   longitude                 float64
 8   geo_resolution            object 
 9   date_onset_symptoms       object 
 10  date_admission_hospital   object 
 11  date_confirmation         object 
 12  symptoms                  object 
 13  lives_in_Wuhan            object 
 14  travel_history_dates      object 
 15  travel_history_location   object 
 16  reported_market_exposure  object 
 17  additional_information    object 
 18  chronic_disease_binary    bool   
 19  chronic_disease           object 
 20  source                  

In [None]:
covid_covariates[['ID', 'age', 'sex', 'city', 'province', 'country', 'latitude', 'longitude', 'symptoms', 'chronic_disease_binary', 'chronic_disease', 'date_confirmation', 'additional_information']]

Unnamed: 0,ID,age,sex,city,province,country,latitude,longitude,symptoms,chronic_disease_binary,chronic_disease,date_confirmation,additional_information
0,000-1-1,,male,Shek Lei,Hong Kong,China,22.365019,114.133808,,False,,14.02.2020,Case 55; mainland China travel via the Lok Ma ...
1,000-1-10,78,male,Vo Euganeo,Veneto,Italy,45.297748,11.658382,,False,,21.02.2020,Hospitalized on 12.02.2020 for other reasons
2,000-1-100,61,female,,,Singapore,1.353460,103.815100,,False,,14.02.2020,"Case 65; family member of Case 50, a DBS emplo..."
3,000-1-1000,,,Zhengzhou City,Henan,China,34.629310,113.468000,,False,,26.01.2020,
4,000-1-10000,,,Pingxiang City,Jiangxi,China,27.513560,113.902900,,False,,14.02.2020,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2676306,010-99995,52.0,female,Calleria,Coronel Portillo,Peru,-8.378190,-74.539700,,False,,17.05.2020,
2676307,010-99996,52.0,female,Pueblo Libre,Lima,Peru,-12.076530,-77.067350,,False,,17.05.2020,
2676308,010-99997,52.0,female,Comas,Lima,Peru,-11.932980,-77.040850,,False,,17.05.2020,
2676309,010-99998,52.0,male,Callao,Callao,Peru,-12.000740,-77.118240,,False,,17.05.2020,


### Might be possible to identify contact information from ```additional_information``` column; would probably need some kind of text mining approach

In [None]:
tmp = covid_covariates[~covid_covariates['additional_information'].isnull()]

In [None]:
with pd.option_context('display.max_colwidth', None):
    print(tmp[tmp['additional_information'].str.contains('Case')]['additional_information'])

0                                           Case 55; mainland China travel via the Lok Ma Chau border crossing
2                                             Case 65; family member of Case 50, a DBS employee, as is Case 55
113                                                        Case 66; linked to the Grace Assembly of God church
224                              Case 67; linked to the Grace Assembly of God church; family member of Case 61
335       Case 68; Singapore Citizen with no recent travel history to China. She is a family member of Case 66
                                                          ...                                                 
650422                                                                     Close Contact of New Brunswick Case
658592                                                                                 British Columbia Case 1
658593                                                                                          Spouse of Case
6