In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib as plt
from pandas_profiling import ProfileReport
from pivottablejs import pivot_ui

In [2]:
# Folder the input CSV files are read from (used as a string prefix).
data_path = 'data/'
# Folder the pickled results are written to (used as a string prefix).
saved_path = 'saved_files/'

## Patient as a node

In [3]:
# Load the patient-level table from PatientInfo.csv under data_path.
patient_korea = pd.read_csv(f'{data_path}PatientInfo.csv')

In [42]:
# NOTE: the recorded output of this cell comes from a late re-run (execution
# count 42): it already shows the derived columns and the timestamp ordering
# that the preprocessing cells further down create. On a fresh top-to-bottom
# run this cell shows the unprocessed CSV rows instead.
patient_korea.head(n=5)

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,...,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state,actual_age,deceased,is_male,timestamp
783,1400000001,1.0,female,1985.0,30s,China,Incheon,Incheon,,overseas inflow,...,,1/19/20,1/20/20,2/6/20,,released,35.0,False,False,0 days
0,1000000001,2.0,male,1964.0,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,...,75.0,1/22/20,1/23/20,2/5/20,,released,56.0,False,True,3 days
988,2000000001,3.0,male,1966.0,50s,Korea,Gyeonggi-do,Goyang-si,,overseas inflow,...,16.0,,1/26/20,2/12/20,,released,54.0,False,True,6 days
989,2000000002,4.0,male,1964.0,50s,Korea,Gyeonggi-do,Pyeongtaek-si,,overseas inflow,...,95.0,,1/27/20,2/9/20,,released,56.0,False,True,7 days
1,1000000002,5.0,male,1987.0,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,...,31.0,,1/30/20,3/2/20,,released,33.0,False,True,10 days


Manually deleted erroneous `infected_by` values that cause cycles.

In [5]:
#erroneous patient ID codes
# Each pair is (value as found in the data, value to store instead);
# np.nan blanks the value out.
# NOTE(review): replace() is applied to the whole frame, so a matching value
# in ANY column is rewritten, not only in `infected_by` — confirm this is intended.
id_fixes = [
    (2002000001, 2000000001),
    (2017000005, 2000000005),
    (6016000007, np.nan),
    (6100000384, 6100000038),
]
for wrong_id, fixed_id in id_fixes:
    patient_korea.replace(wrong_id, fixed_id, inplace=True)

In [8]:
#Preprocessing
#Typos in city
# NOTE(review): these replace() calls scan every column of the frame, not only
# `city`; a matching value in any other column (e.g. 'etc') is rewritten as
# well — confirm that is intended.
patient_korea.replace('pocheon-si', 'Pocheon-si', inplace=True)
patient_korea.replace('Chunchun-si', 'Chuncheon-si', inplace=True)
patient_korea.replace('etc', np.nan, inplace=True)
patient_korea.replace('Wuhan', np.nan, inplace=True)

#For those who have birth year
# actual_age = reference year minus birth year (vectorised, no per-row apply).
current_year=2020
has_birth_year=patient_korea['birth_year'].notna()
patient_korea.loc[has_birth_year,'actual_age']=current_year-patient_korea.loc[has_birth_year,'birth_year']

#For those who don't have birth year
def _age_label_to_years(label):
    """Turn an `age` label such as '30s' into the integer 30.

    The last character of the label is dropped and the rest parsed as int.
    An empty string (used below as the stand-in for a missing label) gives NaN.
    """
    # Compare by value: `is not ''` tests object identity, which only works by
    # accident of string interning and emits a SyntaxWarning on Python >= 3.8.
    return int(label[:-1]) if label != '' else np.nan

patient_korea.loc[~has_birth_year,'actual_age']=patient_korea.loc[~has_birth_year,'age'].fillna('').apply(_age_label_to_years)

#deceased or not
patient_korea['deceased']=patient_korea['state']=='deceased'

#Missing fine grained location
# Rows without a city fall back to their province name.
city_missing=patient_korea['city'].isna()
patient_korea.loc[city_missing, 'city']=patient_korea.loc[city_missing, 'province']

#is man?
# Only rows with a known sex get a True/False flag; the others stay missing.
has_sex=patient_korea['sex'].notna()
patient_korea.loc[has_sex,'is_male']=patient_korea.loc[has_sex,'sex']=='male'


In [9]:
#timestamp: day0 to confirmed date
# Parse the confirmation dates once; day0 is the earliest one, and each
# patient's timestamp is the elapsed time since that day.
confirmed_on = pd.to_datetime(patient_korea['confirmed_date'])
day0 = confirmed_on.min()
patient_korea['timestamp'] = confirmed_on - day0
# Order the patients chronologically (in place).
patient_korea.sort_values(by='timestamp', inplace=True)


In [10]:
#delete duplicates
# Keep only the first row seen for each patient_id.
patient_korea = patient_korea.drop_duplicates(subset='patient_id', keep='first')

In [11]:
#fill missing ages
#no_age_but_birth_year=(patient_korea['age'].isna())&~(patient_korea['birth_year'].isna())
#patient_korea.loc[no_age_but_birth_year, 'age']=patient_korea.loc[no_age_but_birth_year, 'birth_year'].apply(lambda x: 2020-x)

In [12]:
# Columns carried forward as per-patient node features.
features_of_interest = [
    'patient_id',
    'sex',
    'is_male',
    'actual_age',
    'province',
    'city',
    'deceased',
    'timestamp',
    'disease',
]
node_feature = patient_korea.loc[:, features_of_interest]

In [13]:
#show missing values
# Number of missing entries in each node-feature column.
node_feature.isnull().sum()

patient_id       0
sex             94
is_male         94
actual_age     105
province         0
city             0
deceased         0
timestamp        0
disease       3109
dtype: int64

In [14]:
#Link regional data
# Attach the Region.csv columns to each patient by (province, city); patients
# without a matching region row keep NaN in those columns (left join). The
# region `code` column is dropped afterwards.
region = pd.read_csv(data_path + 'Region.csv')
node_feature = (
    node_feature
    .merge(region, how='left', on=['province', 'city'])
    .drop(columns=['code'])
)

In [15]:
#Show missing values
# Number of missing entries per column after the region merge.
node_feature.isnull().sum()

patient_id                     0
sex                           94
is_male                       94
actual_age                   105
province                       0
city                           0
deceased                       0
timestamp                      0
disease                     3109
latitude                       1
longitude                      1
elementary_school_count        1
kindergarten_count             1
university_count               1
academy_ratio                  1
elderly_population_ratio       1
elderly_alone_ratio            1
nursing_home_count             1
dtype: int64

In [16]:
# Preview the first rows of the merged node-feature table.
node_feature.head(n=5)

Unnamed: 0,patient_id,sex,is_male,actual_age,province,city,deceased,timestamp,disease,latitude,longitude,elementary_school_count,kindergarten_count,university_count,academy_ratio,elderly_population_ratio,elderly_alone_ratio,nursing_home_count
0,1400000001,female,False,35.0,Incheon,Incheon,False,0 days,,37.456188,126.70592,250.0,403.0,7.0,1.27,13.2,5.8,4497.0
1,1000000001,male,True,56.0,Seoul,Gangseo-gu,False,3 days,,37.551166,126.849506,36.0,56.0,1.0,1.17,14.39,5.7,1080.0
2,2000000001,male,True,54.0,Gyeonggi-do,Goyang-si,False,6 days,,37.658363,126.831961,84.0,171.0,2.0,1.88,12.82,5.2,1608.0
3,2000000002,male,True,56.0,Gyeonggi-do,Pyeongtaek-si,False,7 days,,36.992293,127.112709,58.0,108.0,3.0,1.39,12.13,5.6,765.0
4,1000000002,male,True,33.0,Seoul,Jungnang-gu,False,10 days,,37.606832,127.092656,23.0,31.0,1.0,0.7,16.65,6.9,689.0


## Case as a virtual patient

In [17]:
#case=pd.read_csv(data_path+'korea/Case.csv')
#case['infection_case'].unique()

In [18]:
#Unique local cluster infection cases
# Distinct infection_case labels in order of first appearance (NaN included).
pd.unique(patient_korea['infection_case'])

array(['overseas inflow', 'contact with patient', 'Shincheonji Church',
       nan, 'Seongdong-gu APT', 'Cheongdo Daenam Hospital',
       'Pilgrimage to Israel', 'Onchun Church',
       "Eunpyeong St. Mary's Hospital", 'Milal Shelter',
       'gym facility in Cheonan', 'Suyeong-gu Kindergarten',
       'Geochang Church', 'Changnyeong Coin Karaoke',
       'Gyeongsan Seorin Nursing Home', 'Bonghwa Pureun Nursing Home',
       'Dongan Church', 'Gyeongsan Cham Joeun Community Center',
       'Gyeongsan Jeil Silver Town', 'gym facility in Sejong',
       'Guro-gu Call Center', 'Ministry of Oceans and Fisheries',
       'River of Grace Community Church'], dtype=object)

In [19]:
# case=pd.read_csv(data_path+'korea/Case.csv')
# #select group case
# case_feature=case.loc[(case['group'])&(case['latitude']!='-'),['case_id', 'province', 'city','infection_case','latitude', 'longitude']]

# Build one "virtual patient" row per infection case from its member patients:
#   is_male / deceased -> most frequent value among the members
#   actual_age         -> mean age of the members
#   timestamp          -> earliest timestamp among the members
def _most_frequent(values):
    """Return the most frequent non-missing value of a Series, or NaN if it has none."""
    counts = values.value_counts()
    # value_counts() skips NaN, so an all-missing group has no entries; return
    # NaN instead of failing on index[0].
    return counts.index[0] if len(counts) else np.nan

patients_by_case = patient_korea.groupby('infection_case')

# Columns are selected with a list: the tuple-style form ['is_male', 'deceased']
# is deprecated and no longer accepted by recent pandas releases.
case_mode = patients_by_case[['is_male', 'deceased']].agg(_most_frequent)
case_mean_age = patients_by_case['actual_age'].mean()
case_feature = pd.merge(case_mode, case_mean_age, left_index=True, right_index=True).reset_index()
# #add most frequent demographic feature
#case_feature=pd.merge(case_feature, case_demo, how='inner',on='infection_case')

# min() gives the earliest timestamp of each case regardless of row order
# (the previous first() gave the same value only because the frame was sorted).
case_feature = pd.merge(case_feature,
          patients_by_case['timestamp'].min().reset_index(),
          how='inner', on='infection_case')

# case_feature=pd.merge(case_feature, region.drop(columns=['latitude','longitude','code']), how='left', on=['province', 'city'])




In [20]:
# Preview the first rows of the per-case feature table.
case_feature.head(n=5)

Unnamed: 0,infection_case,is_male,deceased,actual_age,timestamp
0,Bonghwa Pureun Nursing Home,False,False,69.741935,44 days
1,Changnyeong Coin Karaoke,True,False,31.25,39 days
2,Cheongdo Daenam Hospital,True,False,50.619048,30 days
3,Dongan Church,True,False,38.647059,44 days
4,Eunpyeong St. Mary's Hospital,False,False,59.9375,32 days


In [21]:
#case_feature.replace('Boram-dong', 'Sejong', inplace=True)
#case_feature.replace('Eojin-dong', 'Sejong', inplace=True)

## Aggregate group infection and individual patient

In [22]:
# #aggregate patient demo and virtual patient's demo
# Stack the case rows and the patient rows into one table that shares a single
# `id` column (infection_case for cases, patient_id for patients). Row index
# labels of both inputs are kept as they are.
# sort=True is passed explicitly: the two inputs have different columns, and
# relying on the implicit default triggers a FutureWarning and yields a
# different column order depending on the pandas version. With sort=True the
# columns are always in sorted (alphabetical) order.
agg_feature = pd.concat(
    [
        case_feature.rename(columns={'infection_case': 'id'}),
        node_feature.rename(columns={'patient_id': 'id'}),
    ],
    axis=0,
    sort=True,
)
# Chronological order across cases and patients.
agg_feature.sort_values(by='timestamp', inplace=True)#.head()
#agg_feature = node_feature.rename(columns={'patient_id':'id'})

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
# Preview the first rows of the combined case + patient table.
agg_feature.head(n=5)

Unnamed: 0,academy_ratio,actual_age,city,deceased,disease,elderly_alone_ratio,elderly_population_ratio,elementary_school_count,id,is_male,kindergarten_count,latitude,longitude,nursing_home_count,province,sex,timestamp,university_count
0,1.27,35.0,Incheon,False,,5.8,13.2,250.0,1400000001,False,403.0,37.456188,126.70592,4497.0,Incheon,female,0 days,7.0
21,,33.201456,,False,,,,,overseas inflow,True,,,,,,,0 days,
1,1.17,56.0,Gangseo-gu,False,,5.7,14.39,36.0,1000000001,True,56.0,37.551166,126.849506,1080.0,Seoul,male,3 days,1.0
2,1.88,54.0,Goyang-si,False,,5.2,12.82,84.0,2000000001,True,171.0,37.658363,126.831961,1608.0,Gyeonggi-do,male,6 days,2.0
3,1.39,56.0,Pyeongtaek-si,False,,5.6,12.13,58.0,2000000002,True,108.0,36.992293,127.112709,765.0,Gyeonggi-do,male,7 days,3.0


In [24]:
#agg_feature['age'].fillna('')
#agg_feature['age']=agg_feature['age'].fillna('').apply(lambda x: int(x[:-1]) if x!='' else '')

In [25]:
#agg_feature.loc[~agg_feature['sex'].isna(),'is_male']=agg_feature.loc[~agg_feature['sex'],'sex']=='male'
#agg_feature.drop(columns=['city'],inplace=True)

In [26]:
#missing age
# Fill missing ages with the mean age of the same (province, city) group first;
# whatever is still missing is then filled with the province-level mean.
for group_keys in (['province', 'city'], ['province']):
    group_mean_age = agg_feature.groupby(group_keys)['actual_age'].transform('mean')
    agg_feature['actual_age'] = agg_feature['actual_age'].fillna(group_mean_age)


In [27]:
# Make sure both coordinate columns are stored as floats.
for coord_col in ('latitude', 'longitude'):
    agg_feature[coord_col] = agg_feature[coord_col].astype(float)

In [28]:
#agg_feature_drop = agg_feature.dropna()

## Edge index

In [29]:
# Patient-to-patient edges: keep only rows where both the patient id and the
# `infected_by` value are present.
edge_ind = patient_korea.loc[:, ['patient_id', 'infected_by']].dropna()


In [30]:
# Patient-to-case edges: patients that do not appear in edge_ind are linked to
# their infection_case instead; rows without an infection_case are dropped.
has_individual_edge = patient_korea['patient_id'].isin(edge_ind['patient_id'].unique())
edge_group = patient_korea.loc[~has_individual_edge, ['patient_id', 'infection_case']].dropna()
# 'contact with patient' rows are excluded from the case edges
# (presumably because the label does not name a specific case — TODO confirm).
edge_group = edge_group[edge_group['infection_case'] != 'contact with patient']

In [31]:
# Combine both edge lists under the same two columns. Individual edges are
# cast to int and then to str; case edges use the case name as `infected_by`.
individual_edges = edge_ind.astype(int).astype(str)
case_edges = edge_group.rename(columns={'infection_case': 'infected_by'})
edge_agg = pd.concat([individual_edges, case_edges])


In [32]:
# (number of patient-to-patient edges, number of patient-to-case edges)
edge_ind.shape[0], edge_group.shape[0]

(725, 884)

## Item to idx mapping

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
# # Focus on patients who have edges
# le=LabelEncoder()
# le.fit(np.concatenate((edge_agg['patient_id'].unique(), edge_agg['infected_by'] )).astype(str))

# edge_agg['patient_id']=le.transform(edge_agg['patient_id'].astype(str))
# edge_agg['infected_by']=le.transform(edge_agg['infected_by'].astype(str))

# agg_feature_drop=agg_feature_drop.loc[agg_feature_drop['id'].astype(str).isin(le.classes_),:]
# agg_feature_drop['id']=le.transform(agg_feature_drop['id'].astype(str))\


#le.fit(agg_feature['id'].astype(str))

#le.fit(agg_feature_drop['id'].astype(str))
#le.fit(np.concatenate([agg_feature['id'].astype(str),
#                      edge_agg['infected_by'].astype(str)]))

#agg_feature['id']=le.transform(agg_feature['id'].astype(str))
##agg_feature_drop['id']=le.transform(agg_feature_drop['id'].astype(str))

#agg_feature=agg_feature.sort_values(by='id')#.set_index(keys='id')
##agg_feature_drop=agg_feature_drop.sort_values(by='id').set_index(keys='id')

#agg_feature=agg_feature.sort_values(by='timestamp').reset_index(drop=True)

# edge_agg['patient_id']=le.transform(edge_agg['patient_id'].astype(str))
# edge_agg=edge_agg.loc[edge_agg['infected_by'].isin(le.classes_)]
# edge_agg['infected_by']=le.transform(edge_agg['infected_by'].astype(str))

In [35]:
# Include all patients
# Fit one encoder on the string form of every endpoint that occurs in an edge
# (both columns), then overwrite both columns with the integer codes.
# NOTE: this overwrites edge_agg in place, so running the cell a second time
# re-fits the encoder on the already-encoded integers.
le = LabelEncoder()
endpoint_cols = ['patient_id', 'infected_by']
le.fit(np.concatenate([edge_agg[col_name].astype(str) for col_name in endpoint_cols]))
for col_name in endpoint_cols:
    edge_agg[col_name] = le.transform(edge_agg[col_name].astype(str))


In [36]:
# Keep only the nodes whose id (as a string) is known to the label encoder,
# i.e. nodes that take part in at least one edge.
# .copy() makes this an independent frame: the following cell adds and
# rewrites columns on it, which on a plain boolean-mask slice raises
# SettingWithCopyWarning.
agg_feature_w_edge=agg_feature[agg_feature['id'].astype(str).isin(le.classes_)].copy()

In [37]:
# Finalise the node table: the original identifier moves to `name` (as a
# string) and `id` becomes the encoder's integer code for that name.
# rename()/sort_values() are reassigned instead of run with inplace=True so
# that no in-place mutation happens on a slice of agg_feature (the source of
# the SettingWithCopyWarning messages).
agg_feature_w_edge = agg_feature_w_edge.rename(columns={'id': 'name'})
agg_feature_w_edge['name'] = agg_feature_w_edge['name'].astype(str)
agg_feature_w_edge['id'] = le.transform(agg_feature_w_edge['name'])
# NOTE(review): astype(bool) turns missing values (NaN) into True, so nodes
# with unknown sex end up flagged is_male=True — confirm this is acceptable or
# impute the column first.
agg_feature_w_edge['is_male'] = agg_feature_w_edge['is_male'].astype(bool)
agg_feature_w_edge = agg_feature_w_edge.sort_values(by='timestamp')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

In [38]:
#Patients demographic features table
# Rows whose name consists only of digits (the numeric ids, as opposed to
# case names) are kept and written out as an interactive pivot table.
is_numeric_name = agg_feature_w_edge['name'].str.isdigit()
temp = agg_feature_w_edge.loc[is_numeric_name]
pivot_ui(temp, outfile_path='plots/patient_demographics.html')


In [39]:
# missing sex
# agg_feature['is_male']=agg_feature.groupby(['province','city'])['is_male'].apply(lambda x: x.fillna(x.mode()))
# agg_feature['is_male']=
# agg_feature['is_male'].fillna()
# agg_feature.groupby(['province','city'])['is_male'].transform(lambda x:x.value_counts().index[0] if len(x)>0 else -1)
# agg_feature['actual_age']=agg_feature['actual_age'].fillna(agg_feature.groupby(['province'])['actual_age'].transform('mean'))


In [40]:
#Save
# Pickle the node table, the encoded edge list and the fitted label encoder
# under saved_path. Each file is opened in a `with` block so the handle is
# flushed and closed as soon as the dump finishes (the bare open() calls
# previously left the handles open).
outputs_to_save = (
    (agg_feature_w_edge, 'node_feature.p'),
    (edge_agg, 'edge_index.p'),
    (le, 'id_LabelEncoder.p'),
)
for payload, file_name in outputs_to_save:
    with open(saved_path + file_name, 'wb') as out_file:
        pickle.dump(payload, out_file)