# Load raw line list data and check it with TensorFlow Data Validation (TFDV)
### Import dependencies and get path

In [1]:
from feather import read_dataframe, write_dataframe
from functions_clean import *

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Load the rpy2 IPython extension (provides the %R / %%R magics).
# NOTE(review): no R magic is used in the visible part of this notebook — confirm it is still needed.
%load_ext rpy2.ipython
# Print the current working directory; relative paths such as '../data/...' resolve against it.
!pwd

/Users/yensia-low/PycharmProjects/nCoV2019/notebooks


### Load data

In [30]:
# Read the raw line list from a feather file (built from gspread_url; see get_data.py).
# NOTE(review): the previous comment called this file df.feather, but the path
# actually read here is ../data/df_raw.feather.
df = read_dataframe('../data/df_raw.feather')
# Display the column names of the raw frame.
df.columns
# Disabled alternative input, kept for reference:
#combined_dat = read_dataframe('data/combined_dat.feather')

Index(['ID', 'age', 'sex', 'city', 'province', 'country',
       'wuhan(0)_not_wuhan(1)', 'latitude', 'longitude', 'geo_resolution',
       'date_onset_symptoms', 'date_admission_hospital', 'date_confirmation',
       'symptoms', 'lives_in_Wuhan', 'travel_history_dates',
       'travel_history_location', 'reported_market_exposure',
       'additional_information', 'chronic_disease_binary', 'chronic_disease',
       'source', 'sequence_available', 'outcome', 'date_death_or_discharge',
       'notes_for_discussion', 'location', 'admin3', 'admin2', 'admin1',
       'country_new', 'admin_id', 'data_moderator_initials'],
      dtype='object')

### Set schema

In [31]:
# Inspect the dtype of every column in the raw frame.
df.dtypes

ID                          object
age                         object
sex                         object
city                        object
province                    object
country                     object
wuhan(0)_not_wuhan(1)       object
latitude                    object
longitude                   object
geo_resolution              object
date_onset_symptoms         object
date_admission_hospital     object
date_confirmation           object
symptoms                    object
lives_in_Wuhan              object
travel_history_dates        object
travel_history_location     object
reported_market_exposure    object
additional_information      object
chronic_disease_binary      object
chronic_disease             object
source                      object
sequence_available          object
outcome                     object
date_death_or_discharge     object
notes_for_discussion        object
location                    object
admin3                      object
admin2              

In [32]:
# Collect every column whose name contains 'date'.
col_date = [name for name in df.columns if 'date' in name]
# travel_history_dates is excluded from the date group
# (presumably because it is not a single parseable date — TODO confirm).
col_date.remove("travel_history_dates")
col_date

['date_onset_symptoms',
 'date_admission_hospital',
 'date_confirmation',
 'date_death_or_discharge']

In [33]:
# Group column names by intended type. This cell only builds the name lists;
# no casting happens here.

# Every column whose name contains 'admin'.
col_admin = [name for name in df.columns if 'admin' in name]

# Numeric columns.
col_float = ['age', 'latitude', 'longitude']

# Binary indicator columns ('sex' is deliberately not listed here).
col_bin = ['wuhan(0)_not_wuhan(1)', 'chronic_disease_binary']

# Categorical columns ('country_new' is left out).
col_cat = [
    'city',
    'province',
    'country',
    'geo_resolution',
    'location',
    'lives_in_Wuhan',
    'outcome',
    'reported_market_exposure',
    'sequence_available',
]

# Free-text / identifier columns, with the admin columns first.
col_str = col_admin + [
    'ID',
    'chronic_disease',
    'symptoms',
    'travel_history_location',
    'source',
    'notes_for_discussion',
    'additional_information',
]

## Raw data validation with tfdv

In [34]:
# TFDV is imported in this cell, immediately before its first use.
# NOTE(review): consider moving this import to the top import cell.
import tensorflow_data_validation as tfdv

# Report the installed TFDV version, then print the StatsOptions class object.
print('TFDV version:', tfdv.version.__version__)
print(tfdv.StatsOptions)

TFDV version: 0.21.0
<class 'tensorflow_data_validation.statistics.stats_options.StatsOptions'>


In [35]:
# Compute TFDV summary statistics over the raw frame.
summary_stats = tfdv.generate_statistics_from_dataframe(dataframe=df)

TFDV infers most columns as categorical variables, and many of them have a lot of missing values.

In [37]:
# Render the TFDV statistics viewer for the raw-data statistics.
tfdv.visualize_statistics(lhs_statistics=summary_stats)

In [14]:
# Infer a schema from the raw-data statistics, then display it as tables.
schema = tfdv.infer_schema(statistics=summary_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'ID',BYTES,required,,"(-inf,inf)"
'additional_information',BYTES,required,,-
'admin1',STRING,required,,'admin1'
'admin2',BYTES,required,,-
'admin3',BYTES,required,,-
'admin_id',STRING,required,,'admin_id'
'age',BYTES,required,,-
'chronic_disease',STRING,required,,'chronic_disease'
'chronic_disease_binary',STRING,required,,'chronic_disease_binary'
'city',BYTES,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'admin1',"'', 'Anhui', 'Arizona', 'Bavaria', 'Beijing', 'California', 'Chongqing', 'Fujian', 'Gansu', 'Guangdong', 'Guangxi', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang', 'Henan', 'Hong Kong', 'Hubei', 'Hunan', 'Ile-de-France', 'Illinois', 'Inner Mongolia', 'Jiangsu', 'Jiangxi', 'Jiangxi ', 'Jilin', 'Johor', 'Khanh Hoa province', 'Lapland', 'Liaoning', 'Macau', 'Nakhon Pathom', 'New South Wales', 'Ningxia', 'Nonthaburi', 'Nouvelle-Aquitaine', 'Ontario', 'Qinghai', 'Shaanxi', 'Shandong', 'Shanghai', 'Shanxi', 'Sichuan', 'Sihanoukville Province', 'Taiwan', 'Thanh Hoa', 'Tianjin', 'Tokyo', 'Victoria', 'Washington', 'Xinjiang', 'Yunnan', 'Zhejiang'"
'admin_id',"'', 'Liangjiang New District'"
'chronic_disease',"'', '""thought to have had other pre-existing conditions""', 'N/A', 'Parkinson\'s disease for five years, taking medicine of Madopar', 'asthma', 'chronic bronchitis', 'chronic bronchitis, stenocardia, hypertension, coronary stenting, hemorrhage of digestive tract', 'coronary heart disease', 'diabetes', 'diabetes, cerebral infarction', 'diabetes, coronary heart disease', 'history of hypertension, type 2 diabetes, coronary heart disease for which a stent had been implanted, and lung cancer ', 'hypertension', 'hypertension for more than 20 years, diabetes for more than 20 years, Parkinson\'s disease', 'hypertension, cerebral infarction, encephalomalacia', 'hypertension, chronic obstructive pulmonary disease, diabetes, chronic renal insufficiency', 'hypertension, coronary heart disease, diabetes, Tuberculosis', 'hypertension, diabetes', 'hypertension, diabetes, colon cancer surgery four years ago', 'hypertension, diabetes, coronary heart disease, frequent ventricular premature beat (FVPB), coronary artery stenting', 'hypertension, hip replacement', 'hypertriglyceridemia ', 'prostate hypertrophy', 'type 2 diabetes for 30+ years; hypertension for 9 years; coronary bypass surgery for 2 years;'"
'chronic_disease_binary',"'', '0', '1', 'N/A'"
'country',"'', 'Australia', 'Belgium', 'Cambodia', 'Canada', 'China', 'Finland', 'France', 'Germany', 'India', 'Italy', 'Japan', 'Malaysia', 'Nepal', 'Philippines', 'Russia', 'Singapore', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Taiwan', 'Thailand', 'UAE', 'United Kingdom', 'United States', 'Vietnam'"
'country_new',"'', 'Australia', 'Cambodia', 'Canada', 'China', 'Finland', 'France', 'Germany', 'Japan', 'Malaysia', 'Nepal', 'Singapore', 'Singapore ', 'South Korea', 'Sri Lanka', 'Taiwan, China', 'Thailand', 'United States', 'Vietnam'"
'data_moderator_initials',"'', 'SL'"
'date_admission_hospital',"'', '01.01.2020', '01.02.2020', '02.02.2020', '02.03.2020', '03.01.2020', '03.02.2020', '04.01.2020', '04.02.2020', '05.01.2020', '05.02.2020', '06.01.2020', '08.01.2020', '09.01.2020', '10.01.2020', '10.12.2019', '11.01.2020', '12.01.2020', '13.01.2020', '14.01.2020', '15.01.2020', '16.01.2020', '17.01.2020', '18.01.2020', '18.01.2020 - 23.01.2020', '19.01.2020', '20.01.2020', '21.01.2020', '22.01.2020', '22.02.2020', '23.01.2020', '24.01.2020', '25.01.2020', '26.01.2020', '27.01.2020', '27.02.2020', '27.12.2019', '28.01.2020', '28.02.2020', '29.01.2020', '30.01.2020', '31.01.2020', '31.12.2019', '9.01.2020'"
'date_confirmation',"'', '01.02.2020', '02.02.2020', '03.02.2020', '04.02.2020', '05.02.2020', '06.02.2020', '07.02.2020', '12.01.2020', '15.01.2020', '16.01.2020', '17.01.2020', '18.01.2020', '19.01.2020', '20.01.2020', '21.01.2020', '22.01.2020', '23.01.2020', '24.01.2020', '25.01.2020', '26.01.2020', '27.01.2020', '28.01.2020', '29.01.2020', '30.01.2020', '31.01.2020', 'not sure'"
'date_death_or_discharge',"'', '01.02.2020', '02.02.2020', '02.02.2021', '02.02.2022', '09.01.2020', '15.01.2020', '16.01.2020', '17.01.2020', '18.01.2020', '19.01.2020', '20.01.2020', '21.01.2020', '22.01.2020', '23.01.2020', '24.01.2020', '27.01.2020', '29.01.2020', '31.01.2020', 'discharge'"


## Use `data_clean.py` to clean dates, fix typos, roll up categories, etc.
#### Load cleaned data

In [38]:
# Load the cleaned line list.
# NOTE: this rebinds `df`, so the raw frame loaded earlier in the notebook is
# no longer reachable under that name after this cell runs.
df = read_dataframe('../data/df.feather')
# Display the column names of the cleaned frame.
df.columns

Index(['ID', 'age', 'sex', 'city', 'province', 'country',
       'wuhan(0)_not_wuhan(1)', 'latitude', 'longitude', 'geo_resolution',
       'date_onset_symptoms', 'date_admission_hospital', 'date_confirmation',
       'symptoms', 'lives_in_Wuhan', 'travel_history_dates',
       'travel_history_location', 'reported_market_exposure',
       'additional_information', 'chronic_disease_binary', 'chronic_disease',
       'source', 'sequence_available', 'outcome', 'date_death_or_discharge',
       'notes_for_discussion', 'location', 'admin3', 'admin2', 'admin1',
       'country_new', 'admin_id', 'data_moderator_initials', 'male'],
      dtype='object')

In [39]:
# Compute TFDV summary statistics over the cleaned frame with the col_date
# columns removed (presumably because their cleaned dtype is not accepted by
# TFDV — TODO confirm).
summary_stats_clean = tfdv.generate_statistics_from_dataframe(
    dataframe=df.drop(columns=col_date)
)

In [29]:
# Render the TFDV statistics viewer for the cleaned-data statistics.
tfdv.visualize_statistics(lhs_statistics=summary_stats_clean)

In [28]:
# Infer a schema from the cleaned-data statistics, then display it as tables.
schema_clean = tfdv.infer_schema(statistics=summary_stats_clean)
tfdv.display_schema(schema_clean)


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'ID',BYTES,required,,"(-inf,inf)"
'additional_information',BYTES,required,,-
'admin1',STRING,required,,'admin1'
'admin2',BYTES,required,,-
'admin3',BYTES,required,,-
'admin_id',STRING,required,,'admin_id'
'age',FLOAT,optional,single,-
'chronic_disease',STRING,required,,'chronic_disease'
'chronic_disease_binary',INT,required,,-
'city',BYTES,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'admin1',"'', 'Anhui', 'Arizona', 'Bavaria', 'Beijing', 'California', 'Chongqing', 'Fujian', 'Gansu', 'Guangdong', 'Guangxi', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang', 'Henan', 'Hong Kong', 'Hubei', 'Hunan', 'Ile-de-France', 'Illinois', 'Inner Mongolia', 'Jiangsu', 'Jiangxi', 'Jiangxi ', 'Jilin', 'Johor', 'Khanh Hoa province', 'Lapland', 'Liaoning', 'Macau', 'Nakhon Pathom', 'New South Wales', 'Ningxia', 'Nonthaburi', 'Nouvelle-Aquitaine', 'Ontario', 'Qinghai', 'Shaanxi', 'Shandong', 'Shanghai', 'Shanxi', 'Sichuan', 'Sihanoukville Province', 'Taiwan', 'Thanh Hoa', 'Tianjin', 'Tokyo', 'Victoria', 'Washington', 'Xinjiang', 'Yunnan', 'Zhejiang'"
'admin_id',"'', 'Liangjiang New District'"
'chronic_disease',"'', '""thought to have had other pre-existing conditions""', 'N/A', 'Parkinson\'s disease for five years, taking medicine of Madopar', 'asthma', 'chronic bronchitis', 'chronic bronchitis, stenocardia, hypertension, coronary stenting, hemorrhage of digestive tract', 'coronary heart disease', 'diabetes', 'diabetes, cerebral infarction', 'diabetes, coronary heart disease', 'history of hypertension, type 2 diabetes, coronary heart disease for which a stent had been implanted, and lung cancer ', 'hypertension', 'hypertension for more than 20 years, diabetes for more than 20 years, Parkinson\'s disease', 'hypertension, cerebral infarction, encephalomalacia', 'hypertension, chronic obstructive pulmonary disease, diabetes, chronic renal insufficiency', 'hypertension, coronary heart disease, diabetes, Tuberculosis', 'hypertension, diabetes', 'hypertension, diabetes, colon cancer surgery four years ago', 'hypertension, diabetes, coronary heart disease, frequent ventricular premature beat (FVPB), coronary artery stenting', 'hypertension, hip replacement', 'hypertriglyceridemia ', 'prostate hypertrophy', 'type 2 diabetes for 30+ years; hypertension for 9 years; coronary bypass surgery for 2 years;'"
'country',"'', 'Australia', 'Belgium', 'Cambodia', 'Canada', 'China', 'Finland', 'France', 'Germany', 'India', 'Italy', 'Japan', 'Malaysia', 'Nepal', 'Philippines', 'Russia', 'Singapore', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Taiwan', 'Thailand', 'UAE', 'United Kingdom', 'United States', 'Vietnam'"
'country_new',"'', 'Australia', 'Cambodia', 'Canada', 'China', 'Finland', 'France', 'Germany', 'Japan', 'Malaysia', 'Nepal', 'Singapore', 'Singapore ', 'South Korea', 'Sri Lanka', 'Taiwan, China', 'Thailand', 'United States', 'Vietnam'"
'data_moderator_initials',"'', 'SL'"
'geo_resolution',"'', '1', 'admin', 'admin ', 'point', 'point '"
'lives_in_Wuhan',"'', '0', '1', 'Chinese', 'N/A', 'Xiantao City resident', 'business trip', 'live in Hangzhou', 'lived in Wuhan for two months and then went back to Cangzhou', 'medical trip', 'no', 'no, trip to Wuhan', 'no, work in Wuhan', 'return from Wuhan', 'shanghai resident, travel history', 'study trip', 'thai national', 'tourism', 'travel', 'used to be', 'work in Wuhan', 'yes', 'yes '"
'location',"'', 'Bangkok', 'Bordeaux', 'Border Checkpoint', 'Chicago', 'Datong', 'Fuzhou', 'Hong Kong', 'Hong Mei House, Cheung Hong Estate', 'Incheon Airport', 'Ji\'an', 'Jingdezhen', 'Jiujiang', 'Kathmandu', 'Kuala Lumpur', 'Linfen City', 'Los Angeles', 'Luliang City', 'Macau, Landmark Hotel', 'Manzhouli City', 'Melbourne', 'Nanchang', 'Narita Airport', 'Nha Trang', 'Paris', 'Quzhou', 'Royal Gold Peak', 'Shuozhou City', 'Sihanoukville City', 'Singapore', 'Starnberg ', 'Sydney', 'Taoyuan International Airport', 'Tempe', 'Tokyo', 'Toronto', 'Xinyu', 'Yichun', 'Zabaykalsky Krai', 'stayed at Park Royal Collection Pickering and Oasia Hotel Downtown'"
'notes_for_discussion',"'', 'Asymptomatic but placed in quarantine: https://www.thestar.com.my/news/nation/2020/01/25/three-chinese-nationals-test-positive-for-coronavirus-in-m039sia-says-health-minister#.XiwrXIRMMhs.twitter', 'Daughter of France\'s 4th case (80 year-old man)', 'Most details from listening to LA County Public Health Dept Press Conference live', 'Travel to Cebu and Dumaguete also', 'Wuhan University Student', 'had dinner with 3 patients diagnosed on January 31', 'has been working in Wuhan for a long time', 'http://wsjkw.cq.gov.cn/tzgg/20200127/249875.html', 'https://laist.com/2020/01/26/coronavirus-orange-county-first-case-confirmed.php', 'https://mp.weixin.qq.com/s?__biz=MjM5MTg5OTM0Ng==&mid=2651448627&idx=1&sn=e26391f0e764bfd4fc2a9a1bc15502ff&chksm=bd53ce8e8a2447985560b9a62b2a5f567693ac25032b0209cc489c49d91862fd480e4664094e&token=566312186&lang=zh_CN#rd', 'https://news.163.com/special/epidemic/?spssid=7283291fcdba1d8c2d13ee3da2cfb760&spsw=7&spss=other', 'https://pr.moph.go.th/?url=pr/detail/2/04/138124/&fbclid=IwAR1PeGWAcxm62qbjqEWWbxZUfYpva3TA51PbDub62sej_pLv9RpfSOfxqMA', 'https://www.info.gov.hk/gia/general/202001/22/P2020012200982.htm', 'https://www.info.gov.hk/gia/general/202001/23/P2020012300032.htm', 'https://www.info.gov.hk/gia/general/202001/25/P2020012500676.htm', 'https://www.mhlw.go.jp/stf/newpage_08906.html', 'https://www.mhlw.go.jp/stf/newpage_09079.html', 'https://www.mhlw.go.jp/stf/newpage_09099.html', 'https://www.mhlw.go.jp/stf/newpage_09153.html', 'https://www.newsfirst.lk/2020/01/27/breaking-news-confirmed-case-of-coronavirus-in-sri-lanka/'"
