In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

%matplotlib inline

In [53]:
# Load the two raw tables from the local korea_dataset folder:
# `pi` comes from PatientInfo.csv and `pr` from PatientRoute.csv.
pi, pr = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./korea_dataset/PatientInfo.csv",
        "./korea_dataset/PatientRoute.csv",
    )
)

In [54]:
# Integer codes for the categorical columns. Each label's code is its
# position in the tuple, so the tuple order defines the encoding.
d_disease = {label: code for code, label in enumerate(("TRUE", "FALSE"))}
d_gender = {label: code for code, label in enumerate(("male", "female", "n/a"))}
d_state = {label: code for code, label in enumerate(("released", "isolated", "deceased"))}

In [55]:
# Encode the underlying-disease flag with d_disease.
# The flag is compared as upper-cased text so that both the string "TRUE" and
# a parsed boolean True are recognised; every other value, including a missing
# one, gets the "FALSE" code. Previously, values that did not literally match
# a d_disease key fell through to NaN and were overwritten with the string
# "1.0", i.e. the same code as "no disease".
has_disease = pi["disease"].astype(str).str.upper().eq("TRUE")
pi["disease"] = has_disease.map(
    {True: d_disease["TRUE"], False: d_disease["FALSE"]}
).astype(int)

# Missing sex is encoded through the explicit "n/a" label. The result is
# assigned back rather than filled in place on a column selection, which is
# not guaranteed to modify `pi`.
pi["sex"] = pi["sex"].fillna("n/a").map(d_gender)

# States without an entry in d_state become NaN.
pi["state"] = pi["state"].map(d_state)

In [56]:
def nth_day(d, format="%m/%d/%y"):
    """Convert a date string to its 1-based day of the year.

    Parameters
    ----------
    d : str
        Date to convert, written according to ``format``.
    format : str, default "%m/%d/%y"
        strptime-style pattern handed to ``pd.to_datetime``.

    Returns
    -------
    int
        1 for January 1st, 2 for January 2nd, and so on.
    """
    parsed = pd.to_datetime(d, format=format)
    # Whole days elapsed since January 1st of the same year, shifted to 1-based.
    elapsed = parsed - pd.Timestamp(parsed.year, 1, 1)
    return elapsed.days + 1

In [57]:
# Replace every confirmation date string with its 1-based day of the year.
# The whole column is assigned at once: Series.map keeps the values aligned
# with pi's own index (the previous loop used a 0..n-1 counter as the row
# label) and yields an integer column instead of cell-by-cell writes.
pi["confirmed_date"] = pi["confirmed_date"].map(lambda cd: int(nth_day(str(cd))))

In [58]:
# Run date as a string and as a day of the year. Neither is needed as a
# "missing" marker any more, but both names are kept because this cell
# defined them before.
today = date.today().strftime("%m/%d/%y")
tdy = nth_day(str(today))

# Convert the recorded release dates to day of the year. Rows without a
# release date stay NaN (na_action="ignore" skips them), so a missing value
# can no longer be confused with a real release on today's day of the year.
released_day = (
    pi["released_date"]
    .map(lambda rd: nth_day(str(rd)), na_action="ignore")
    .astype(float)
)

if released_day.notna().sum() == 0:
    raise ValueError("released_date has no recorded values to average")

# Mean over the recorded releases only. The previous loop reused `count` as
# both the enumerate index and the divisor, so the sum ended up divided by
# roughly the total number of rows.
rd_avg = released_day.mean()

# Fill the missing rows with the truncated mean, as before.
pi["released_date"] = released_day.fillna(int(rd_avg)).astype(int)

In [59]:
# 1-D array of the released_date column (presumably the model target; this
# copy is taken before the column is dropped from `pi`).
# Series.ravel is deprecated; to_numpy() is the documented replacement.
y = pi["released_date"].to_numpy()

In [60]:
# Label-encode the free-text columns. Each distinct label gets the next
# integer in order of first appearance; missing values share the "etc" label.
# `province` is filled like the other three columns: previously a missing
# province bumped the counter on every row and was still mapped to NaN.
# TODO: Modify - add props. -gu, -gun, -si with regex (city)
# TODO: Modify - add props. Special City / Metropolitan City / Province(-do) with regex (province)
_label_codes = {}
for _column in ("infection_case", "city", "province", "country"):
    # The filled column is assigned back instead of filled in place on a
    # column selection, which is not guaranteed to modify `pi`.
    _labels = pi[_column].fillna("etc").astype(str)
    # Series.unique() returns labels in order of first appearance.
    _label_codes[_column] = {
        label: code for code, label in enumerate(_labels.unique())
    }
    pi[_column] = _labels.map(_label_codes[_column])

# Keep the per-column names this cell defined before.
d_infection_case = _label_codes["infection_case"]
d_city = _label_codes["city"]
d_province = _label_codes["province"]
d_country = _label_codes["country"]

# Each counter is the number of distinct labels in its column.
infection_case_counter = len(d_infection_case)
city_counter = len(d_city)
province_counter = len(d_province)
country_counter = len(d_country)

In [62]:
# Reference year used to convert between age and birth year.
# NOTE(review): this is the year the notebook is run, so the derived ages
# change when it is re-run in a later year - consider pinning it.
year = date.today().year

# Missing values stay NaN here instead of being replaced by a 0 sentinel.
birth_known = pd.to_numeric(pi["birth_year"], errors="coerce")

# The age column is read as text whose leading digits are the age (the
# previous loop stripped one trailing character, presumably from labels such
# as "50s" - confirm against the CSV). Anything without digits becomes NaN.
label_age = pd.to_numeric(
    pi["age"].astype(str).str.extract(r"(\d+)", expand=False),
    errors="coerce",
)

# Birth year known -> age is derived from it.
# Only the age label known -> it becomes the numeric age (it used to stay a
# string) and the birth year is derived from it.
pi["age"] = (year - birth_known).fillna(label_age)
pi["birth_year"] = birth_known.fillna(year - label_age)

# Rows with usable values; after the step above age and birth_year are either
# both present or both missing, so one condition covers both.
ci = pi[pi["birth_year"].notna()]

print(ci.head())
a_avg = int(ci["age"].mean())
b_avg = int(ci["birth_year"].mean())

# Rows with neither value get the truncated averages, written as integers
# (they used to be written as strings into otherwise numeric columns).
pi["age"] = pi["age"].fillna(a_avg).astype(int)
pi["birth_year"] = pi["birth_year"].fillna(b_avg).astype(int)

   patient_id  global_num  sex  birth_year age  country  province  city  \
0  1000000001         2.0    0      1964.0  56        0         0     0   
1  1000000002         5.0    0      1987.0  33        0         0     1   
2  1000000003         6.0    0      1964.0  56        0         0     2   
3  1000000004         7.0    0      1991.0  29        0         0     3   
4  1000000005         9.0    1      1992.0  28        0         0     4   

   disease  infection_case  infection_order   infected_by  contact_number  \
0      1.0               0              1.0           NaN            75.0   
1      1.0               0              1.0           NaN            31.0   
2      1.0               1              2.0  2.002000e+09            17.0   
3      1.0               0              1.0           NaN             9.0   
4      1.0               1              2.0  1.000000e+09             2.0   

  symptom_onset_date confirmed_date released_date deceased_date  state  
0            

In [63]:
# Remove, in a single call, every column that is not kept in `pi`.
pi.drop(
    columns=[
        "global_num",
        "infection_order",
        "symptom_onset_date",
        "infected_by",
        "contact_number",
        "deceased_date",
        "patient_id",
        "released_date",
        "birth_year",
        "disease",
    ],
    inplace=True,
)

In [65]:
# Idea kept for later (currently disabled): a seaborn line plot of the number
# of cases against days since 1/1/2020, based on pi["confirmed_date"].
# sns.lineplot(x="days since 1/1/2020", y="number of cases", data=pi["confirmed_date"])

# Show the first ten rows of the remaining columns as the cell output.
pi.head(n=10)

Unnamed: 0,sex,age,country,province,city,infection_case,confirmed_date,state
0,0,56,0,0,0,0,23,0
1,0,33,0,0,1,0,30,0
2,0,56,0,0,2,1,30,0
3,0,29,0,0,3,0,30,0
4,1,28,0,0,4,1,31,0
5,1,54,0,0,2,1,31,0
6,0,25,0,0,2,1,31,0
7,0,28,0,0,5,0,33,0
8,0,37,0,0,6,0,36,0
9,1,60,0,0,4,1,36,0
