# COVID Prediction Project
### Zequn Li
-----

## 0. Project Architecture

#### Core equation:
##### Existing(t+1) = Existing(t) + NewConfirmed(t) - Death(t) - Cured(t)
#### Specifically:
##### NewConfirmed(t) = NewTested(t) * InfectionRate(t, state demographic distribution, state mobility, ?other information: population density, temperature, Existing, surrounding state existing, ...)
##### Death(t) = Existing(t+dt1) * DeathRate(state demographic)
##### Cured(t) = Existing(t+dt2) * RecoverRate(demographic)

## 1. Exploration Data Analysis

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [42]:
DeathRate = pd.read_csv("data/COVID-19_Case_Surveillance_Public_Use_Data.csv")
DeathRate.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5760066 entries, 0 to 5760065
Data columns (total 11 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   cdc_report_dt                  object
 1   pos_spec_dt                    object
 2   onset_dt                       object
 3   current_status                 object
 4   sex                            object
 5   age_group                      object
 6   Race and ethnicity (combined)  object
 7   hosp_yn                        object
 8   icu_yn                         object
 9   death_yn                       object
 10  medcond_yn                     object
dtypes: object(11)
memory usage: 483.4+ MB


In [43]:
DeathRate.head(5)

Unnamed: 0,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,icu_yn,death_yn,medcond_yn
0,2020/06/30,,,Probable Case,Other,0 - 9 Years,Unknown,Unknown,Unknown,No,Unknown
1,2020/06/30,,,Probable Case,Other,0 - 9 Years,Unknown,Unknown,Unknown,No,Unknown
2,2020/07/05,,,Laboratory-confirmed case,Other,0 - 9 Years,Unknown,Missing,Missing,Missing,Missing
3,2020/06/10,,2020/03/27,Probable Case,Other,0 - 9 Years,Unknown,No,Unknown,No,Unknown
4,2020/07/13,,2020/04/20,Probable Case,Other,0 - 9 Years,Unknown,No,Unknown,No,Unknown


In [44]:
DeathRate["dead"] = (DeathRate["death_yn"] == "Yes").astype(float)
DeathRate = DeathRate[["sex", "age_group", "Race and ethnicity (combined)", "dead"]]
DeathRate = DeathRate[(DeathRate["sex"].isin(["Male", "Female"]))&
                        (~DeathRate["Race and ethnicity (combined)"].isin(["Unknown", "NA"]))&
                        (~DeathRate["age_group"].isin(["Unknown", "NA"]))]

In [45]:
DeathRate.groupby(["sex", "age_group", "Race and ethnicity (combined)"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dead
sex,age_group,Race and ethnicity (combined),Unnamed: 3_level_1
Female,0 - 9 Years,"American Indian/Alaska Native, Non-Hispanic",0.000000
Female,0 - 9 Years,"Asian, Non-Hispanic",0.000000
Female,0 - 9 Years,"Black, Non-Hispanic",0.000594
Female,0 - 9 Years,Hispanic/Latino,0.000351
Female,0 - 9 Years,"Multiple/Other, Non-Hispanic",0.000337
...,...,...,...
Male,80+ Years,"Black, Non-Hispanic",0.378761
Male,80+ Years,Hispanic/Latino,0.391887
Male,80+ Years,"Multiple/Other, Non-Hispanic",0.352417
Male,80+ Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",0.430233


In [47]:
stateDemo = pd.read_csv("data/sc-est2019-alldata6.csv", nrows=100)

In [49]:
stateDemo.head(5)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,SEX,ORIGIN,RACE,AGE,CENSUS2010POP,...,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019
0,40,3,6,1,Alabama,0,0,1,0,37991,...,37818,38678,37824,36812,37575,37698,37629,37526,36370,35594
1,40,3,6,1,Alabama,0,0,1,1,38150,...,38036,37719,38597,37854,36968,37882,37732,37904,37641,36679
2,40,3,6,1,Alabama,0,0,1,2,39738,...,39570,38202,37653,38528,37769,37123,38048,37975,38172,37904
3,40,3,6,1,Alabama,0,0,1,3,39827,...,39805,39690,38113,37716,38605,37896,37266,38170,38197,38510
4,40,3,6,1,Alabama,0,0,1,4,39353,...,39493,39872,39628,38122,37751,38625,37954,37322,38312,38476


In [51]:
infectRate = pd.read_csv("data/csse_covid_19_daily_reports_us/04-12-2020.csv")
infectRate.head(5)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate
0,Alabama,US,2020-04-12 23:18:15,32.3182,-86.9023,3563,93,,3470,1.0,75.98802,21583.0,437.0,2.61016,84000001,USA,460.300152,12.264945
1,Alaska,US,2020-04-12 23:18:15,61.3707,-152.4044,272,8,66.0,264,2.0,45.504049,8038.0,31.0,2.941176,84000002,USA,1344.711576,11.397059
2,Arizona,US,2020-04-12 23:18:15,33.7298,-111.4312,3542,115,,3427,4.0,48.662422,42109.0,,3.246753,84000004,USA,578.522286,
3,Arkansas,US,2020-04-12 23:18:15,34.9697,-92.3731,1280,27,367.0,1253,5.0,49.439423,19722.0,130.0,2.109375,84000005,USA,761.753354,10.15625
4,California,US,2020-04-12 23:18:15,36.1162,-119.6816,22795,640,,22155,6.0,58.137726,190328.0,5234.0,2.81202,84000006,USA,485.423868,22.961176


In [52]:
infectRate = pd.read_csv("data/csse_covid_19_daily_reports_us/04-13-2020.csv")
infectRate.head(5)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate
0,Alabama,US,2020-04-13 23:07:54,32.3182,-86.9023,3734,99,,3635.0,1.0,79.634933,29182.0,457.0,2.651312,84000001,USA,622.363852,12.238886
1,Alaska,US,2020-04-13 23:07:54,61.3707,-152.4044,277,8,85.0,269.0,2.0,46.340521,7830.0,32.0,2.888087,84000002,USA,1309.914362,11.552347
2,American Samoa,US,,-14.271,-170.132,0,0,,,60.0,0.0,3.0,,,16,ASM,5.391708,
3,Arizona,US,2020-04-13 23:07:54,33.7298,-111.4312,3705,122,,3583.0,4.0,50.901828,43347.0,525.0,3.292848,84000004,USA,595.530778,14.17004
4,Arkansas,US,2020-04-13 23:07:54,34.9697,-92.3731,1410,29,391.0,1381.0,5.0,54.460614,20804.0,130.0,2.056738,84000005,USA,803.545116,9.219858


In [54]:
hospital = pd.read_csv("data/State_Representative_Estimates_for_Hospital_Utilization.csv")
hospital.head(5)

Unnamed: 0,OBJECTID,state_name,total_inpatient_beds,total_icu_beds,pct_inpatient_bed_utilization,pct_inpatients_with_covid,pct_icu_bed_utilization,icu_beds_used_estimate,inpatient_beds_used_estimate,inpatient_beds_used_covid_est,last_updated
0,1,Alaska,1178.0,228.0,62.51,7.29,75.4,95.0,892.0,104.0,2020/11/06 14:45:00+00
1,2,California,60674.0,8744.0,71.6,5.6,70.22,5073.0,44587.0,3490.0,2020/11/06 14:45:00+00
2,3,Hawaii,1745.0,299.0,68.56,3.29,69.96,170.0,1700.0,81.0,2020/11/06 14:45:00+00
3,4,Idaho,3604.0,513.0,54.96,8.82,75.59,223.0,1932.0,310.0,2020/11/06 14:45:00+00
4,5,Nevada,7311.0,1152.0,75.31,10.37,75.09,642.0,5500.0,738.0,2020/11/06 14:45:00+00


In [55]:
mobility = pd.read_csv("data/Trips_by_Distance.csv")
mobility.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2117622 entries, 0 to 2117621
Data columns (total 19 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   Level                           object 
 1   Date                            object 
 2   State FIPS                      float64
 3   State Postal Code               object 
 4   County FIPS                     float64
 5   County Name                     object 
 6   Population Staying at Home      object 
 7   Population Not Staying at Home  object 
 8   Number of Trips                 object 
 9   Number of Trips <1              object 
 10  Number of Trips 1-3             object 
 11  Number of Trips 3-5             object 
 12  Number of Trips 5-10            object 
 13  Number of Trips 10-25           object 
 14  Number of Trips 25-50           object 
 15  Number of Trips 50-100          object 
 16  Number of Trips 100-250         object 
 17  Number of Trips 250-500    

In [56]:
mobility.head(5)

Unnamed: 0,Level,Date,State FIPS,State Postal Code,County FIPS,County Name,Population Staying at Home,Population Not Staying at Home,Number of Trips,Number of Trips <1,Number of Trips 1-3,Number of Trips 3-5,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500
0,County,2019/01/01,29.0,MO,29171.0,Putnam County,1155.0,3587.0,12429.0,2807.0,3642.0,1272.0,1240.0,1953.0,1058.0,283.0,101.0,54.0,19.0
1,County,2019/01/01,2.0,AK,2164.0,Lake and Peninsula Borough,,,,,,,,,,,,,
2,County,2019/01/01,1.0,AL,1001.0,Autauga County,9624.0,45807.0,132004.0,27097.0,35263.0,18315.0,18633.0,22963.0,5149.0,2575.0,1592.0,322.0,95.0
3,County,2019/01/01,1.0,AL,1003.0,Baldwin County,44415.0,172941.0,534520.0,120752.0,142931.0,68235.0,87430.0,78045.0,24495.0,7079.0,3188.0,1693.0,672.0
4,County,2019/01/01,1.0,AL,1005.0,Barbour County,4782.0,20023.0,67658.0,15524.0,16677.0,10550.0,11674.0,6416.0,3686.0,2450.0,589.0,66.0,26.0
