Exploratory Data Analysis of COVID-19 Data

Created by: Zain Bacchus

Created on: 03-27-2020

# Import Libraries


In [185]:
import numpy as np
import pandas as pd
import plotly as py
import plotly.figure_factory as ff




# Data Imports

In [186]:
df = pd.read_csv("../Python Notebooks/covid-19-data/us-counties.csv")

df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [187]:
df.tail()

Unnamed: 0,date,county,state,fips,cases,deaths
12372,2020-03-25,Natrona,Wyoming,56025.0,6,0
12373,2020-03-25,Park,Wyoming,56029.0,1,0
12374,2020-03-25,Sheridan,Wyoming,56033.0,4,0
12375,2020-03-25,Sweetwater,Wyoming,56037.0,1,0
12376,2020-03-25,Teton,Wyoming,56039.0,6,0


In [188]:
df.dtypes

date       object
county     object
state      object
fips      float64
cases       int64
deaths      int64
dtype: object

In [189]:
# rows, cols
df.shape

(12377, 6)

In [190]:
duplicate_rows_df = df[df.duplicated()]
print('number of duplicate rows: ', duplicate_rows_df.shape)

number of duplicate rows:  (0, 6)


In [191]:
# check for nulls
print(df.isnull().sum())

date        0
county      0
state       0
fips      159
cases       0
deaths      0
dtype: int64


In [192]:
# Nulls are attributed to NYT grouping all 5 boroughs in NYC into one
# for the purpose of visualization will map all nulls to 36061 (Manhattan)
# reference: https://github.com/nytimes/covid-19-data/issues/6
df["fips"].fillna(36061, inplace = True)

In [193]:
# check for nulls
print(df.isnull().sum())

date      0
county    0
state     0
fips      0
cases     0
deaths    0
dtype: int64


In [194]:
# A fips code should be a 2-digit state code followed by a 3-digit county code
# reference: https://www.fcc.gov/general/form-477-census-tract-information
df.astype({'fips': 'int64'}).dtypes

date      object
county    object
state     object
fips       int64
cases      int64
deaths     int64
dtype: object

In [195]:
df.sort_values(by =['date', 'cases'], ascending=False)

Unnamed: 0,date,county,state,fips,cases,deaths
11792,2020-03-25,New York City,New York,36061.0,20011,280
11790,2020-03-25,Westchester,New York,36119.0,4691,6
11765,2020-03-25,Nassau,New York,36059.0,3285,17
11782,2020-03-25,Suffolk,New York,36103.0,2260,20
11202,2020-03-25,Cook,Illinois,17031.0,1418,13
...,...,...,...,...,...,...
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0


In [196]:
df = df[df['date'] == '2020-03-25']
df.sort_values(by =['cases'], ascending=False)

Unnamed: 0,date,county,state,fips,cases,deaths
11792,2020-03-25,New York City,New York,36061.0,20011,280
11790,2020-03-25,Westchester,New York,36119.0,4691,6
11765,2020-03-25,Nassau,New York,36059.0,3285,17
11782,2020-03-25,Suffolk,New York,36103.0,2260,20
11202,2020-03-25,Cook,Illinois,17031.0,1418,13
...,...,...,...,...,...,...
11204,2020-03-25,DeKalb,Illinois,17037.0,1,0
11205,2020-03-25,Douglas,Illinois,17041.0,1,0
11968,2020-03-25,Bradford,Pennsylvania,42015.0,1,0
11207,2020-03-25,Grundy,Illinois,17063.0,1,0


In [197]:
fips = df["fips"]
values = df["cases"]

fig = ff.create_choropleth(fips=fips, values=values)
py.iplot(fig, filename='choropleth of some cali counties - full usa scope')

ValueError: 
The create_choropleth figure factory requires the plotly-geo package.
Install using pip with:

$ pip install plotly-geo

Or, install using conda with

$ conda install -c plotly plotly-geo
