# COVID-19 Data Analysis

##  Data for Cook, Illinois

In [1]:
#Environment set-up

import matplotlib.pyplot as plot
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np

### Data Collection

In [2]:
#Retrieve data
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')

In [3]:
#Display dataframe
df

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0
...,...,...,...,...,...,...
933493,2021-01-15,Sweetwater,Wyoming,56037.0,3262,26.0
933494,2021-01-15,Teton,Wyoming,56039.0,2605,4.0
933495,2021-01-15,Uinta,Wyoming,56041.0,1806,9.0
933496,2021-01-15,Washakie,Wyoming,56043.0,839,23.0


### Data Wrangling

In [4]:
# 'fips' column gives the specific state and county code
cook_illinois = df[(df["fips"]== 17031.0)]
cook_illinois

Unnamed: 0,date,county,state,fips,cases,deaths
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
6,2020-01-25,Cook,Illinois,17031.0,1,0.0
11,2020-01-26,Cook,Illinois,17031.0,1,0.0
16,2020-01-27,Cook,Illinois,17031.0,1,0.0
21,2020-01-28,Cook,Illinois,17031.0,1,0.0
...,...,...,...,...,...,...
917887,2021-01-11,Cook,Illinois,17031.0,419471,8811.0
921132,2021-01-12,Cook,Illinois,17031.0,422159,8841.0
924377,2021-01-13,Cook,Illinois,17031.0,424217,8870.0
927623,2021-01-14,Cook,Illinois,17031.0,426658,8889.0


In [5]:
# Display columns
cook_illinois.columns

Index(['date', 'county', 'state', 'fips', 'cases', 'deaths'], dtype='object')

In [6]:
# drop column 'fips'
pd.set_option('mode.chained_assignment', None)
drop_cols = ['fips']
cook_illinois= cook_illinois.drop(drop_cols, inplace = True, axis=1)

In [9]:
# Display new dataframe:"cook_illinois"
cook_illinois

In [8]:
# Check for missing values
cook_illinois.isnull().sum().sum()

AttributeError: 'NoneType' object has no attribute 'isnull'

In [None]:
# Round data to whole number
pd.set_option('precision', 0)
cook_illinois.round()

In [None]:
# Check if new index 'date' will be unique
cook_illinois['date'].is_unique

In [None]:
# Set 'date' as new index
cook_illinois = cook_illinois.set_index("date")
cook_illinois.head()

In [None]:
#Set new data frame, "cook_illinois1", for a range of 30 days
cook_illinois1= cook_illinois.loc['2020-04-24': '2020-05-24']
cook_illinois1

In [None]:
#Gather statistics for data
cook_illinois1.describe()

### Data Visualization

In [None]:
#Display a scatterplot of number of deaths during the 30 day range for Cook, Illinois
fig_dims = (35,15)
fig, ax = plot.subplots(figsize=fig_dims)
sns.scatterplot(x="date", y="deaths", hue="deaths", size="deaths", sizes=(20, 200), ax=ax, data=cook_illinois1)

In [None]:
#Display a bar graph with the number of cases and deaths for Cook, Illinois within the 30 days range
plot.figure(figsize=(18,8))
ax = sns.barplot(x="cases", y="deaths", data=cook_illinois1)

### Conclusions

Overtime, the number of cases and deaths gradually increased in Cook, Illinois

By 2020-05-24,the maximum number of cases was 72,010.  

## Data for New York

In [None]:
#Create a new dataframe to display all the data from New York State. 
new_york = df[(df['state'] == 'New York')]
new_york

### Data Wrangling

In [None]:
new_york.columns

In [None]:
#Drop 'fips' column
drop_cols = ['fips']
new_york.drop(drop_cols, inplace = True, axis=1)

In [None]:
#Display cleaned data set
new_york

In [None]:
#Display New York statistics
new_york.describe()

In [None]:
#Create new dataframe, "new_york30", to display date ranges from 2020-03-01 to 2020-04-01
new_york30= new_york[(new_york['date'] > '2020-03-01') & (new_york['date'] < '2020-04-01')]
new_york30

In [None]:
#Determine "new_york30" statistics
new_york30.describe()

### Data Visualization

In [None]:
#Display the number of deaths during the 30 day range for New York
fig_dims = (35,15)
fig, ax = plot.subplots(figsize=fig_dims)
sns.scatterplot(x="date", y="deaths", hue="deaths", size="deaths", sizes=(40, 200), ax=ax, data=new_york30)

### Conclusions

New York had a more drastic increase in COVID-19 cases towards the end of March than in the beginning. 

The maximum number of cases by 2020-03-31 was 43,518.

## Data for Northeast Region

In [None]:
#Create 'state' as the new index for dataframe "states"
states= df.set_index(['state'])

In [None]:
#The dataframe locates the data for New York, New Jersey, and Connecticut from the index
states= states.loc[['New York', 'New Jersey', 'Connecticut']]

In [None]:
#Display the dataframe "states"
states

### Data Wrangling

In [None]:
#Drop the column 'fips'
drop_cols = ['fips']
states.drop(drop_cols, inplace = True, axis=1)

In [None]:
#Display the total number of cases for each of the states in descending order
states.groupby(['state'])['cases'].count().sort_values(ascending=False)

### Data Visualization

In [None]:
#Display a bar graph of the average number of deaths for each state in descending order
states.groupby(['state'])['deaths'].mean().sort_values(ascending=False).plot(kind='bar')

In [None]:
#Display a graph of the total number of cadses and deaths for each state
states.plot()

In [None]:
#Display a bar graph of the total number of cases for each state 
fig, ax = plot.subplots(figsize=(6, 6))
ax.bar(states.index.values, states['cases'], color='purple')
ax.set(xlabel= "state", ylabel= "cases", title="COVID-19 Cases in the Northeast Region")

### Conclusions 

New York had the highest number of COVID-19 cases and highest number of average deaths in comparison to New Jersey and Connecticut. 

The dramatic difference in numbers can conclude that New York had the highest number of cases in the Northeast region. 
