# **Exploration of COVIDActNow data**

---

### *An introductory data collection and analysis project on COVID-19 data sourced from COVIDActNow.org*


Tara Ahuja, Jonathan Arp, Jesse Delacruz Jr., Derrick Ngo

---

The following code pulls data from the API and creates our initial dataframe and .csv file that each individual references for analysis.

In [1]:
#import dependencies
import requests
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
from config import covid_api

In [2]:
#setup base url and perform initial get request
covid_base_url = 'https://api.covidactnow.org/v2/states.json?apiKey='+covid_api
#the timeout keeps a stalled connection from hanging the notebook forever
covid_response = requests.get(covid_base_url, timeout=30)
#stop here on an HTTP error status (for example a rejected API key) instead of
#parsing the error body and passing it to the cells that loop over the state list
covid_response.raise_for_status()
covid_json = covid_response.json()

In [3]:
#create dataframe with one row per entry in the API response
#rows are collected in a plain list and the frame is built once at the end:
#calling pd.concat inside the loop copies every earlier row again on each pass
#and starts from an empty frame, which newer pandas versions warn about
state_rows = []
for state in covid_json:
    metrics = state['metrics']
    actuals = state['actuals']
    hospital_beds = actuals['hospitalBeds']
    state_rows.append({
        'State': state['state'],
        'Population': state['population'],

        'Vaccination Ratio': metrics['vaccinationsInitiatedRatio'],
        'Weekly New Cases (per 100k)': metrics['weeklyNewCasesPer100k'],
        'Infection Rate': metrics['infectionRate'],

        'Cases': actuals['cases'],
        'Deaths': actuals['deaths'],
        'Hospital Beds': hospital_beds['capacity'],
        #NOTE(review): this value is read from the 'hospitalBeds' entry, not from
        #an ICU-specific one - confirm the 'Covid ICU Usage' label is intended
        'Covid ICU Usage': hospital_beds['currentUsageCovid'],
    })
#dict key order sets the column order; the index is the default 0..n-1 range
states_df = pd.DataFrame(state_rows)

In [None]:
#write the state table to disk
#the row index is written as well (the pandas default, spelled out here), so the
#file starts with an unnamed index column that shows up as 'Unnamed: 0' on re-read
states_df.to_csv(path_or_buf='StatesCleaned.csv', index=True)

### **The following code is Jonathan's work to discuss 'Which states have really struggled and which have done relatively well in the United States across major statistics?' and 'Are there major differences in the extremes between the states or are they relatively similar?'**

In [None]:
#load the exported state table and add a 'Death Rate' column (Deaths divided by Cases)
data_path = "StatesCleaned.csv"
data_df = pd.read_csv(filepath_or_buffer=data_path)
data_df['Death Rate'] = data_df["Deaths"].div(data_df["Cases"])

In [None]:
#order the rows by vaccination ratio in both directions
highest_vaccination_rates = data_df.sort_values(by="Vaccination Ratio", ascending=False)
lowest_vaccination_rates = data_df.sort_values(by="Vaccination Ratio", ascending=True)

#NOTE(review): the "top" selection takes positions 1-5 of the descending sort and
#therefore leaves out the single highest row - confirm this exclusion is intentional
top_5_vaccinated = highest_vaccination_rates.iloc[1:6]
bottom_5_vaccinated = lowest_vaccination_rates.head(5)

#bars are drawn in three groups: the bottom five, the plain (unweighted) mean of
#the column labelled "USA", and the top selection
bar_groups = [
    (bottom_5_vaccinated["State"], bottom_5_vaccinated["Vaccination Ratio"]),
    ("USA", data_df["Vaccination Ratio"].mean()),
    (top_5_vaccinated["State"], top_5_vaccinated["Vaccination Ratio"]),
]

#create plots
plt.figure(figsize=(4, 3))
for bar_labels, bar_heights in bar_groups:
    plt.bar(bar_labels, bar_heights)
plt.xticks(rotation=45)
#the y axis starts at 0.5, so bar heights are not proportional to the values
plt.ylim(.5, 1)
plt.title("Bottom 5 & Top 5 Vaccinated States")
plt.xlabel("State")
plt.ylabel("Vaccination Ratio")
plt.show()

In [None]:
#order the rows by death rate in both directions
highest_death_rates = data_df.sort_values(by="Death Rate", ascending=False)
lowest_death_rates = data_df.sort_values(by="Death Rate", ascending=True)

top_5_death_rates = highest_death_rates.head(5)
#NOTE(review): the "bottom" selection takes positions 1-5 of the ascending sort and
#therefore leaves out the single lowest row - confirm this exclusion is intentional
bottom_5_death_rates = lowest_death_rates.iloc[1:6]

#bars are drawn in three groups: the bottom selection, the plain (unweighted) mean
#of the column labelled "USA", and the top five
bar_groups = [
    (bottom_5_death_rates["State"], bottom_5_death_rates["Death Rate"]),
    ("USA", data_df["Death Rate"].mean()),
    (top_5_death_rates["State"], top_5_death_rates["Death Rate"]),
]

#create plots
plt.figure(figsize=(4, 3))
for bar_labels, bar_heights in bar_groups:
    plt.bar(bar_labels, bar_heights)
plt.xticks(rotation=45)
plt.title("Bottom 5 & Top 5 Death Rates per State")
plt.xlabel("State")
plt.ylabel("Death Rate")
plt.show()

In [None]:
#order the rows by weekly new cases (per 100k) in both directions
highest_weekly_cases = data_df.sort_values(by="Weekly New Cases (per 100k)", ascending=False)
lowest_weekly_cases = data_df.sort_values(by="Weekly New Cases (per 100k)", ascending=True)

#NOTE(review): the "top" selection takes positions 1-5 of the descending sort and
#therefore leaves out the single highest row - confirm this exclusion is intentional
top_5_weekly_cases = highest_weekly_cases.iloc[1:6]
bottom_5_weekly_cases = lowest_weekly_cases.head(5)

#bars are drawn in three groups: the bottom five, the plain (unweighted) mean of
#the column labelled "USA", and the top selection
bar_groups = [
    (bottom_5_weekly_cases["State"], bottom_5_weekly_cases["Weekly New Cases (per 100k)"]),
    ("USA", data_df["Weekly New Cases (per 100k)"].mean()),
    (top_5_weekly_cases["State"], top_5_weekly_cases["Weekly New Cases (per 100k)"]),
]

#create plots
plt.figure(figsize=(4, 3))
for bar_labels, bar_heights in bar_groups:
    plt.bar(bar_labels, bar_heights)
plt.xticks(rotation=45)
#leave 50 units of headroom above the tallest bar of the top selection
plt.ylim(0, max(top_5_weekly_cases["Weekly New Cases (per 100k)"]) + 50)
plt.title("Bottom 5 & Top 5 Weekly New Cases (per 100k) per State")
plt.xlabel("State")
plt.ylabel("Weekly New Cases (per 100k)")
plt.show()

In [None]:
#order the rows by infection rate in both directions
highest_infection_rates = data_df.sort_values(by="Infection Rate", ascending=False)
lowest_infection_rates = data_df.sort_values(by="Infection Rate", ascending=True)

top_5_infection_rates = highest_infection_rates.head(5)
bottom_5_infection_rates = lowest_infection_rates.head(5)

#bars are drawn in three groups: the bottom five, the plain (unweighted) mean of
#the column labelled "USA", and the top five
bar_groups = [
    (bottom_5_infection_rates["State"], bottom_5_infection_rates["Infection Rate"]),
    ("USA", data_df["Infection Rate"].mean()),
    (top_5_infection_rates["State"], top_5_infection_rates["Infection Rate"]),
]

#create plots
plt.figure(figsize=(4, 3))
for bar_labels, bar_heights in bar_groups:
    plt.bar(bar_labels, bar_heights)
plt.xticks(rotation=45)
#the y axis starts at 0.9, so bar heights are not proportional to the values
plt.ylim(.9, 1.4)
plt.title("Bottom 5 & Top 5 Infection Rates per State")
plt.xlabel("State")
plt.ylabel("Infection Rate")
plt.show()

### **The following code is Derrick's work to take user input for two states and a comparison statistic. Later edits also compared those two states across all metrics.**

In [8]:
#load StatesCleaned.csv and discard the index column that was saved with it
states_data = pd.read_csv('StatesCleaned.csv').drop(columns='Unnamed: 0')

In [None]:
#create a function to prompt for user input
def prompt():
    """Ask the user for two state abbreviations and a metric number.

    The metric menu is numbered from 1 and lists ``states_data.columns[2:]``.
    Nothing is validated here.

    Returns:
        list: ``[state1, state2, metric_index]``, each entry exactly as typed
        (the metric number is still a string).
    """
    #the two states to compare
    first_state = input("Enter the two-letter abbreviation of the first state you'd like to compare: ")
    second_state = input("Enter the two-letter abbreviation of the second state you'd like to compare: ")

    #numbered menu of selectable metrics, one "<number>: <column>," entry per line
    menu = ''.join(
        f'{number}: {metric_name},\n'
        for number, metric_name in enumerate(states_data.columns[2:], start=1)
    )

    #the menu is appended directly to the question text
    question = f"Select the metric (type the number) you'd like to compare {first_state} and {second_state} on: "
    chosen_metric = input(question + menu)

    return [first_state, second_state, chosen_metric]

In [None]:
#prompt user and check for input errors; repeats until all three inputs are valid
#on success 'inputs' holds [state1, state2, metric number as typed] and
#'column_name' holds the selected metric column
known_states = set(states_data['State'])
#menu numbers start at 1 and map onto states_data.columns[2:]
metric_count = len(states_data.columns) - 2

while True:
    #prompt() is called outside any exception handler so that interrupting the
    #cell (KeyboardInterrupt) ends the loop instead of being swallowed and retried
    inputs = prompt()

    try:
        metric_number = int(inputs[2])
    except ValueError:
        #not a whole number: treat it like an out-of-range choice
        metric_number = 0

    #only numbers shown in the menu are accepted; 0 and negative numbers would
    #otherwise index a non-menu column or wrap around to the end of the columns
    metric_ok = 1 <= metric_number <= metric_count
    states_ok = inputs[0] in known_states and inputs[1] in known_states

    if metric_ok and states_ok:
        #if inputs are acceptable, move on
        column_name = states_data.columns[metric_number + 1]
        print('Inputs accepted...')
        break

    #if any input is incorrectly formatted (not two-letter abbreviation, or not a possible column number), retry input
    print('Those inputs were not accepted. Retrying... ')

This code creates the desired visualization from the user's input. After taking two state abbreviations and a selected metric (column) from the user, a graph comparing the stats for that metric is created.

In [1]:
#access the requested data in the dataframe and present it in a dual bar graph
x = [inputs[0], inputs[1]]
#value of the chosen metric for each state (first matching row)
y = [
    states_data.loc[states_data['State'] == state_code, column_name].values[0]
    for state_code in x
]

#create visualization, set attributes, save image, and display
plt.figure(facecolor='white')
plt.bar(x, y, edgecolor='black')

plt.xlabel('States')
plt.ylabel(column_name)
#format tick values with thousands separators and one decimal through a formatter;
#writing fixed label text from get_yticks() leaves the labels detached from the
#tick positions, so they can be wrong once the ticks are recomputed
#('x' inside the format string is the formatter's value field, not the list above)
plt.gca().yaxis.set_major_formatter('{x:,.1f}')
plt.title(f'{column_name} for {inputs[0]} and {inputs[1]}')

plt.tight_layout()
plt.savefig('singlecomparison.png')

plt.show()

NameError: name 'inputs' is not defined

This next block takes the same two states from the earlier input and plots ALL of the metrics.
It does this with a single loop that applies the same plot settings to every metric. As a result, the side-by-side graphs are not ideally scaled, but the process needs less code than parsing each metric individually.

In [None]:
#compare ALL metrics for chosen states
x = [inputs[0], inputs[1]]

#one subplot per column from position 2 onward (the same columns the metric menu lists)
metric_columns = states_data.columns[2:]

#create visualization and set attributes
#squeeze=False always returns a 2-D grid of axes, so a single metric column works too
fig, axs_grid = plt.subplots(1, len(metric_columns), sharex=True, squeeze=False)
tuple_axs = axs_grid[0]
fig.set_facecolor('white')
fig.suptitle(f'Comparison between {inputs[0]} and {inputs[1]}', fontsize=16, fontweight='bold')
#5 inches of width per subplot (35 x 5 for seven metrics)
fig.set_size_inches(5 * len(metric_columns), 5)

# loop through all metrics and create a standardized plot for each
for ax, column_name in zip(tuple_axs, metric_columns):
    #retrieve data for each metric (first matching row per state)
    y = [
        states_data.loc[states_data['State'] == state_code, column_name].values[0]
        for state_code in x
    ]

    #create plot, set attributes
    ax.bar(x, y, edgecolor='black', color=['sandybrown', 'dodgerblue'])
    #call set_ylabel; assigning to it would replace the method and set no label
    ax.set_ylabel(column_name)
    #a formatter keeps the tick text tied to the tick positions
    #('x' inside the format string is the formatter's value field)
    ax.yaxis.set_major_formatter('{x:,.1f}')
    ax.set_title(f'{column_name}')

#save and display graphic
fig.tight_layout()

plt.savefig('allmetrics.png')
plt.show()

### **The following code is Jesse's work to answer 'What is the correlation between Cases Reported and Vaccines Administered?'**

In [4]:
#create dataframe with vaccine info, one row per entry in the API response
#rows are collected in a plain list and the frame is built once at the end:
#calling pd.concat inside the loop copies every earlier row again on each pass
#and starts from an empty frame, which newer pandas versions warn about
vaccine_rows = []
for state in covid_json:
    actuals = state['actuals']
    vaccine_rows.append({
        'State': state['state'],

        'Vaccines Initiated (1st Dose)': actuals['vaccinationsInitiated'],
        'Vaccinations Completed (1st & 2nd Dose)': actuals['vaccinationsCompleted'],
        'Vaccinations Additional Dose (1st, 2nd Dose & Booster)': actuals['vaccinationsAdditionalDose'],
        'Vaccines Administered': actuals['vaccinesAdministered'],
        'Vaccines Distributed': actuals['vaccinesDistributed'],
    })
vaccines_df = pd.DataFrame(vaccine_rows)

#merge vaccine info with other covid data
#left join on 'State': every row of states_df is kept
states_df3 = states_df.merge(vaccines_df, how='left', on='State')

In [5]:
#write the merged state + vaccine table to disk
#the row index is written as well (the pandas default, spelled out here), so the
#file starts with an unnamed index column that shows up as 'Unnamed: 0' on re-read
states_df3.to_csv(path_or_buf='StatesCleaned_Vaccine.csv', index=True)

In [6]:
# Load the merged vaccine dataset, discard the saved index column, then drop
# every row that has a missing value (NaN) in any column
vacc_data = (
    pd.read_csv('StatesCleaned_Vaccine.csv')
    .drop(columns='Unnamed: 0')
    .dropna()
)

Unnamed: 0,State,Population,Vaccination Ratio,Weekly New Cases (per 100k),Infection Rate,Cases,Deaths,Hospital Beds,Covid ICU Usage,Vaccines Initiated (1st Dose),Vaccinations Completed (1st & 2nd Dose),"Vaccinations Additional Dose (1st, 2nd Dose & Booster)",Vaccines Administered,Vaccines Distributed
0,AK,731545,0.726,67.5,1.0,300732,1390,1875.0,48.0,531132.0,474396.0,231325.0,1295275.0,2036055.0
1,AL,4903185,0.647,118.4,1.34,1552527,21164,13527.0,441.0,3173348.0,2594922.0,1019451.0,6888090.0,11790000.0
2,AR,3017804,0.695,116.9,1.21,974385,12640,7372.0,315.0,2096050.0,1707152.0,750542.0,4768177.0,7923850.0
3,AZ,7278717,0.768,219.6,1.22,2352189,31824,14720.0,1142.0,5586984.0,4777579.0,2353283.0,14161294.0,18753570.0
4,CA,39512223,0.841,201.0,1.31,11620250,100197,63241.0,4990.0,33228217.0,29332815.0,17031375.0,85662449.0,113995935.0
5,CO,5758736,0.832,121.7,1.02,1718247,13724,10341.0,437.0,4790419.0,4210992.0,2402801.0,12682240.0,16590225.0
6,CT,3565287,0.95,114.1,1.11,931957,11603,7505.0,573.0,3620182.0,2944000.0,1618778.0,8801850.0,11275525.0
8,DE,973764,0.867,119.3,1.13,318567,3184,2569.0,198.0,844064.0,701410.0,347717.0,2023149.0,3123235.0
9,FL,21477737,0.82,72.0,1.34,7223005,83200,53653.0,2002.0,17603034.0,14833950.0,6451166.0,41294081.0,58367165.0
10,GA,10617423,0.679,91.4,1.43,2867072,39748,21494.0,1176.0,7209887.0,6042815.0,2611542.0,16665029.0,27215605.0


In [None]:
# Create a scatter plot for cases vs vaccines
x_values = vacc_data['Vaccines Initiated (1st Dose)']
x2_values = vacc_data['Vaccinations Completed (1st & 2nd Dose)']
x3_values = vacc_data['Vaccinations Additional Dose (1st, 2nd Dose & Booster)']
y_values = vacc_data['Cases']

fig1, ax1 = plt.subplots(figsize=(10,8))
# One series per dose level, all against the same case counts. Each series is
# labelled with its column name so the legend tells the three apart.
for dose_counts, edge_color in ((x_values, "g"), (x2_values, "r"), (x3_values, "b")):
    ax1.scatter(dose_counts, y_values, alpha = 0.50, edgecolors = edge_color,
                linewidths = 3, label = dose_counts.name)
ax1.set_xlabel('Vaccines Administered')
ax1.set_ylabel('Cases')
ax1.set_title("Vaccinations Administered (by Dose) vs Cases")
ax1.legend()
plt.savefig("scatter_allvaccines_cases.png", bbox_inches="tight")
plt.show()

In [None]:
# The Pearson correlation coefficient between "1st Vaccine Dose" and "Cases"
# Columns are selected by name instead of by position, so a change in column
# order cannot silently feed different columns into the calculation.
cases = vacc_data['Cases']
first_dose = vacc_data['Vaccines Initiated (1st Dose)']
# pearsonr returns the coefficient first, followed by the p-value
correlation = st.pearsonr(cases, first_dose)
print(f"The correlation between both factors is {round(correlation[0],2)}")

In [None]:
# VACCINE PIE
# NOTE(review): the three counts below are hard-coded; presumably they were
# derived from the vaccine table - TODO confirm the source and keep them in sync
vaccines = ["1st Dose", "2nd Dose", "Booster"]
actuals = [4410896, 4040479, 2517728]
colors = ["yellowgreen", "red", "lightskyblue"]
# only the first wedge is pulled away from the centre
explode = (0.05, 0, 0)

pie_options = dict(
    explode=explode,
    labels=vaccines,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
    startangle=90,
)

plt.title("Vaccine Dose Breakdown of Population")
plt.pie(actuals, **pie_options)
# equal axis scaling so the pie is drawn as a circle
plt.axis("equal")
plt.savefig("vacc_dose_pop_pie.png")
plt.show()

### **The following code is Tara's work to answer this question:...**