In [1]:
# Did this push to my branch?

# Project 1 Using Python, Pandas and Matplotlib
## Comparing Monthly Alcohol Sales, COVID Infection Rates and Shutdown Measures Across 16 U.S. States From May to June 2020
## Group 3: Phil, Rob, Diane, Gurupdesh
### The information on alcohol consumption in the U.S. during the period March 2020 through June 2020 can be found at the National Institute on Alcohol Abuse and Alcoholism website:
summary: https://pubs.niaaa.nih.gov/publications/surveillance-covid-19/COVSALES.htm
### The data file "alcsales_June2020.xlsx" was downloaded from the above page, then saved as a renamed csv file (for import into Pandas) as "NIH_Alcohol_Sales_Thru_Jun2020.csv"
data file: https://pubs.niaaa.nih.gov/publications/surveillance-covid-19/alcsales_June2020.xlsx
### Definitions
Updated 08/27/2020
This file contains data on per capita alcohol sales from 16 states: Alaska, Arkansas, Colorado, Connecticut, Florida, Illinois, Kansas, Kentucky, Louisiana, Massachusetts, Missouri, North Dakota, Oregon, Texas, Virginia, and Wisconsin by type of alcoholic beverage from January 2017 through June 2020. Note: Monthly data are not available for spirits in Oregon, wine in Kansas, and beer in Louisiana and Wisconsin. This file includes updates to sales estimates from the previous May 2020 file. Figures were rounded for display purposes after calculation.

For an understanding of the methods used in calculating per capita gallons of ethanol (pure alcohol), see: Slater, M.E., and Alpert, H.R., 2020. Surveillance Report #115: Apparent Per Capita Alcohol Consumption: National, State, and Regional Trends, 1977-2018. Bethesda, MD: NIAAA. Available to download in PDF and HTM formats at: http://pubs.niaaa.nih.gov/publications/surveillance.htm

In [1]:
# Import Dependencies
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Relative locations of the source csv files (both live in the Data/ folder)
covid_tracking_path = "Data/COVID_Tracking_By_State_Thru_2020_10_09.csv"
nih_alcohol_path = "Data/NIH_Alcohol_Sales_Thru_Jun2020.csv"

# Echo each file location; the "URL: " field is still blank
# TODO: fill in the link to the original data for each file
print(f"COVID Tracking by State Data: {covid_tracking_path}, URL: ")
print(f"NIH Alcohol Consumption Data: {nih_alcohol_path}, URL: ")

# TODO: create a csv data set/file for the New York Times shutdown-by-state
# information and develop a scoring methodology; print its url/link as well

COVID Tracking by State Data: Data/COVID_Tracking_By_State_Thru_2020_10_09.csv, URL: 
NIH Alcohol Consumption Data: Data/NIH_Alcohol_Sales_Thru_Jun2020.csv, URL: 


In [3]:
# Load the COVID tracking csv into a Pandas data frame
covid_df = pd.read_csv(filepath_or_buffer=covid_tracking_path)

# Preview the first five rows (covid_df.columns lists only the headers)
covid_df.head(n=5)

Unnamed: 0,date,state,dataQualityGrade,death,deathConfirmed,deathIncrease,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,...,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease
0,10/9/2020,AK,A,60.0,60.0,0,,,,51.0,...,493070.0,1899,,,,,,0,493070.0,1899
1,10/9/2020,AL,A,2653.0,2496.0,16,157.0,17989.0,17989.0,816.0,...,1196452.0,11742,,,59957.0,,,0,1196452.0,11742
2,10/9/2020,AR,A+,1503.0,1359.0,0,144.0,5805.0,5805.0,546.0,...,1111526.0,0,,21856.0,,25538.0,,0,1111526.0,0
3,10/9/2020,AS,D,0.0,,0,,,,,...,1616.0,0,,,,,,0,1616.0,0
4,10/9/2020,AZ,A+,5746.0,5460.0,3,286.0,20199.0,20199.0,706.0,...,1543445.0,12350,300333.0,,,,1543445.0,12350,,0


In [4]:
# Readable labels for the first six COVID tracking columns;
# every column not listed here keeps its original name
covid_column_labels = {
    'date': 'Date',
    'state': 'State',
    'dataQualityGrade': 'Data Quality Grade',
    'death': 'Deaths',
    'deathConfirmed': 'Deaths Confirmed',
    'deathIncrease': 'Increase in Deaths',
}
covid_df = covid_df.rename(columns=covid_column_labels)
covid_df.head()

Unnamed: 0,Date,State,Data Quality Grade,Deaths,Deaths Confirmed,Increase in Deaths,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,...,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease
0,10/9/2020,AK,A,60.0,60.0,0,,,,51.0,...,493070.0,1899,,,,,,0,493070.0,1899
1,10/9/2020,AL,A,2653.0,2496.0,16,157.0,17989.0,17989.0,816.0,...,1196452.0,11742,,,59957.0,,,0,1196452.0,11742
2,10/9/2020,AR,A+,1503.0,1359.0,0,144.0,5805.0,5805.0,546.0,...,1111526.0,0,,21856.0,,25538.0,,0,1111526.0,0
3,10/9/2020,AS,D,0.0,,0,,,,,...,1616.0,0,,,,,,0,1616.0,0
4,10/9/2020,AZ,A+,5746.0,5460.0,3,286.0,20199.0,20199.0,706.0,...,1543445.0,12350,300333.0,,,,1543445.0,12350,,0


In [5]:
# Print values for 'State' (the column was renamed from 'state' in the cell above)
# covid_df['State'].value_counts()

In [8]:
# DATA CLEANING: Replace 'State' abbreviations with full names for readability.
# The mapping starts with the 16 states that have alcohol consumption data,
# followed by the remaining states, the District of Columbia and five
# U.S. territories.
state_names = {
    # The 16 states that have alcohol consumption data
    'AK':'Alaska',
    'AR':'Arkansas',
    'CO':'Colorado',
    'CT':'Connecticut',
    'FL':'Florida',
    'IL':'Illinois',
    'KS':'Kansas',
    'KY':'Kentucky',
    'LA':'Louisiana',
    'MA':'Massachusetts',
    'MO':'Missouri',
    'ND':'North Dakota',
    'OR':'Oregon',
    'TX':'Texas',
    'VA':'Virginia',
    'WI':'Wisconsin',
    # Remaining states and the District of Columbia
    'WA':'Washington',
    'NJ':'New Jersey',
    'MI':'Michigan',
    'RI':'Rhode Island',
    'AZ':'Arizona',
    'HI':'Hawaii',
    'NC':'North Carolina',
    'CA':'California',
    'GA':'Georgia',
    'SC':'South Carolina',
    'NY':'New York',
    'NH':'New Hampshire',
    'MD':'Maryland',
    'NV':'Nevada',
    # Spelling corrected from 'Tennesee' so the name matches other data sets
    'TN':'Tennessee',
    'NE':'Nebraska',
    'OH':'Ohio',
    'DC':'District of Columbia',
    'IA':'Iowa',
    'NM':'New Mexico',
    'WV':'West Virginia',
    'IN':'Indiana',
    'MS':'Mississippi',
    'MN':'Minnesota',
    'PA':'Pennsylvania',
    'VT':'Vermont',
    'DE':'Delaware',
    'MT':'Montana',
    'AL':'Alabama',
    'ME':'Maine',
    'WY':'Wyoming',
    'UT':'Utah',
    'ID':'Idaho',
    'OK':'Oklahoma',
    'SD':'South Dakota',
    # U.S. territories
    'VI':'Virgin Islands',
    'AS':'American Samoa',
    'GU':'Guam',
    'MP':'Northern Mariana Islands',
    'PR':'Puerto Rico'
}

# Series.replace only touches values that appear as keys in the mapping, so
# re-running this cell on already-expanded names leaves the column unchanged.
covid_df['State'] = covid_df['State'].replace(state_names)

# Show how many rows each State contributes
covid_df['State'].value_counts()

Washington                  262
Massachusetts               262
New Jersey                  243
Virginia                    226
Michigan                    223
Rhode Island                223
Illinois                    220
North Carolina              220
Florida                     220
New York                    220
Texas                       220
New Hampshire               220
Oregon                      220
California                  220
Arizona                     220
South Carolina              220
Hawaii                      220
Georgia                     220
Wisconsin                   220
Colorado                    219
District of Columbia        219
Maryland                    219
Tennesee                    219
Nevada                      219
Nebraska                    219
Ohio                        219
Pennsylvania                218
Vermont                     218
Minnesota                   218
Arkansas                    218
West Virginia               218
Alaska  

In [9]:
# Load the NIH alcohol sales csv into a Pandas data frame
alcohol_df = pd.read_csv(filepath_or_buffer=nih_alcohol_path)

# Show the column headers of the data frame
alcohol_df.columns

Index(['Year', 'Month', 'FIPS', 'Beverage', 'Gallons', 'Ethanol', 'Population',
       'PerCapita', 'PerCapita3yr', 'PctChange'],
      dtype='object')

In [10]:
# Count the rows recorded under each 'FIPS' code (the codes identify States)
alcohol_df.loc[:, 'FIPS'].value_counts()

48    126
38    126
29    126
25    126
21    126
17    126
12    126
5     126
2     126
51    125
8     123
9     120
55     84
41     84
22     84
20     84
Name: FIPS, dtype: int64

In [11]:
# DATA CLEANING: Swap each numeric FIPS code for its State name for readability
fips_to_state = {
    2: 'Alaska',
    5: 'Arkansas',
    8: 'Colorado',
    9: 'Connecticut',
    12: 'Florida',
    17: 'Illinois',
    20: 'Kansas',
    21: 'Kentucky',
    22: 'Louisiana',
    25: 'Massachusetts',
    29: 'Missouri',
    38: 'North Dakota',
    41: 'Oregon',
    48: 'Texas',
    51: 'Virginia',
    55: 'Wisconsin',
}
# The column keeps the name 'FIPS' here; only its values change
alcohol_df['FIPS'] = alcohol_df['FIPS'].replace(fips_to_state)
alcohol_df.head()

Unnamed: 0,Year,Month,FIPS,Beverage,Gallons,Ethanol,Population,PerCapita,PerCapita3yr,PctChange
0,2017,1,Alaska,1,103645,42598,593378,0.0718,,
1,2017,1,Arkansas,1,402885,165586,2455022,0.0674,,
2,2017,1,Colorado,1,733857,301615,4634346,0.0651,,
3,2017,1,Connecticut,1,412100,169373,3015481,0.0562,,
4,2017,1,Florida,1,2770686,1138752,17722275,0.0643,,


In [12]:
# DATA CLEANING: Swap each numeric Beverage code for its name for readability
beverage_names = {1: 'Spirits', 2: 'Wine', 3: 'Beer'}
alcohol_df['Beverage'] = alcohol_df['Beverage'].replace(beverage_names)
alcohol_df.head()

Unnamed: 0,Year,Month,FIPS,Beverage,Gallons,Ethanol,Population,PerCapita,PerCapita3yr,PctChange
0,2017,1,Alaska,Spirits,103645,42598,593378,0.0718,,
1,2017,1,Arkansas,Spirits,402885,165586,2455022,0.0674,,
2,2017,1,Colorado,Spirits,733857,301615,4634346,0.0651,,
3,2017,1,Connecticut,Spirits,412100,169373,3015481,0.0562,,
4,2017,1,Florida,Spirits,2770686,1138752,17722275,0.0643,,


In [13]:
# DATA CLEANING: Swap each Month number (1-12) for its three-letter name
month_abbreviations = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
# enumerate(..., start=1) pairs 1 with 'Jan' through 12 with 'Dec'
alcohol_df['Month'] = alcohol_df['Month'].replace(
    dict(enumerate(month_abbreviations, start=1))
)
alcohol_df.head()

Unnamed: 0,Year,Month,FIPS,Beverage,Gallons,Ethanol,Population,PerCapita,PerCapita3yr,PctChange
0,2017,Jan,Alaska,Spirits,103645,42598,593378,0.0718,,
1,2017,Jan,Arkansas,Spirits,402885,165586,2455022,0.0674,,
2,2017,Jan,Colorado,Spirits,733857,301615,4634346,0.0651,,
3,2017,Jan,Connecticut,Spirits,412100,169373,3015481,0.0562,,
4,2017,Jan,Florida,Spirits,2770686,1138752,17722275,0.0643,,


In [16]:
# Relabel 'FIPS' as 'State' and 'Beverage' as 'Beverage Type' for readability;
# all other columns keep their names
alcohol_column_labels = {"FIPS": "State", "Beverage": "Beverage Type"}
alcohol_df = alcohol_df.rename(columns=alcohol_column_labels)
alcohol_df.head()

Unnamed: 0,Year,Month,State,Beverage Type,Gallons,Ethanol,Population,PerCapita,PerCapita3yr,PctChange
0,2017,Jan,Alaska,Spirits,103645,42598,593378,0.0718,,
1,2017,Jan,Arkansas,Spirits,402885,165586,2455022,0.0674,,
2,2017,Jan,Colorado,Spirits,733857,301615,4634346,0.0651,,
3,2017,Jan,Connecticut,Spirits,412100,169373,3015481,0.0562,,
4,2017,Jan,Florida,Spirits,2770686,1138752,17722275,0.0643,,


In [None]:
# Merge covid_df and alcohol_df on 'State'

In [None]:
# Filter dataframe to only "my" states: Florida, Illinois, Kansas and Kentucky (?)

In [None]:
# 