# Vaccine Data Process


__INPUT__ <br/>
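- The input is the OWID per-state US COVID-19 vaccination dataset (`us_state_vaccinations.csv`), with one row per location and date
- The vaccination counts are cumulative in nature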

__Core Idea__ <br/>
- Use feature engineering to reduce the daily time series to a simpler, one-row-per-state table that still conveys the same information

__Feature Engineering Idea__ <br/>
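- Sample each state's vaccination figures at 4 different points in time (around the 15th of February, May, August and November 2021)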

__Dealing with missing data__ <br/>
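- Some days have missing values, so figures for exactly the 15th are not always available; instead, one day with complete data is picked from within ±10 days of the 15th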

__OUTPUT__ <br/>
- One row per state containing the sampled vaccination figures, written to `../outputs/state_vac.csv`



In [1]:
# downloading the dataset 
!curl -l https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/us_state_vaccinations.csv > ../dataset/us_vac_og.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2252k  100 2252k    0     0  2582k      0 --:--:-- --:--:-- --:--:-- 2579k


In [2]:
import pandas as pd
from datetime import datetime
import datetime as datetime_og


In [3]:
# Load the downloaded vaccination CSV; the `date` column is parsed into datetimes
# so it can be compared against datetime objects further down.
vacDF = pd.read_csv(
    '../dataset/us_vac_og.csv',
    parse_dates=['date'],
)

In [4]:
# Peek at the first rows to check the columns and spot missing values.
vacDF.head()

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.15,1.59,7270.0,1.45,7.69,,,,0.207
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.19,1.71,9245.0,1.53,7.73,5906.0,5906.0,1205.0,0.222
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.88,,1.64,8.88,8260.0,7083.0,1445.0,0.212
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.28,2.05,13488.0,1.77,9.07,8267.0,7478.0,1525.0,0.226
4,2021-01-16,Alabama,,,,,,,,,,7498.0,1529.0,


In [5]:
# Number of rows in the raw vaccination table.
len(vacDF)

21860

In [6]:
# Column names available in the raw vaccination table.
vacDF.columns

Index(['date', 'location', 'total_vaccinations', 'total_distributed',
       'people_vaccinated', 'people_fully_vaccinated_per_hundred',
       'total_vaccinations_per_hundred', 'people_fully_vaccinated',
       'people_vaccinated_per_hundred', 'distributed_per_hundred',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'daily_vaccinations_per_million', 'share_doses_used'],
      dtype='object')

# Processing a State 

In [7]:
# Distinct location names found in the data.
states = vacDF['location'].unique()

# Walk through the processing for a single location first (position 6 in `states`).
state_picked = states[6]
print(f'State Picked : {state_picked}')

State Picked : California


In [8]:
# Keep only the rows belonging to the chosen location.
is_picked_state = vacDF['location'] == state_picked
state_df = vacDF.loc[is_picked_state]

# Display the rows in date order; the sorted copy is only shown, `state_df`
# itself keeps its original row order.
state_df.sort_values(by='date')

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
2022,2021-01-12,California,816301.0,3286050.0,703540.0,0.25,2.07,100089.0,1.78,8.32,,,,0.248
2023,2021-01-13,California,891489.0,3435650.0,744545.0,0.34,2.26,133689.0,1.88,8.70,75188.0,75188.0,1903.0,0.259
2024,2021-01-14,California,975293.0,3540175.0,801998.0,,2.47,,2.03,8.96,83804.0,79496.0,2012.0,0.275
2025,2021-01-15,California,1072959.0,3548575.0,865387.0,0.52,2.72,204374.0,2.19,8.98,97666.0,85553.0,2165.0,0.302
2026,2021-01-16,California,,,,,,,,,,88381.0,2237.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2354,2021-12-10,California,61965503.0,72676305.0,31730029.0,64.50,156.83,25484575.0,80.30,183.93,272440.0,239989.0,6074.0,0.853
2355,2021-12-11,California,62237831.0,73130425.0,31797338.0,64.63,157.52,25538138.0,80.47,185.08,272328.0,235451.0,5959.0,0.851
2356,2021-12-12,California,62512547.0,73131925.0,31859927.0,64.77,158.21,25593384.0,80.63,185.09,274716.0,230225.0,5827.0,0.855
2357,2021-12-13,California,62687689.0,73131685.0,31901262.0,64.88,158.65,25636045.0,80.74,185.09,175142.0,226881.0,5742.0,0.857


In [9]:
# First and last dates present for the chosen location.
state_dates = state_df['date']
min_date = state_dates.min()
max_date = state_dates.max()
print(min_date)
print(max_date)

2021-01-12 00:00:00
2021-12-14 00:00:00


In [10]:
# The 4 reference dates around which the vaccine numbers are sampled
# (the 15th of February, May, August and November 2021).
# Assumption: these dates capture most of the information about the rollout.
vacSample1 = datetime(2021, 2, 15)
vacSample2 = datetime(2021, 5, 15)
vacSample3 = datetime(2021, 8, 15)
vacSample4 = datetime(2021, 11, 15)

In [11]:
# Rows strictly within 10 days either side of the first sample date.
s1_half_window = datetime_og.timedelta(days=10)
s1_in_window = (
    (state_df['date'] > vacSample1 - s1_half_window)
    & (state_df['date'] < vacSample1 + s1_half_window)
)
state_df_sample = state_df[s1_in_window]

# Keep only the days on which every key metric is present, then draw one of them.
# The fixed random_state makes the draw repeatable across kernel restarts;
# without it each run of the notebook produced different features.
s1_key_columns = ['people_vaccinated_per_hundred',
                  'people_fully_vaccinated_per_hundred',
                  'people_vaccinated',
                  'people_fully_vaccinated',
                  'date']
state_df_sample_key = (
    state_df_sample[s1_key_columns]
    .dropna()
    .sample(n=1, random_state=0)
)

# Tag the row with its state (used as the merge key later) and namespace
# every column with the sample label.
state_df_sample_key['state'] = state_picked
state_df_sample_key_1 = state_df_sample_key.add_prefix('S1_')


In [12]:
# Inspect the rows that fell inside the date window selected by the sampling cell run last.
state_df_sample

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
2047,2021-02-06,California,4137600.0,6963500.0,3389906.0,1.84,10.47,727993.0,8.58,17.62,216968.0,148831.0,3767.0,0.594
2048,2021-02-07,California,4485166.0,6963500.0,3653538.0,2.05,11.35,808832.0,9.25,17.62,347566.0,171402.0,4338.0,0.644
2049,2021-02-08,California,4682862.0,6963500.0,3811715.0,2.15,11.85,847993.0,9.65,17.62,197696.0,175654.0,4446.0,0.672
2050,2021-02-09,California,4784478.0,7385225.0,3880253.0,2.22,12.11,875340.0,9.82,18.69,101616.0,177237.0,4486.0,0.648
2051,2021-02-10,California,4957297.0,7607100.0,3994969.0,2.35,12.55,928615.0,10.11,19.25,172819.0,183712.0,4649.0,0.652
2052,2021-02-11,California,5134864.0,7822300.0,4108216.0,2.51,13.0,990128.0,10.4,19.8,177567.0,201580.0,5102.0,0.656
2053,2021-02-12,California,5341104.0,7963575.0,4222810.0,2.74,13.52,1081040.0,10.69,20.15,206240.0,202925.0,5136.0,0.671
2054,2021-02-13,California,5562553.0,8051475.0,4345018.0,2.98,14.08,1178577.0,11.0,20.38,221449.0,203565.0,5152.0,0.691
2055,2021-02-14,California,5820388.0,8059325.0,4493220.0,3.26,14.73,1287390.0,11.37,20.4,257835.0,190746.0,4828.0,0.722
2056,2021-02-15,California,,,,,,,,,,190404.0,4819.0,


In [13]:
# Rows strictly within 10 days either side of the second sample date.
s2_half_window = datetime_og.timedelta(days=10)
s2_in_window = (
    (state_df['date'] > vacSample2 - s2_half_window)
    & (state_df['date'] < vacSample2 + s2_half_window)
)
state_df_sample = state_df[s2_in_window]

# Keep only the days on which every key metric is present, then draw one of them.
# The fixed random_state makes the draw repeatable across kernel restarts;
# without it each run of the notebook produced different features.
s2_key_columns = ['people_vaccinated_per_hundred',
                  'people_fully_vaccinated_per_hundred',
                  'people_vaccinated',
                  'people_fully_vaccinated',
                  'date']
state_df_sample_key = (
    state_df_sample[s2_key_columns]
    .dropna()
    .sample(n=1, random_state=0)
)

# Tag the row with its state (used as the merge key later) and namespace
# every column with the sample label.
state_df_sample_key['state'] = state_picked
state_df_sample_key_2 = state_df_sample_key.add_prefix('S2_')



In [14]:
# Rows strictly within 10 days either side of the third sample date.
s3_half_window = datetime_og.timedelta(days=10)
s3_in_window = (
    (state_df['date'] > vacSample3 - s3_half_window)
    & (state_df['date'] < vacSample3 + s3_half_window)
)
state_df_sample = state_df[s3_in_window]

# Keep only the days on which every key metric is present, then draw one of them.
# The fixed random_state makes the draw repeatable across kernel restarts;
# without it each run of the notebook produced different features.
s3_key_columns = ['people_vaccinated_per_hundred',
                  'people_fully_vaccinated_per_hundred',
                  'people_vaccinated',
                  'people_fully_vaccinated',
                  'date']
state_df_sample_key = (
    state_df_sample[s3_key_columns]
    .dropna()
    .sample(n=1, random_state=0)
)

# Tag the row with its state (used as the merge key later) and namespace
# every column with the sample label.
state_df_sample_key['state'] = state_picked
state_df_sample_key_3 = state_df_sample_key.add_prefix('S3_')





In [15]:
# Rows strictly within 10 days either side of the fourth sample date.
s4_half_window = datetime_og.timedelta(days=10)
s4_in_window = (
    (state_df['date'] > vacSample4 - s4_half_window)
    & (state_df['date'] < vacSample4 + s4_half_window)
)
state_df_sample = state_df[s4_in_window]

# Keep only the days on which every key metric is present, then draw one of them.
# The fixed random_state makes the draw repeatable across kernel restarts;
# without it each run of the notebook produced different features.
s4_key_columns = ['people_vaccinated_per_hundred',
                  'people_fully_vaccinated_per_hundred',
                  'people_vaccinated',
                  'people_fully_vaccinated',
                  'date']
state_df_sample_key = (
    state_df_sample[s4_key_columns]
    .dropna()
    .sample(n=1, random_state=0)
)

# Tag the row with its state (used as the merge key later) and namespace
# every column with the sample label.
state_df_sample_key['state'] = state_picked
state_df_sample_key_4 = state_df_sample_key.add_prefix('S4_')






In [16]:
# Join the four single-row samples side by side, matching on the state name.
# NOTE: this rebinds state_df_sample_key_4, so re-running the cell without
# re-running the S4 sampling cell first merges onto an already-merged frame.
state_df_sample_key_4 = (
    state_df_sample_key_4
    .merge(state_df_sample_key_3, left_on='S4_state', right_on='S3_state')
    .merge(state_df_sample_key_2, left_on='S4_state', right_on='S2_state')
    .merge(state_df_sample_key_1, left_on='S4_state', right_on='S1_state')
)

In [17]:
# Columns after the merges: each sample contributes its own S<n>_-prefixed copy of every field.
state_df_sample_key_4.columns

Index(['S4_people_vaccinated_per_hundred',
       'S4_people_fully_vaccinated_per_hundred', 'S4_people_vaccinated',
       'S4_people_fully_vaccinated', 'S4_date', 'S4_state',
       'S3_people_vaccinated_per_hundred',
       'S3_people_fully_vaccinated_per_hundred', 'S3_people_vaccinated',
       'S3_people_fully_vaccinated', 'S3_date', 'S3_state',
       'S2_people_vaccinated_per_hundred',
       'S2_people_fully_vaccinated_per_hundred', 'S2_people_vaccinated',
       'S2_people_fully_vaccinated', 'S2_date', 'S2_state',
       'S1_people_vaccinated_per_hundred',
       'S1_people_fully_vaccinated_per_hundred', 'S1_people_vaccinated',
       'S1_people_fully_vaccinated', 'S1_date', 'S1_state'],
      dtype='object')

In [18]:
# Remove the redundant per-sample state copies (S1_state is kept as the single
# state label) and the sampled dates, leaving only the vaccination figures.
redundant_columns = [
    'S2_state', 'S3_state', 'S4_state',
    'S1_date', 'S2_date', 'S3_date', 'S4_date',
]
state_df_sample_key_4 = state_df_sample_key_4.drop(columns=redundant_columns)

In [19]:
# Resulting single-row feature frame for the chosen state.
state_df_sample_key_4

Unnamed: 0,S4_people_vaccinated_per_hundred,S4_people_fully_vaccinated_per_hundred,S4_people_vaccinated,S4_people_fully_vaccinated,S3_people_vaccinated_per_hundred,S3_people_fully_vaccinated_per_hundred,S3_people_vaccinated,S3_people_fully_vaccinated,S2_people_vaccinated_per_hundred,S2_people_fully_vaccinated_per_hundred,S2_people_vaccinated,S2_people_fully_vaccinated,S1_people_vaccinated_per_hundred,S1_people_fully_vaccinated_per_hundred,S1_people_vaccinated,S1_people_fully_vaccinated,S1_state
0,77.58,62.63,30655162.0,24748277.0,67.83,54.93,26802699.0,21702126.0,51.05,34.29,20172726.0,13547291.0,9.25,2.05,3653538.0,808832.0,California


# Generalizing for all states

In [20]:
# Build one wide feature row per location: the key vaccination metrics sampled
# near four reference dates (S1..S4). Each row is a single-row DataFrame
# appended to FINAL_DATA.
FINAL_DATA = []

# Locations that are skipped when building the output.
EXCLUDED_LOCATIONS = {
    'Bureau of Prisons',
    'Dept of Defense',
    'Long Term Care',
    'Veterans Health',
}

# The 4 reference dates around which the vaccine numbers are sampled,
# assuming these dates capture a lot of information about the rollout.
# They do not depend on the state, so they are defined once outside the loop.
vacSample1 = datetime.strptime("15/2/21", "%d/%m/%y")
vacSample2 = datetime.strptime("15/5/21", "%d/%m/%y")
vacSample3 = datetime.strptime("15/8/21", "%d/%m/%y")
vacSample4 = datetime.strptime("15/11/21", "%d/%m/%y")

# Column prefix -> reference date. Ordered S4 -> S1 so the output columns come
# out as S4_*, S3_*, S2_*, S1_* followed by S1_state.
SAMPLE_DATES = {
    'S4': vacSample4,
    'S3': vacSample3,
    'S2': vacSample2,
    'S1': vacSample1,
}

# Half-width of the sampling window around each reference date (exclusive bounds).
SAMPLE_WINDOW = datetime_og.timedelta(days=10)

# Fixed seed so the day drawn from each window is the same on every run.
SAMPLE_SEED = 0

# Metrics copied into the output for every sample.
KEY_COLUMNS = ['people_vaccinated_per_hundred',
               'people_fully_vaccinated_per_hundred',
               'people_vaccinated',
               'people_fully_vaccinated']


def _pick_sample_row(location_df, center, location, label):
    """Return the key metrics of one day drawn from the window around `center`.

    Parameters
    ----------
    location_df : DataFrame with the rows of a single location.
    center : reference datetime; rows strictly within SAMPLE_WINDOW of it qualify.
    location, label : location name and sample prefix, used only in the error message.

    Returns
    -------
    Series indexed by KEY_COLUMNS.

    Raises
    ------
    ValueError
        If no day in the window has all key metrics present.
    """
    in_window = (
        (location_df['date'] > center - SAMPLE_WINDOW)
        & (location_df['date'] < center + SAMPLE_WINDOW)
    )
    # Only days with every key metric (and the date) present are candidates.
    candidates = location_df.loc[in_window, KEY_COLUMNS + ['date']].dropna()
    if candidates.empty:
        raise ValueError(
            f'No complete vaccination row for {location!r} within '
            f'{SAMPLE_WINDOW.days} days of {center.date()} (sample {label})'
        )
    return candidates.sample(n=1, random_state=SAMPLE_SEED)[KEY_COLUMNS].iloc[0]


# Picking a state
states = vacDF['location'].unique()

for state_picked in states:
    print(f'State Picked : {state_picked}')

    if state_picked in EXCLUDED_LOCATIONS:
        continue

    # sort_values returns a new frame, so the result has to be assigned for the
    # date ordering to take effect.
    state_df = vacDF[vacDF['location'] == state_picked].sort_values(by='date')

    min_date = state_df['date'].min()
    print(min_date)
    max_date = state_df['date'].max()
    print(max_date)

    # Collect the prefixed metrics of all four samples into one flat record.
    record = {}
    for label, center in SAMPLE_DATES.items():
        picked = _pick_sample_row(state_df, center, state_picked, label)
        for column in KEY_COLUMNS:
            record[f'{label}_{column}'] = picked[column]
    # The state label is kept under the S1_ prefix, as the last column.
    record['S1_state'] = state_picked

    FINAL_DATA.append(pd.DataFrame([record]))


State Picked : Alabama
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Alaska
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : American Samoa
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Arizona
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Arkansas
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Bureau of Prisons
State Picked : California
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Colorado
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Connecticut
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Delaware
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Dept of Defense
State Picked : District of Columbia
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Federated States of Micronesia
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Florida
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Georgia
2021-01-12 00:00:00
2021-12-14 00:00:00
State Picked : Guam
2021-01-12 00:00:00
2021-12-14 00:00:00
State

In [21]:
# Stack the per-state single-row frames into one table. Every piece carries the
# same index label (0), so renumber the rows to give each state a unique label.
OUTPUT = pd.concat(FINAL_DATA, ignore_index=True)

In [22]:
# Persist the per-state feature table; the row index is written as the first column.
OUTPUT.to_csv(
    '../outputs/state_vac.csv',
    index=True,
)