In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Normalizer
import urllib.request
import warnings 
warnings.simplefilter('ignore')
import plotly.express as px
%matplotlib inline

In [2]:
# Notebook-wide configuration
# Render every column when a DataFrame is displayed
pd.options.display.max_columns = None
# Root folder of the project's data files (machine-specific absolute path)
DATA_DIR = r'/Users/yotamdery/Old_Desktop/git/SEIR_model_COVID-main/Data'

In [3]:
# Accessing the data.gov.il datastore API - "vaccinated by locality" resource
url = 'https://data.gov.il/api/3/action/datastore_search?resource_id=12c9045c-1bf4-478a-a9e1-1e876cc2e182&limit=1000000'
# Use the response as a context manager so the HTTP connection is closed
# as soon as the JSON payload has been parsed.
with urllib.request.urlopen(url) as fileobj:
    df = pd.read_json(fileobj)
# Pick the 'records' entry of the 'result' column
# (presumably the list of row records - it is fed to pd.DataFrame in the next cell)
data = df.loc['records']['result']

In [4]:
# Build the per-town vaccination table from the API records:
#   1. keep rows whose (still textual) Date is not after 2021-10-25
#   2. discard rows whose CityName is the "unknown" placeholder
#   3. order by town code and date
#   4. parse the Date column into datetimes
vaccinations_per_town = (
    pd.DataFrame(data)
    .loc[lambda frame: frame['Date'] <= '2021-10-25']
    .loc[lambda frame: frame['CityName'] != 'לא ידוע']
    .sort_values(by=['CityCode', 'Date'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

* Functions


In [5]:
# Reading the sick file
def reading_sick_file(df, random_state=None):
    """Clean the raw per-town vaccination table and impute its censored counts.

    Steps:
      * keep the columns at positions 1..12 (the first three of those are
        treated as identifier columns, the rest as count columns);
      * drop rows in which every count column equals the string '0.0';
      * treat both '<15' and '0.0' as missing values;
      * per ``CityCode``: missing values in the first row become 0.0, and the
        remaining missing values of each count column are replaced by random
        integers drawn from [0, 15), sorted ascending so they are assigned in
        row order;
      * cast ``CityCode`` to int.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain a ``CityCode`` column among columns 1..12.
    random_state : int or None, optional
        Seed for the imputation. ``None`` (default) keeps the previous
        behaviour of drawing from NumPy's global random state; pass an int to
        make the result reproducible.

    Returns
    -------
    pandas.DataFrame
        The cleaned table as produced by ``groupby(...).apply``.
    """
    # Same `randint` interface either way: the module-level global generator
    # or a private, seeded one.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def filling_missing_values(temp_df):
        """Impute the missing counts of a single town."""
        # Work on one private copy: functions passed to groupby.apply must not
        # mutate the group object they receive.
        temp_df = temp_df.copy()
        temp_df.iloc[0] = temp_df.iloc[0].fillna(0.0)      # Filling with zeros only the NaN values of the first row
        for col in temp_df.columns[3:]:
            temp_df[col] = temp_df[col].astype(float)      # Converting types
            missing = temp_df[col].isna()
            # Sorted random values for exactly the missing entries of this column
            filler = np.sort(rng.randint(low=0, high=15, size=int(missing.sum())))
            temp_df.loc[missing, col] = filler
        return temp_df

    df = df.iloc[:, 1:13]          # Choosing relevant columns
    sick_filtered = df[~(df.iloc[:, 3:] == '0.0').all(axis=1)]       # Selecting the relevant dates
    # Both markers end up as NaN; done without inplace so no filtered slice is mutated
    sick_filtered = sick_filtered.replace(to_replace='<15', value='0.0')
    sick_filtered = sick_filtered.replace(to_replace='0.0', value=np.nan)
    sick_filtered = sick_filtered.groupby(by='CityCode', as_index=False).apply(filling_missing_values)
    sick_filtered['CityCode'] = sick_filtered['CityCode'].astype(int)

    return sick_filtered

In [6]:
### Importing and cleaning of Taz_Yeshuv_250_mapping
def reading_town_code_to_taz_mapper(path=r'/Users/yotamdery/Old_Desktop/git/health_data_mapping/Taz_Yeshuv_250_mapping.xlsx'):
    """Read the TAZ / town-code / population columns of the Taz_Yeshuv_250 mapping workbook.

    Parameters
    ----------
    path : str, optional
        Location of the workbook. Defaults to the path that used to be
        hard-coded here, so existing calls without arguments are unaffected.

    Returns
    -------
    pandas.DataFrame
        Excel columns A, E and P renamed to ``taz_id``, ``town_code`` and
        ``population``, restricted to row positions 4..2632.
    """
    town_code_to_taz_mapper = pd.read_excel(path, engine='openpyxl', usecols='A, E, P')
    town_code_to_taz_mapper.columns = ['taz_id', 'town_code', 'population']
    # NOTE(review): reading_taz_250_mapping slices the same workbook with .loc[3:2633],
    # i.e. one row earlier at the start and one row later at the end - confirm which bounds are intended.
    return town_code_to_taz_mapper.iloc[4:2633]

In [7]:
def normalized_pop_per_taz(df):
    """Convert each row's population into its share of the town's total population.

    Rows are grouped by ``town_code``; inside every group ``population`` is
    divided by the group's population sum, and rows whose share equals 0.0 are
    removed. ``town_code`` is finally cast to float and then to int.
    """
    def _to_town_shares(town_rows):
        # Share of the town's population carried by each row of this town
        shares = town_rows.copy()
        shares['population'] = shares['population'] / np.sum(shares['population'])
        return shares.loc[shares['population'] != 0.0]

    normalized = df.groupby(by='town_code', as_index=False).apply(_to_town_shares)
    # Two-step cast: first to float, then to int
    normalized['town_code'] = normalized['town_code'].astype(float).astype(int)
    return normalized

In [8]:
# filtering the df
# def filtering_rows(df):
#     df_exclude_90plus = df.iloc[: , :-1]    # a DF without the first_dose_90+ column
#     sick_filtered = df[((df_exclude_90plus != '<15') & (df_exclude_90plus != '0.0')).all(axis= 1)]
#     sick_filtered['first_dose_90+'] = sick_filtered['first_dose_90+'].replace(to_replace= '<15', value= '0')  # in cases that a lot of values are '<15'. we sum this column anyway
#     sick_filtered.sort_values(by= ['CityCode','Date'], inplace= True)
#     return sick_filtered

In [9]:
# Type convertion of fields
# def types_convertion(df):
#     df['Date'] = pd.to_datetime(df['Date'])
#     for c in df.columns[1:] :
#         if c not in ['Date']:    # Have to convert to float first because of the original format...
#             df[c] = df[c].astype(float)
#             # Converting all columns but the date column
#             df[c] = df[c].astype(int)
#     return df

In [10]:
# Collapse the two oldest age groups into one 80+ group
def merge_columns_to_80plus(df):
    """Add ``first_dose_80+`` (80-89 plus 90+) and remove the two source columns.

    The frame is modified in place and the same object is returned.
    """
    merged_away = ['first_dose_80-89', 'first_dose_90+']
    df['first_dose_80+'] = df[merged_away[0]] + df[merged_away[1]]
    df.drop(columns=merged_away, inplace=True)
    return df

<b>Spreading the vaccinated individuals over the TAZs:</b>

In [11]:
### Scale every dose column by the row's population ratio
def multiply_by_pop_ratio(df):
    """Append ``<column>_multiplied`` for the columns at positions 3..10 and select the output columns.

    Each new column is the source column times ``population``. The new columns
    are added to ``df`` itself; the returned frame holds the columns at
    positions 0-2, 11 and 14-21 of the extended frame.
    """
    dose_columns = list(df.columns[3:11])
    for dose_column in dose_columns:
        df[dose_column + "_multiplied"] = df[dose_column] * df['population']
    keep_positions = [0, 1, 2, 11] + list(range(14, 22))
    return df.iloc[:, keep_positions]

In [12]:
# A mapper - taz for 250 regions
def reading_taz_250_mapping(path=r'/Users/yotamdery/Old_Desktop/git/health_data_mapping/Taz_Yeshuv_250_mapping.xlsx'):
    """Read the TAZ / population / 250-region columns of the Taz_Yeshuv_250 mapping workbook.

    Parameters
    ----------
    path : str, optional
        Location of the workbook. Defaults to the path that used to be
        hard-coded here, so existing calls without arguments are unaffected.

    Returns
    -------
    pandas.DataFrame
        Excel columns A, P and AH with the headers 'Unnamed: 0', 'Unnamed: 15'
        and 'Unnamed: 33' renamed to ``taz_id``, ``population`` and
        ``250_regions``, restricted to index labels 3..2633 (inclusive).
    """
    taz_250_mapping = pd.read_excel(path, engine='openpyxl', usecols="A,P,AH")
    taz_250_mapping = taz_250_mapping.rename(
        columns={'Unnamed: 0': 'taz_id', 'Unnamed: 15': 'population', 'Unnamed: 33': '250_regions'}
    )
    return taz_250_mapping.loc[3:2633]

In [13]:
# Mappers - Total population for 250 regions and Taz for 250_regions
def create_mappers():
    """Build the two lookup tables derived from the TAZ -> 250-regions mapping.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(taz_to_250_mapper, pop_for_250_mapper)``: the ``taz_id`` /
        ``250_regions`` pairs, and the ``population`` summed per ``250_regions``.
    """
    # Read the workbook once and derive both mappers from the same frame
    # (it used to be loaded from disk twice).
    taz_250_mapping = reading_taz_250_mapping()
    taz_to_250_mapper = taz_250_mapping[['taz_id', '250_regions']]
    pop_for_250_mapper = (
        taz_250_mapping[['population', '250_regions']]
        .groupby('250_regions', as_index=False)
        .sum()
    )
    return taz_to_250_mapper, pop_for_250_mapper


In [14]:
def reading_250_30_mapping():
    """Load the workbook that maps the 250 cells to the 30 counties (found under DATA_DIR)."""
    return pd.read_excel(DATA_DIR + '/division_choice/30/cell250_to_cell30.xlsx', engine='openpyxl')

In [15]:
# Takes the 250-regions table and the 250 -> 30 mapping
def aggregate_to_30_level(vaccinated_with_pop_250, mapping_30_250):
    """Aggregate the 250-regions table to county (30) level.

    The tables are joined on ``250_regions`` == ``cell_id``. The columns at
    positions 2..9 of the joined table are summed per (``30_county``, ``Date``).
    The county population is the sum of ``population`` over the distinct
    (``250_regions``, ``population``) pairs of each county. ``30_county`` is
    returned as a string.
    """
    merged = vaccinated_with_pop_250.merge(mapping_30_250, left_on='250_regions', right_on='cell_id')

    # Health data summed per county and date
    value_columns = list(merged.columns[2:10])
    doses_per_county = merged.groupby(by=['30_county', 'Date'], as_index=False)[value_columns].sum()

    # Every region contributes its population once, regardless of how many dates it has
    one_row_per_region = merged.drop_duplicates(subset=['250_regions', 'population'])
    population_per_county = one_row_per_region.groupby('30_county', as_index=False)['population'].sum()

    combined = doses_per_county.merge(population_per_county, on='30_county')
    combined['30_county'] = combined['30_county'].astype(str)
    return combined
    

In [16]:
# Calculating the absolute values of vaccinations - creating new columns for it (with 'absolute_' prefix)
def calculate_difference_vaccinations(df):
    """Turn the cumulative columns (positions 2..9) into per-row increments.

    For every cumulative column an ``absolute_<column>`` column is added. The
    first row keeps its cumulative value; every later row gets the difference
    from the previous row. Later rows whose last ``absolute_`` column is NaN
    are dropped.

    The first row is taken with a slice (``iloc[:1]``) rather than through
    ``iloc[0].to_frame().T``: the slice keeps each column's dtype (the transposed
    Series used to turn every column of the result into ``object``) and also
    lets an empty input pass through instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        One group's rows in chronological order; not modified.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by the eight ``absolute_`` columns.
    """
    df_copy = df.copy()
    cumulative_columns = list(df_copy.columns[2:10])

    # If it is the first row, take it as it is, don't apply the difference function
    first_row = df_copy.iloc[:1].copy()
    for cumulative in cumulative_columns:
        first_row['absolute_' + cumulative] = first_row[cumulative]

    # Applying the difference function on the whole DF
    increments = df_copy[cumulative_columns].diff(periods=1)
    for cumulative in cumulative_columns:
        df_copy['absolute_' + cumulative] = increments[cumulative]

    # Removing the first row (its difference is NaN) - it is re-added from first_row below
    whole_df_no_null = df_copy[~df_copy[df_copy.columns[-1]].isna()]

    return pd.concat([first_row, whole_df_no_null], axis=0)

In [17]:
def _print_latest_total(frame, value_columns):
    """Sanity check: print the sum of `value_columns` over the rows of the latest Date."""
    latest_rows = frame[frame['Date'] == frame['Date'].max()]
    print("Total number of vaccinations per age:")
    print (sum(latest_rows[value_columns].sum(axis= 1)))


# Main function - Receiving only the health df
def main(df):
    """Run the whole pipeline: per-town table -> TAZ -> 250 regions -> 30 counties.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw per-town vaccination table.

    Returns
    -------
    pandas.DataFrame
        County-level table with the cumulative columns, the county population
        and the ``absolute_`` (per-row increment) columns, as produced by
        grouping on ``30_county`` and applying calculate_difference_vaccinations.

    A sanity-check total is printed after the cleaning, the TAZ and the
    250-regions stages.
    """
    df = reading_sick_file(df)
    df = merge_columns_to_80plus(df)       # Merging 80-89 and 90+ age groups
    _print_latest_total(df, df.columns[3:])

    ### Merging the health file with taz to yeshuv mapping - To get the TAZ for each town code:
    town_code_to_taz_mapper = reading_town_code_to_taz_mapper()                 # Reading the Taz_to_yeshuv file
    town_code_to_taz_mapper = normalized_pop_per_taz(town_code_to_taz_mapper)   # Normalizing the population column
    vaccination_taz_merged = df.merge(town_code_to_taz_mapper, how= 'inner', left_on= 'CityCode', right_on= 'town_code')

    ## Spreading over the Tazs:
    normed_vaccinated = multiply_by_pop_ratio(vaccination_taz_merged)     # Multiplying each column by the population ratio
    # numeric_only=True keeps the text column (CityName) out of the sum explicitly;
    # pandas >= 2.0 no longer drops such columns silently, which would shift the
    # positional column slices used further down.
    grouped_vaccinated_and_mapping = (
        normed_vaccinated.groupby(by= ['taz_id', 'Date']).sum(numeric_only= True).reset_index()
    )
    grouped_vaccinated_and_mapping.drop("CityCode", axis= 1, inplace= True)    # Not relevant anymore
    _print_latest_total(grouped_vaccinated_and_mapping, grouped_vaccinated_and_mapping.columns[2:])

    taz_250_mapping, pop_for_250_mapper = create_mappers()      # Mappers - Total population for 250 regions and Taz for 250_regions

    vaccinated_250_merged = grouped_vaccinated_and_mapping.merge(taz_250_mapping)
    # taz_id has served its purpose as the join key; removing it before summing
    # guarantees it never appears as an aggregated column, whatever its dtype.
    vaccinated_250 = (
        vaccinated_250_merged
        .drop(columns= 'taz_id')
        .groupby(by= ['250_regions', 'Date'], as_index= False)
        .sum()                                                              # Aggregating to 250 regions level
    )
    vaccinated_with_pop_250 = vaccinated_250.merge(pop_for_250_mapper)       # involving the population for each region
    _print_latest_total(vaccinated_with_pop_250, vaccinated_with_pop_250.columns[2:-1])

    mapping_250_30 = reading_250_30_mapping()                               # reading the 250 to 30 counties file
    vaccinated_30_with_pop = aggregate_to_30_level(vaccinated_with_pop_250, mapping_250_30)        # Aggregating the data to 30 counties level
    # Differences are taken per county (grouping by 30_county), so a county's
    # first row is never subtracted from another county's last row.
    absolute_vaccination_amount = vaccinated_30_with_pop.groupby(by= ['30_county'], as_index= False ) \
                                                                .apply(calculate_difference_vaccinations)
    return absolute_vaccination_amount
          

In [18]:
# Run the full first-dose pipeline (main) on a copy of the per-town table;
# the three sanity-check totals shown in the output are printed by main itself
df_first_dose = main(vaccinations_per_town.copy())
print("\nFinished creating first dose\n")

#df_second_dose = main(vaccninations_per_city.copy().drop('accumulated_vaccination_first_dose',axis= 1))
#print("\nFinished creating second dose df\n")

Total number of vaccinations per age:
5585492.0
Total number of vaccinations per age:
5395719.999999997
Total number of vaccinations per age:
5395720.000000001

Finished creating first dose



In [19]:
df_first_dose[df_first_dose['30_county'] == '11_betshemesh']

Unnamed: 0,Unnamed: 1,30_county,Date,first_dose_0-19_multiplied,first_dose_20-29_multiplied,first_dose_30-39_multiplied,first_dose_40-49_multiplied,first_dose_50-59_multiplied,first_dose_60-69_multiplied,first_dose_70-79_multiplied,first_dose_80+_multiplied,population,absolute_first_dose_0-19_multiplied,absolute_first_dose_20-29_multiplied,absolute_first_dose_30-39_multiplied,absolute_first_dose_40-49_multiplied,absolute_first_dose_50-59_multiplied,absolute_first_dose_60-69_multiplied,absolute_first_dose_70-79_multiplied,absolute_first_dose_80+_multiplied
1,2480,11_betshemesh,2020-12-20,0,0,0,0,17,0,0,0,114768,0,0,0,0,17,0,0,0
1,2481,11_betshemesh,2020-12-21,10,14,21,30,30,27,7,19,114768,10,14,21,30,13,27,7,19
1,2482,11_betshemesh,2020-12-22,10,25,43,70,75,75,36,19,114768,0,11,22,40,45,48,29,0
1,2483,11_betshemesh,2020-12-23,14,66,107,175,164,288,184,68,114768,4,41,64,105,89,213,148,49
1,2484,11_betshemesh,2020-12-24,29,121,218,355,271,580,407,142,114768,15,55,111,180,107,292,223,74
1,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2785,11_betshemesh,2021-10-21,10387,12466,8756,7031,5415,3722,2163,1137,114768,23,16,14,5,5,2,0,2
1,2786,11_betshemesh,2021-10-22,10394,12472,8761,7036,5416,3722,2163,1137,114768,7,6,5,5,1,0,0,0
1,2787,11_betshemesh,2021-10-23,10394,12472,8761,7036,5416,3722,2163,1137,114768,0,0,0,0,0,0,0,0
1,2788,11_betshemesh,2021-10-24,10406,12487,8766,7042,5419,3722,2164,1137,114768,12,15,5,6,3,0,1,0


In [20]:
# Choosing the relevant columns: positions 0, 1 and 10..18
# (30_county, Date, population and the absolute_* columns in the table shown above).
# .copy() makes `result` an independent frame, so the column assignments and
# in-place drops of the following cells never operate on a slice of df_first_dose.
result = df_first_dose.iloc[ : , np.r_[0, 1, 10:19]].copy()

In [21]:
# Removing the '_multiplied' suffix from the column titles.
# Only names that really end with the suffix are touched, so re-running this cell
# is harmless (dropping "the last '_'-separated part of any name with more than
# three parts" would strip the age group on a second run).
multiplied_suffix = '_multiplied'
new_columns_name_list = [
    column[:-len(multiplied_suffix)] if column.endswith(multiplied_suffix) else column
    for column in list(result.columns)
]
result.columns = new_columns_name_list

<b>We'd like to be aligned with the age groups of the current model:</b>

In [22]:
# Splitting the 'absolute_first_dose_0-19' column into the model's three youngest
# age groups with fixed weights: 10% -> 0-4, 15% -> 5-9, 75% -> 10-19
doses_0_19 = result['absolute_first_dose_0-19']
result = result.assign(**{
    'absolute_first_dose_0-4': doses_0_19 * 0.10,
    'absolute_first_dose_5-9': doses_0_19 * 0.15,
    'absolute_first_dose_10-19': doses_0_19 * 0.75,
}).drop(columns='absolute_first_dose_0-19')

# Re-ordering the DF: the first three columns, then the three new (last) columns, then the rest
all_columns = result.columns.tolist()
new_ordered_columns = all_columns[:3] + all_columns[-3:] + all_columns[3:-3]
result = result[new_ordered_columns]

# Merging the two oldest groups into a single 'absolute_first_dose_70+' column
result = result.assign(**{
    'absolute_first_dose_70+': result['absolute_first_dose_70-79'] + result['absolute_first_dose_80+'],
}).drop(columns=['absolute_first_dose_70-79', 'absolute_first_dose_80+'])

<b> Adding the 'county_id' to the counties to be able to involve it in the SEIR model:

In [23]:
# Reading Excel columns A and C of the county lookup workbook, with the string id stored as str
county_string_to_id = pd.read_excel(
    DATA_DIR + '/division_choice/30/county_int_2name_county_string.xlsx',
    engine='openpyxl',
    usecols="A,C",
).astype({'county_string_id': str})

In [24]:
# Attaching the county ids: inner join of the county names in `result` with the lookup table
result_with_county_id = pd.merge(
    result,
    county_string_to_id,
    how='inner',
    left_on='30_county',
    right_on='county_string_id',
)

In [25]:
# Reordering: column 1, the last two columns, column 2, then columns 3 .. third-from-last
# (column 0 is left out)
all_columns = list(result_with_county_id.columns)
new_ordered_columns = all_columns[1:2] + all_columns[-2:] + all_columns[2:3] + all_columns[3:-2]
result_with_county_id = result_with_county_id.loc[:, new_ordered_columns]

In [26]:
# Sorting by date, then county, and renumbering the rows from 0
result_with_county_id = result_with_county_id.sort_values(by=['Date', 'county_id'], ignore_index=True)

In [27]:
# Converting every column from position 3 onwards to float in a single cast
float_columns = {column: float for column in result_with_county_id.columns[3:]}
result_with_county_id = result_with_county_id.astype(float_columns)

<b> Creating a proportion DF - each value will be the proportion of vaccinated individuals
    per county and age-group. The sample space will be the population for each county

Create a dictionary of {(county_id, county_string_id) : population}

In [28]:
# One row per county, then a {(county_id, county_string_id): [population]} lookup
df = result_with_county_id.loc[:, ['county_id', 'county_string_id', 'population']]
unique_df = df.drop_duplicates().sort_values(by='county_id')
dict_region_population = (
    unique_df
    .set_index(['county_id', 'county_string_id'])
    .T
    .to_dict('list')
)

In [29]:
dict_region_population

{(1100, '11'): [743570.0],
 (1101, '11_haredi'): [253305.0],
 (1103, '11_betshemesh'): [114768.0],
 (2100, '21'): [132334.0],
 (2200, '22_jewish'): [93746.0],
 (2202, '22_arab'): [22722.0],
 (2300, '23_jewish'): [292415.0],
 (2302, '23_arab'): [151924.0],
 (2400, '24_jewish'): [240564.0],
 (2402, '24_arab'): [341354.0],
 (2900, '29'): [49663.0],
 (3100, '31_jewish'): [563123.0],
 (3102, '31_arab'): [34058.0],
 (3200, '32_jewish'): [301292.0],
 (3202, '32_arab'): [109848.0],
 (4100, '41_jewish'): [393983.0],
 (4102, '41_arab'): [81875.0],
 (4200, '42_secular'): [650007.0],
 (4201, '42_haredi'): [47280.0],
 (4202, '42_arab'): [26229.0],
 (4300, '43'): [308702.0],
 (4400, '44'): [610942.0],
 (5100, '51'): [783008.0],
 (5101, '51_bb'): [201611.0],
 (5103, '51_tlv'): [443903.0],
 (6100, '61'): [498131.0],
 (6200, '62_jewish'): [457315.0],
 (6202, '62_arab'): [187381.0],
 (7100, '71'): [255900.0],
 (7101, '71_haredi'): [148038.0]}

Filling zeroes for counties that don't have vaccinated individuals in a specific date:

In [30]:
def filling_zeros(df, **kwargs):
    """Add an all-zero row for every county that is absent from one date's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of a single date. The first four columns must be ``Date``,
        ``county_id``, ``county_string_id`` and ``population``; every further
        column is treated as a value column and filled with 0.
    **kwargs
        ``args`` may carry the ``{(county_id, county_string_id): [population]}``
        mapping to use. When it is not given, the module-level
        ``dict_region_population`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no county is missing, otherwise ``df`` followed by
        one zero-filled row per missing county.
    """
    population_by_county = kwargs.get('args')
    if population_by_county is None:
        population_by_county = dict_region_population

    # Counties present on this date vs. the static set of all counties
    present_counties = set(zip(df['county_id'], df['county_string_id']))
    missing_counties = sorted(set(population_by_county.keys()) - present_counties)

    if not missing_counties:    # If all counties are there - do nothing
        return df

    current_date = df['Date'].iloc[0]
    # One zero per value column, derived from the frame instead of a hard-coded count
    zero_values = [0] * (len(df.columns) - 4)
    filler_rows = [
        [current_date, county_id, county_string_id, population_by_county[(county_id, county_string_id)][0]] + zero_values
        for county_id, county_string_id in missing_counties
    ]
    filler_df = pd.DataFrame(filler_rows, columns= df.columns)

    return pd.concat([df, filler_df], axis= 0)     # Stacking the df one on top of the other
    

In [31]:
# Applying the function above to the rows of every date separately.
# groupby.apply forwards `args= dict_region_population` to filling_zeros as a keyword argument named `args`.
# reset_index() then turns whatever index apply produced into ordinary column(s);
# NOTE(review): the name(s) of those column(s) depend on the index shape apply returns - confirm against the drop in the merge cell below.
df_filled_zeros = result_with_county_id.groupby(by= ['Date'], as_index= False).apply(filling_zeros, args= dict_region_population).reset_index()
#df_filled_zeros = df_filled_zeros.drop(labels= ['level_0', 'level_1'], axis= 1).sort_values(by= ['Date', 'county_id'])

Reading the 'population_per_county_age-group' file to get the population for each intersection of county and age-group:

In [32]:
# Load the per-county, per-age-group population table from DATA_DIR
population_per_county_age_group = pd.read_csv(DATA_DIR + '/division_choice/30/population_per_county_age-group.csv')

In [33]:
population_per_county_age_group.head()

Unnamed: 0,county_id,population_0-4,population_5-9,population_10-19,population_20-29,population_30-39,population_40-49,population_50-59,population_60-69,population_70+
0,1100,91350,85660,148805,125617,99834,81115,63261,51140,50298
1,1101,39848,32811,57822,45849,29948,23961,18944,15843,17449
2,1103,21453,19019,27366,17534,14239,10275,7471,4943,4350
3,2100,14346,13429,23977,22271,16873,15618,14006,11541,9864
4,2200,9795,8968,16807,15434,11712,11236,10151,8974,7444


* Merging it with the most updated data frame; merging by the county_id field:

In [34]:
# Attach the per-age-group populations (joined on county_id), drop the county-total
# population column and order the rows by county and date
full_pop_df = (
    df_filled_zeros
    .merge(population_per_county_age_group, how= 'inner', left_on= 'county_id', right_on= 'county_id')
    .drop(columns= ['population'])
    .sort_values(by= ['county_id', 'Date'])
)
# Remove the column(s) left behind by the earlier reset_index(): a single 'index'
# column, or 'level_0'/'level_1' when groupby.apply returned a two-level index.
# Only the names that are actually present are dropped, so neither case raises KeyError.
index_artifacts = [name for name in ('index', 'level_0', 'level_1') if name in full_pop_df.columns]
full_pop_df = full_pop_df.drop(columns= index_artifacts)

* Involving the morbidity data

In [35]:
# Load the smoothed cases-by-age-and-county workbook, using its first column as the index
morbidity_df = pd.read_excel(DATA_DIR + '/calibration/smoothed_cases_by_age_county.xlsx', engine= 'openpyxl', index_col= 0)

In [36]:
# Moving every vaccination record 3 days earlier so that its date lines up with
# the smoothed morbidity data; Date goes back to being a regular column afterwards
full_pop_df_shifted = (
    full_pop_df
    .set_index("Date")
    .shift(periods= -3, freq= "D")
    .reset_index()
)


In [37]:
# Joining vaccinations and morbidity on (county, date); only pairs present in both tables are kept
vaccinations_morbidity_df = pd.merge(
    full_pop_df_shifted,
    morbidity_df,
    how= 'inner',
    left_on= ['county_string_id', 'Date'],
    right_on= ['30_county', 'date'],
)

In [38]:
vaccinations_morbidity_df

Unnamed: 0,Date,county_id,county_string_id,absolute_first_dose_0-4,absolute_first_dose_5-9,absolute_first_dose_10-19,absolute_first_dose_20-29,absolute_first_dose_30-39,absolute_first_dose_40-49,absolute_first_dose_50-59,absolute_first_dose_60-69,absolute_first_dose_70+,population_0-4,population_5-9,population_10-19,population_20-29,population_30-39,population_40-49,population_50-59,population_60-69,population_70+,date,30_county,0-4_absolute_cases,5-9_absolute_cases,10-19_absolute_cases,20-29_absolute_cases,30-39_absolute_cases,40-49_absolute_cases,50-59_absolute_cases,60-69_absolute_cases,70+_absolute_cases
0,2020-12-17,1100,11,0.000000,0.000000,0.000000,28.091385,67.707441,71.308901,107.833572,158.932255,105.882913,91350,85660,148805,125617,99834,81115,63261,51140,50298,2020-12-17,11,32.411892,40.056201,97.166106,99.936453,81.859323,80.041696,59.981701,40.582660,35.823781
1,2020-12-18,1100,11,0.500000,0.750000,3.750000,79.350944,195.650540,222.454450,233.191345,758.819910,754.779429,91350,85660,148805,125617,99834,81115,63261,51140,50298,2020-12-18,11,32.557289,40.235890,97.601987,100.384762,82.226540,80.400759,60.250775,40.764712,35.984485
2,2020-12-19,1100,11,2.463686,3.695528,18.477642,145.121229,264.854762,287.544297,277.677074,1000.299173,1366.926903,91350,85660,148805,125617,99834,81115,63261,51140,50298,2020-12-19,11,33.266690,41.112602,99.728665,102.572075,84.018198,82.152634,61.563597,41.652946,36.768561
3,2020-12-20,1100,11,0.676234,1.014350,5.071752,200.028280,281.904104,300.476135,321.032893,1232.112569,1557.705111,91350,85660,148805,125617,99834,81115,63261,51140,50298,2020-12-20,11,36.205183,44.744135,108.537837,111.832697,88.470947,81.462422,59.245398,42.284768,33.570835
4,2020-12-21,1100,11,1.808276,2.712414,13.562068,192.267274,261.193117,309.562881,378.288464,1344.256606,1677.690044,91350,85660,148805,125617,99834,81115,63261,51140,50298,2020-12-21,11,36.872058,45.568293,110.537032,113.892582,90.100524,82.962907,60.336659,43.063625,34.189188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9295,2021-10-18,7101,71_haredi,3.400000,5.100000,25.500000,27.000000,12.162205,10.000000,6.000000,1.000000,0.000000,30168,30022,35857,19886,18481,11554,5201,2621,2175,2021-10-18,71_haredi,2.896248,4.947357,6.527655,2.050741,2.770751,1.915739,0.803582,0.507864,0.482149
9296,2021-10-19,7101,71_haredi,1.132441,1.698662,8.493308,20.000000,18.000000,4.162205,3.000000,1.000000,0.000000,30168,30022,35857,19886,18481,11554,5201,2621,2175,2021-10-19,71_haredi,2.862653,4.889970,6.451938,2.026954,2.738611,1.893518,0.794261,0.501973,0.476557
9297,2021-10-20,7101,71_haredi,0.016221,0.024331,0.121654,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,30168,30022,35857,19886,18481,11554,5201,2621,2175,2021-10-20,71_haredi,2.676096,4.571294,6.031470,1.894859,2.560138,1.770119,0.742499,0.469260,0.445500
9298,2021-10-21,7101,71_haredi,2.516221,3.774331,18.871654,28.000000,12.000000,13.000000,2.000000,0.000000,1.000000,30168,30022,35857,19886,18481,11554,5201,2621,2175,2021-10-21,71_haredi,2.602896,4.446254,5.866488,1.843028,2.490110,1.721700,0.722190,0.456424,0.433314


getting the proportion of vaccinated individuals: <br>
* Group by the county id, then create new columns that keep track of the non-vaccinated population (by subtracting the number of vaccinated individuals and, as the code below does, the recorded cases). The non-vaccinated population consists of the individuals who are still eligible to be vaccinated!

In [39]:
def get_proportion_of_vaccinated_people(df):
    """Add the still-eligible population and the vaccinated proportion per age group.

    Expected column layout of ``df`` (one county, rows in chronological order):
    positions 3..11 hold the nine first-dose columns, 12..20 the nine population
    columns and 23..31 the nine case columns.

    For every age group an ``eligible_for_vaccine_<age>`` column is added: the
    first row holds the population of the first row, and every later row holds
    the previous row's eligible value minus the previous row's doses minus the
    previous row's cases. Then ``prop_first_dose_<age>`` = doses / eligible.

    The recurrence is evaluated on NumPy arrays in the same operation order as
    the former row-by-row ``iloc`` loop, so the values are unchanged while the
    per-row DataFrame indexing is avoided. The case columns are sliced as
    ``23:32`` (the former ``23:33`` only worked because the slice was clipped
    at the last column).

    NOTE(review): nothing bounds the proportion - once the eligible count gets
    close to zero the ratio can exceed 1; confirm whether that is acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single county; not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` followed by the nine ``eligible_for_vaccine_`` and the
        nine ``prop_`` columns.
    """
    df_copy = df.copy()
    dose_columns = list(df_copy.columns[3:12])
    population_columns = list(df_copy.columns[12:21])

    doses = df.iloc[:, 3:12].to_numpy(dtype=float)
    population = df.iloc[:, 12:21].to_numpy(dtype=float)
    cases = df.iloc[:, 23:32].to_numpy(dtype=float)

    # eligible[i] = (eligible[i-1] - doses[i-1]) - cases[i-1], starting from the first row's population
    eligible = np.full(population.shape, np.nan)
    if len(eligible):
        eligible[0] = population[0]
        for i in range(1, len(eligible)):
            eligible[i] = (eligible[i - 1] - doses[i - 1]) - cases[i - 1]

    # Creating new columns of the non-vaccinated population for each age-group:
    for position, column in enumerate(population_columns):
        age_group_ending = column.split('_').pop()
        df_copy['eligible_for_vaccine_' + age_group_ending] = eligible[:, position]

    # Creating proportion column - dividing by the population that haven't been vaccinated so far for each age-group
    for column in dose_columns:
        age_group_ending = column.split('_').pop()
        first_dose_age_group_ending = '_'.join(column.split('_')[1:])
        df_copy['prop_' + first_dose_age_group_ending] = (df_copy[column] / df_copy['eligible_for_vaccine_' + age_group_ending])

    return df_copy

In [40]:
# Applying the function above to every county separately, so each county's
# eligible-population recurrence starts from its own first row
proportion_df = vaccinations_morbidity_df.groupby(by= ['county_id'], as_index= False).apply(get_proportion_of_vaccinated_people)

In [41]:
# The morbidity table's own key columns duplicate Date / county_string_id after the join - remove them
proportion_df = proportion_df.drop(columns= ['date', '30_county'])

In [42]:
# Counties that have at least one negative value in any column from position 3 onwards.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally in DataFrame.any.
proportion_df[(proportion_df.iloc[: , 3:] < 0).any(axis= 1)]['county_id'].unique()

array([], dtype=int64)

In [43]:
# Rows that contain at least one negative value in any column from position 3 onwards.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally in DataFrame.any.
proportion_df[(proportion_df.iloc[: , 3:] < 0).any(axis= 1)]

Unnamed: 0,Unnamed: 1,Date,county_id,county_string_id,absolute_first_dose_0-4,absolute_first_dose_5-9,absolute_first_dose_10-19,absolute_first_dose_20-29,absolute_first_dose_30-39,absolute_first_dose_40-49,absolute_first_dose_50-59,absolute_first_dose_60-69,absolute_first_dose_70+,population_0-4,population_5-9,population_10-19,population_20-29,population_30-39,population_40-49,population_50-59,population_60-69,population_70+,0-4_absolute_cases,5-9_absolute_cases,10-19_absolute_cases,20-29_absolute_cases,30-39_absolute_cases,40-49_absolute_cases,50-59_absolute_cases,60-69_absolute_cases,70+_absolute_cases,eligible_for_vaccine_0-4,eligible_for_vaccine_5-9,eligible_for_vaccine_10-19,eligible_for_vaccine_20-29,eligible_for_vaccine_30-39,eligible_for_vaccine_40-49,eligible_for_vaccine_50-59,eligible_for_vaccine_60-69,eligible_for_vaccine_70+,prop_first_dose_0-4,prop_first_dose_5-9,prop_first_dose_10-19,prop_first_dose_20-29,prop_first_dose_30-39,prop_first_dose_40-49,prop_first_dose_50-59,prop_first_dose_60-69,prop_first_dose_70+


In [44]:
proportion_df.columns

Index(['Date', 'county_id', 'county_string_id', 'absolute_first_dose_0-4',
       'absolute_first_dose_5-9', 'absolute_first_dose_10-19',
       'absolute_first_dose_20-29', 'absolute_first_dose_30-39',
       'absolute_first_dose_40-49', 'absolute_first_dose_50-59',
       'absolute_first_dose_60-69', 'absolute_first_dose_70+',
       'population_0-4', 'population_5-9', 'population_10-19',
       'population_20-29', 'population_30-39', 'population_40-49',
       'population_50-59', 'population_60-69', 'population_70+',
       '0-4_absolute_cases', '5-9_absolute_cases', '10-19_absolute_cases',
       '20-29_absolute_cases', '30-39_absolute_cases', '40-49_absolute_cases',
       '50-59_absolute_cases', '60-69_absolute_cases', '70+_absolute_cases',
       'eligible_for_vaccine_0-4', 'eligible_for_vaccine_5-9',
       'eligible_for_vaccine_10-19', 'eligible_for_vaccine_20-29',
       'eligible_for_vaccine_30-39', 'eligible_for_vaccine_40-49',
       'eligible_for_vaccine_50-59', 'eligible

In [45]:
len(proportion_df.columns)

48

In [47]:
# Reordering the columns: the three identifier columns, then every 'prop_' column,
# then everything else in its current order.
# The 'prop_' columns are picked by name rather than by position (39 onwards), so the
# cell gives the same layout when it is re-run or when the number of columns changes.
all_columns = proportion_df.columns.tolist()
identifier_columns = all_columns[0:3]
proportion_columns = [name for name in all_columns[3:] if name.startswith('prop_')]
remaining_columns = [name for name in all_columns[3:] if not name.startswith('prop_')]
new_ordered_columns = identifier_columns + proportion_columns + remaining_columns
proportion_df = proportion_df[new_ordered_columns]

In [48]:
# Write the final table to the Vaccinations folder under DATA_DIR, without the index
proportion_df.to_excel(DATA_DIR + '/Vaccinations/First_dose_vaccination_by_30area_age.xlsx', index= False)

In [63]:
proportion_df.tail()

Unnamed: 0,Unnamed: 1,Date,county_id,county_string_id,prop_first_dose_5-9,prop_first_dose_10-19,prop_first_dose_20-29,prop_first_dose_30-39,prop_first_dose_40-49,prop_first_dose_50-59,prop_first_dose_60-69,prop_first_dose_70+,absolute_first_dose_0-4,absolute_first_dose_5-9,absolute_first_dose_10-19,absolute_first_dose_20-29,absolute_first_dose_30-39,absolute_first_dose_40-49,absolute_first_dose_50-59,absolute_first_dose_60-69,absolute_first_dose_70+,population_0-4,population_5-9,population_10-19,population_20-29,population_30-39,population_40-49,population_50-59,population_60-69,population_70+,0-4_absolute_cases,5-9_absolute_cases,10-19_absolute_cases,20-29_absolute_cases,30-39_absolute_cases,40-49_absolute_cases,50-59_absolute_cases,60-69_absolute_cases,70+_absolute_cases,eligible_for_vaccine_0-4,eligible_for_vaccine_5-9,eligible_for_vaccine_10-19,eligible_for_vaccine_20-29,eligible_for_vaccine_30-39,eligible_for_vaccine_40-49,eligible_for_vaccine_50-59,eligible_for_vaccine_60-69,eligible_for_vaccine_70+,prop_first_dose_0-4
29,9295,2021-10-18,7101,71_haredi,0.0002024702,0.00133,0.009604,0.002541,0.010539,0.395498,0.031031,0.0,3.4,5.1,25.5,27.0,12.162205,10.0,6.0,1.0,0.0,30168,30022,35857,19886,18481,11554,5201,2621,2175,2.896248,4.947357,6.527655,2.050741,2.770751,1.915739,0.803582,0.507864,0.482149,26786.088076,25188.896707,19174.773568,2811.377327,4786.421612,948.898805,15.170751,32.226308,80.030549,0.0001269316
29,9296,2021-10-19,7101,71_haredi,6.746383e-05,0.000444,0.007188,0.003772,0.004442,0.358544,0.032554,0.0,1.132441,1.698662,8.493308,20.0,18.0,4.162205,3.0,1.0,0.0,30168,30022,35857,19886,18481,11554,5201,2621,2175,2.862653,4.88997,6.451938,2.026954,2.738611,1.893518,0.794261,0.501973,0.476557,26779.791828,25178.84935,19142.745912,2782.326586,4771.488656,936.983066,8.367169,30.718445,79.5484,4.228715e-05
29,9297,2021-10-20,7101,71_haredi,9.665706e-07,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.016221,0.024331,0.121654,0.0,0.0,0.0,0.0,0.0,0.0,30168,30022,35857,19886,18481,11554,5201,2621,2175,2.676096,4.571294,6.03147,1.894859,2.560138,1.770119,0.742499,0.46926,0.4455,26775.796733,25172.260719,19127.800667,2760.299632,4750.750044,930.927343,4.572908,29.216472,79.071843,6.0579e-07
29,9298,2021-10-21,7101,71_haredi,0.0001499675,0.000987,0.010151,0.002527,0.013991,0.522138,0.0,0.012718,2.516221,3.774331,18.871654,28.0,12.0,13.0,2.0,0.0,1.0,30168,30022,35857,19886,18481,11554,5201,2621,2175,2.602896,4.446254,5.866488,1.843028,2.49011,1.7217,0.72219,0.456424,0.433314,26773.104416,25167.665094,19121.647544,2758.404773,4748.189906,929.157224,3.830408,28.747212,78.626344,9.398314e-05
29,9299,2021-10-22,7101,71_haredi,0.0001728973,0.001139,0.008429,0.003591,0.008749,2.707046,0.0,0.0,2.9,4.35,21.75,23.0,17.0,8.0,3.0,0.0,0.0,30168,30022,35857,19886,18481,11554,5201,2621,2175,2.802161,4.786638,6.315599,1.984121,2.680741,1.853505,0.777477,0.491365,0.466486,26767.9853,25159.444509,19096.909402,2728.561746,4733.699797,914.435524,1.108219,28.290788,77.19303,0.0001083384


In [64]:
proportion_df['Date'].nunique()

310