In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os

In [2]:
# Explore the "covid_infected_deaths" table
total_covid_infected_deaths_perDistrict = pd.read_stata("../covid_data/covid/covid_infected_deaths.dta")
total_covid_infected_deaths_perDistrict.head()

Unnamed: 0,lgd_state_id,lgd_district_id,date,lgd_state_name,lgd_district_name,total_cases,total_deaths
0,35,632,2020-01-30,andaman and nicobar islands,north and middle andaman,0.0,0.0
1,35,632,2020-02-02,andaman and nicobar islands,north and middle andaman,0.0,0.0
2,35,632,2020-02-03,andaman and nicobar islands,north and middle andaman,0.0,0.0
3,35,632,2020-03-02,andaman and nicobar islands,north and middle andaman,0.0,0.0
4,35,632,2020-03-03,andaman and nicobar islands,north and middle andaman,0.0,0.0


In [3]:
# Explore the "covid_infected_deaths" table
total_covid_infected_deaths_perDistrict["year"] = total_covid_infected_deaths_perDistrict["date"].dt.year
total_covid_infected_deaths_perDistrict["month"] = total_covid_infected_deaths_perDistrict["date"].dt.month
total_covid_infected_deaths_perDistrict.drop(["date"], axis=1, inplace=True)

# Explore the "total_covid_infected_deaths_perDistrict" table
total_covid_infected_deaths_perDistrict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439416 entries, 0 to 439415
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   lgd_state_id       439416 non-null  object 
 1   lgd_district_id    439416 non-null  object 
 2   lgd_state_name     439416 non-null  object 
 3   lgd_district_name  439416 non-null  object 
 4   total_cases        392238 non-null  float64
 5   total_deaths       392238 non-null  float64
 6   year               439416 non-null  int32  
 7   month              439416 non-null  int32  
dtypes: float64(2), int32(2), object(4)
memory usage: 23.5+ MB


In [4]:
# Get the monthly cumulative cases and deaths in district level
monthly_cumulative = total_covid_infected_deaths_perDistrict.groupby(["lgd_district_id", "year", "month"]).agg({
    "total_cases": "max",
    "total_deaths": "max"
}).reset_index()

# Get the monthly new cases and deaths in district level
monthly_new_cases_deaths = monthly_cumulative.groupby(["lgd_district_id"]).diff().fillna(0)

# Merge the above data
monthly_cumulative[["new_cases", "new_deaths"]] = monthly_new_cases_deaths[["total_cases", "total_deaths"]]

# Clean the "monthly_cumulative" table
monthly_cumulative.replace("", np.nan, inplace=True)
monthly_cumulative = monthly_cumulative.dropna()

monthly_cumulative.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13404 entries, 22 to 15135
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   lgd_district_id  13404 non-null  object 
 1   year             13404 non-null  int32  
 2   month            13404 non-null  int32  
 3   total_cases      13404 non-null  float64
 4   total_deaths     13404 non-null  float64
 5   new_cases        13404 non-null  float64
 6   new_deaths       13404 non-null  float64
dtypes: float64(4), int32(2), object(1)
memory usage: 733.0+ KB


In [5]:
# Get shapefile in state level
district_shp = gpd.read_file("./district_key_table_shp/district_key_table_shp.shp")

# Check the "district_shp" table
district_shp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 721 entries, 0 to 720
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   lgd_sta_id  721 non-null    object  
 1   lgd_sta_na  721 non-null    object  
 2   lgd_dis_id  721 non-null    object  
 3   lgd_dis_na  721 non-null    object  
 4   pc11_st_id  721 non-null    object  
 5   pc11_di_id  721 non-null    object  
 6   pc11_st_na  721 non-null    object  
 7   pc11_di_na  721 non-null    object  
 8   geometry    721 non-null    geometry
dtypes: geometry(1), object(8)
memory usage: 50.8+ KB


In [6]:
# Merge the above two tables (2018)
covid_total_mortality_perDistrict_perMonth_shp = pd.merge(district_shp, monthly_cumulative, left_on="lgd_dis_id", right_on="lgd_district_id", how="left")
covid_total_mortality_perDistrict_perMonth_shp = covid_total_mortality_perDistrict_perMonth_shp.dropna()

# Check the "covid_total_mortality_perDistrict_perMonth_shp" table
covid_total_mortality_perDistrict_perMonth_shp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 13256 entries, 0 to 13299
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   lgd_sta_id       13256 non-null  object  
 1   lgd_sta_na       13256 non-null  object  
 2   lgd_dis_id       13256 non-null  object  
 3   lgd_dis_na       13256 non-null  object  
 4   pc11_st_id       13256 non-null  object  
 5   pc11_di_id       13256 non-null  object  
 6   pc11_st_na       13256 non-null  object  
 7   pc11_di_na       13256 non-null  object  
 8   geometry         13256 non-null  geometry
 9   lgd_district_id  13256 non-null  object  
 10  year             13256 non-null  float64 
 11  month            13256 non-null  float64 
 12  total_cases      13256 non-null  float64 
 13  total_deaths     13256 non-null  float64 
 14  new_cases        13256 non-null  float64 
 15  new_deaths       13256 non-null  float64 
dtypes: float64(6), geometry(1), object(9)

In [7]:
covid_total_mortality_perDistrict_perMonth_shp.drop(columns=["lgd_district_id"], inplace=True)
covid_total_mortality_perDistrict_perMonth_shp.rename(columns={"total_cases": "total_case",
                                                               "total_deaths": "total_deat"}, inplace=True)

# Check the "mortality_perDistrict_perMonth_shp" table
covid_total_mortality_perDistrict_perMonth_shp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 13256 entries, 0 to 13299
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   lgd_sta_id  13256 non-null  object  
 1   lgd_sta_na  13256 non-null  object  
 2   lgd_dis_id  13256 non-null  object  
 3   lgd_dis_na  13256 non-null  object  
 4   pc11_st_id  13256 non-null  object  
 5   pc11_di_id  13256 non-null  object  
 6   pc11_st_na  13256 non-null  object  
 7   pc11_di_na  13256 non-null  object  
 8   geometry    13256 non-null  geometry
 9   year        13256 non-null  float64 
 10  month       13256 non-null  float64 
 11  total_case  13256 non-null  float64 
 12  total_deat  13256 non-null  float64 
 13  new_cases   13256 non-null  float64 
 14  new_deaths  13256 non-null  float64 
dtypes: float64(6), geometry(1), object(8)
memory usage: 1.6+ MB


In [8]:
# Export the "mortality_perDistrict_perMonth_shp" table to shapefile
os.mkdir("./covid_total_mortality_perDistrict_perMonth_shp")
covid_total_mortality_perDistrict_perMonth_shp.to_file(
    "./covid_total_mortality_perDistrict_perMonth_shp/covid_total_mortality_perDistrict_perMonth_shp.shp", driver="ESRI Shapefile")