In [2]:
# Importing Libraries
import pandas as pd

In [3]:
# NaN value treatment for the status_change_date column:
# load the raw patients file and discard rows where that column is missing.
patients_ds = pd.read_csv('patients_data.csv').dropna(subset=['status_change_date'])

# Save the filtered patients dataset (without the index) for the next step.
patients_ds.to_csv('patients_ds_mod.csv', index=False)

In [4]:
# Date parser for the date columns, which are stored day-first as DD-MM-YYYY.
# pd.to_datetime is used instead of pd.datetime.strptime because the
# pd.datetime alias was deprecated in pandas 1.0 and removed in later releases.
# It accepts a single string as well as a whole Series; dates that do not match
# the format still raise an error.
mydateparser = lambda x: pd.to_datetime(x, format="%d-%m-%Y")

# Reading the modified patients dataset.
# The date columns are converted after loading rather than through
# read_csv(parse_dates=..., date_parser=...), since the date_parser argument is
# deprecated in pandas 2.x.
patients_ds = pd.read_csv("patients_ds_mod.csv")
patients_ds['date_announced'] = mydateparser(patients_ds['date_announced'])
patients_ds['status_change_date'] = mydateparser(patients_ds['status_change_date'])

In [5]:
# Date column formatting as per NobBS: both date columns are reduced to plain
# calendar dates (the .dt.date accessor yields datetime.date objects).
patients_ds = patients_ds.assign(
    date_announced=pd.to_datetime(patients_ds['date_announced'], format='%Y-%m-%d').dt.date,
    status_change_date=pd.to_datetime(patients_ds['status_change_date'], format='%Y-%m-%d').dt.date,
)

In [6]:
# Preview the first five patient records after the date formatting.
patients_ds.head(n=5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,current_status,notes,suspected_contacted_patient,nationality,type_of_transmission,status_change_date,source_1,source_2,source_3,backup_notes
0,1,KL-TS-P1,2020-01-30,20.0,F,Thrissur,Thrissur,Kerala,KL,Recovered,Travelled from Wuhan,,India,Imported,2020-02-14,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,
1,2,KL-AL-P1,2020-02-02,,,Alappuzha,Alappuzha,Kerala,KL,Recovered,Travelled from Wuhan,,India,Imported,2020-02-14,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan
2,3,KL-KS-P1,2020-02-03,,,Kasaragod,Kasaragod,Kerala,KL,Recovered,Travelled from Wuhan,,India,Imported,2020-02-14,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan
3,4,DL-P1,2020-03-02,45.0,M,East Delhi (Mayur Vihar),East Delhi,Delhi,DL,Recovered,"Travelled from Austria, Italy",,India,Imported,2020-03-15,https://www.indiatoday.in/india/story/not-a-ja...,https://economictimes.indiatimes.com/news/poli...,,Travel history to Italy and Austria
4,5,TS-P1,2020-03-02,24.0,M,Hyderabad,Hyderabad,Telangana,TG,Recovered,"Travelled from Dubai to Bangalore on 20th Feb,...",,India,Imported,2020-03-02,https://www.deccanherald.com/national/south/qu...,https://www.indiatoday.in/india/story/coronavi...,https://www.thehindu.com/news/national/coronav...,"Travel history to Dubai, Singapore contact"


In [7]:
# Reading the districts dataset.
# Note: the districts were mapped manually in Excel on the basis of population
# density; see my_covid_district_dataset.xlsx for details of the mapping.
dense_district_ds = pd.read_csv(filepath_or_buffer='pop_density_mapped_district.csv')

In [8]:
# Preview the first two rows of the district mapping.
dense_district_ds.head(n=2)

Unnamed: 0,District,Person,Male,Female,Sex Ratio,Density,mapped_district
0,Papum Pare,121750,64122,57628,899,35,North Cachar Hills
1,Uttarkashi,294179,151599,142580,941,37,North Cachar Hills


In [9]:
# Keep only the columns needed downstream: the two date columns, the
# demographic fields and the location fields.
patients_ds = patients_ds.loc[
    :,
    [
        'date_announced',
        'status_change_date',
        'age_bracket',
        'gender',
        'detected_district',
        'detected_city',
        'detected_state',
        'state_code',
    ],
]

In [19]:
# Join each patient row to its district's mapping row by district name.
# This is an inner join, so patients whose detected_district has no matching
# District entry are not carried into den_final_df.
den_final_df = patients_ds.merge(
    dense_district_ds,
    how='inner',
    left_on='detected_district',
    right_on='District',
)

In [20]:
# Row and column count of the merged patients/district frame.
den_final_df.shape

(1952, 15)

In [21]:
# Checking for NaN values in the gender column (dropna=False keeps the NaN
# bucket in the counts).
den_final_df.loc[:, 'gender'].value_counts(dropna=False)

NaN    1279
M       480
F       193
Name: gender, dtype: int64

In [22]:
# Replacing NaN values in Gender with M.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the selected column: that in-place call acts on an
# intermediate Series (chained assignment), raises a FutureWarning in pandas 2.x
# and leaves den_final_df unchanged once Copy-on-Write is enabled.
# NOTE(review): per the value counts in the previous cell, missing values
# outnumber the recorded ones, so this imputation dominates the gender column —
# confirm that defaulting to 'M' is intended.
den_final_df['gender'] = den_final_df['gender'].fillna(value='M')

In [32]:
# Expand the single-letter gender codes to full labels.
# replace() is used rather than map(): map() turns every value that is not a
# key of the dict into NaN, so re-running the cell (when the column already
# holds 'Male'/'Female') would wipe the whole column. replace() only touches
# exact 'M'/'F' values and leaves everything else as it is, which makes the
# cell safe to run more than once.
den_final_df['gender'] = den_final_df['gender'].replace({'M':'Male','F':'Female'})

In [None]:
# Renaming columns as per the NobBS R code requirement:
# date_announced becomes report_week.
# Alternative mapping kept for reference (not applied):
#   date_announced -> onset_week, status_change_date -> report_week
den_final_df = den_final_df.rename(columns={'date_announced': 'report_week'})

In [25]:
from datetime import datetime, timedelta

# onset_week is derived from report_week by subtracting a fixed 14-day offset.
# NOTE(review): presumably an assumed lag between onset and reporting — confirm.
# (An earlier variant simply copied report_week into onset_week.)
onset_offset = timedelta(days=14)
den_final_df['onset_week'] = den_final_df['report_week'] - onset_offset

In [26]:
# Preview the merged frame, now including the derived onset_week column.
den_final_df.head(n=5)

Unnamed: 0,report_week,status_change_date,age_bracket,gender,detected_district,detected_city,detected_state,state_code,District,Person,Male,Female,Sex Ratio,Density,mapped_district,onset_week
0,2020-02-03,2020-02-14,,M,Kasaragod,Kasaragod,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-01-20
1,2020-03-16,2020-03-16,,M,Kasaragod,Kalanadu,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-02
2,2020-03-19,2020-03-19,47.0,M,Kasaragod,Eriyal,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-05
3,2020-03-20,2020-03-20,52.0,M,Kasaragod,Kasaragod,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-06
4,2020-03-20,2020-03-20,27.0,M,Kasaragod,Kasaragod,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-06


In [34]:
# Saving the input file for the R code: only the onset/report dates and gender
# are written, without the index.
den_final_df.to_csv(
    'covid_data_April21.csv',
    columns=['onset_week', 'report_week', 'gender'],
    index=False,
)

In [33]:
# Preview the first five rows of the merged frame.
den_final_df.head(n=5)

Unnamed: 0,report_week,status_change_date,age_bracket,gender,detected_district,detected_city,detected_state,state_code,District,Person,Male,Female,Sex Ratio,Density,mapped_district,onset_week
0,2020-02-03,2020-02-14,,Male,Kasaragod,Kasaragod,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-01-20
1,2020-03-16,2020-03-16,,Male,Kasaragod,Kalanadu,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-02
2,2020-03-19,2020-03-19,47.0,Male,Kasaragod,Eriyal,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-05
3,2020-03-20,2020-03-20,52.0,Male,Kasaragod,Kasaragod,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-06
4,2020-03-20,2020-03-20,27.0,Male,Kasaragod,Kasaragod,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-06


In [40]:
# Only for Golaghat: keep the rows whose mapped_district value is 'Golaghat'.
golaghat_df = den_final_df.loc[den_final_df['mapped_district'].eq('Golaghat')]

In [42]:
# Write the Golaghat subset with only the onset/report dates and gender,
# without the index.
golaghat_df.to_csv(
    'golaghat_data_April21.csv',
    columns=['onset_week', 'report_week', 'gender'],
    index=False,
)

In [43]:
# Write the Golaghat subset with all of its columns, without the index.
golaghat_df.to_csv(path_or_buf='golaghat_full.csv', index=False)