#### Import modules

In [1]:
import json 
import pandas as pd
import numpy as np
import datetime

#### Load Q1 json

In [2]:
# to open json file
f=open('neighbor-districts-modified.json')
# this function basically stores json type files into python dictionary
dist_modified=json.load(f)

#### Get district names and ids form Q1 json

In [3]:
district_list_from_json=[]
for key in dist_modified:
    district_list_from_json.append(key)
district_list_from_json=np.array(district_list_from_json)
district_list_from_json.sort()

state_district_codes=[]
for i in range(len(district_list_from_json)):
    state_district_codes.append(district_list_from_json[i].split('/')[1])

# district names - sample entry: churu
district_names_from_json=[] 

# district ids - sample entry: RJ_Churu
district_ids_from_json=[]

#use split() function and specify the separator '/' . Remember default seperator is whitspace
for i in range(len(district_list_from_json)):
    district_names_from_json.append(district_list_from_json[i].split("/")[0])
    district_ids_from_json.append(district_list_from_json[i].split("/")[1])

In [4]:
district_ids_list={}
for i in range(len(district_names_from_json)):
    district_ids_list[district_ids_from_json[i]]=district_names_from_json[i] 

#### Creating time ids
- [x] Start date: 2020-03-15
- [x] End date: 2021-08-14

In [5]:
# dictionaries for mapping dates to time ids
# eg. for 2020-3-15, time_id_week is 1, time_id_month is 1, time_id_ is 1

time_id_week = {}
time_id_month = {}
time_id_overall = {}

date=datetime.date(2020,3,15)
day=1

while True:
    # basically to cover overlapping weeks this part needs to be changed.
    # for now we are proceeding. but change week ids according above definition of 7-DMA
    time_id_week[str(date)]=int(np.ceil(day/7))
    
    if str(date)[0:4]=='2020':
        if int(str(date)[8:10]) <15:
            time_id_month[str(date)]=int(str(date)[5:7])-3
        else:
            time_id_month[str(date)]=int(str(date)[5:7])-2
    else:
        if int(str(date)[8:10]) <15:
            time_id_month[str(date)]=int(str(date)[5:7])+9
        else:
            time_id_month[str(date)]=int(str(date)[5:7])+10

    time_id_overall[str(date)]=1
    
    if date==datetime.date(2021,8,14):
        break
    day=day+1
    date=date+datetime.timedelta(days=1)

#### Read districts csv for cases

In [6]:
cols=['Date', 'State', 'District', 'Confirmed']
data_csv=pd.read_csv('districts.csv',usecols=cols)

In [7]:
data_csv=data_csv.sort_values(['District','Date'])
data_csv.reset_index(inplace=True,drop=True) #should not forget it

#### Getting intersection district names from districts csv and modified json

In [8]:
district_names_from_cases=[]
district_ids_from_cases=[]
district_uniques=np.array(np.unique(data_csv['District']))
for i in range(len(district_ids_from_json)):
    a=district_ids_from_json[i].split('_')
    for j in range(len(district_uniques)):
        if district_uniques[j]==a[1]:
            district_ids_from_cases.append(district_ids_from_json[i])
            district_names_from_cases.append(a[1])
            break

In [9]:
print("these many districts are there in districts.csv =",len(district_uniques)+5) # for those common five names
print("these many districts are in both files intersection =" ,len(district_ids_from_cases)) # though they are included in here

these many districts are there in districts.csv = 648
these many districts are in both files intersection = 622


In [10]:
print('-------To check whether things are consistent------')
print(district_ids_from_cases[67])
print(district_names_from_cases[67])

-------To check whether things are consistent------
MH_Beed
Beed


#### To get daily cases from cummulative confirmed cases
- [x] Formula use: cases in present day = present day cases - previous day cases

In [11]:
%%time
# getting daily cases
data_csv['Daily Cases']=np.nan
for i in range(len(district_names_from_cases)):
    foo_df = data_csv[data_csv['District']==district_names_from_cases[i]]
    foo_cases = foo_df.iloc[0,3]
    foo_df['Daily Cases']=foo_df['Confirmed'].diff()
    foo_df.iloc[0,4]=foo_cases
    data_csv.loc[data_csv['District']==district_names_from_cases[i],'Daily Cases']=foo_df['Daily Cases']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


CPU times: user 18.4 s, sys: 3.69 ms, total: 18.4 s
Wall time: 18.4 s


#### these many rows can be dropped related to districts that are not in intersection

In [12]:
data_csv.isnull().sum() #these many interies are NaN for those districts

Date               0
State              0
District           0
Confirmed          0
Daily Cases    17209
dtype: int64

In [13]:
data_csv.dropna(inplace=True)

#### Giving week and month ID to each date

In [14]:
dates_in_raw=np.unique(data_csv['Date']).tolist()

In [15]:
%%time
#give week and month id to dates
data_csv['Week ID']=np.nan
data_csv['Month ID']=np.nan
for date in time_id_week:
    if dates_in_raw.count(date)>0:
        data_csv.loc[data_csv['Date']==date,'Week ID']=time_id_week[date]
        data_csv.loc[data_csv['Date']==date,'Month ID']=time_id_month[date]

CPU times: user 15.8 s, sys: 3.96 ms, total: 15.8 s
Wall time: 15.8 s


#### Dropping rows with out of range dates
- [x] These many dates are after end date: 2021-08-14 (for all districts in intersection)

In [16]:
data_csv.isnull().sum()

Date               0
State              0
District           0
Confirmed          0
Daily Cases        0
Week ID        11818
Month ID       11818
dtype: int64

In [17]:
data_csv.dropna(inplace=True)

#### For week wise cases for all districts in intersection

- [x] Delhi also can be included but I dont't think I should write special code just to include one District.

In [18]:
%%time
no_of_weeks=list(time_id_week.values())[-1]
districtid=[]
weekid=[]
cases=[]
for i in range(len(district_names_from_cases)): # if delhi etc are not there just 
    foo=data_csv[data_csv['District']==district_names_from_cases[i]]
    for j in range(no_of_weeks):
        districtid.append(district_ids_from_cases[i])
        weekid.append(j+1)
        cases.append(foo[foo['Week ID']==j+1]['Daily Cases'].sum())
        
week_df=pd.DataFrame({'districtid':districtid,'weekid':weekid,'cases':cases})
week_df.sort_values("districtid",axis=0,ascending=True, inplace=True, kind='mergesort')
week_df.reset_index(inplace=True,drop=True)
week_df.to_csv('cases-week.csv',index=False) 

CPU times: user 27.4 s, sys: 15 µs, total: 27.4 s
Wall time: 27.4 s


#### For month wise cases for all districts in intersection

In [19]:
%%time
no_of_months=list(time_id_month.values())[-1]
districtid=[]
monthid=[]
cases=[]
for i in range(len(district_names_from_cases)): # if delhi etc are not there just 
    data_foo=data_csv[data_csv['District']==district_names_from_cases[i]]
    for j in range(no_of_months):
        districtid.append(district_ids_from_cases[i])
        monthid.append(j+1)
        cases.append(data_foo[data_foo['Month ID']==j+1]['Daily Cases'].sum())
        
month_df=pd.DataFrame({'districtid':districtid,'monthid':monthid,'cases':cases})
month_df.sort_values("districtid",axis=0,ascending=True, inplace=True, kind='mergesort')
month_df.reset_index(inplace=True,drop=True)
month_df.to_csv('cases-month.csv',index=False) 

CPU times: user 11.9 s, sys: 0 ns, total: 11.9 s
Wall time: 11.9 s


#### Overall cases for all districts in intersection

In [20]:
%%time
districtid=[]
overallid=[]
cases=[]

for i in range(len(district_names_from_cases)):
    #for j in range(len(np.unique(raw['Week ID']))):
    districtid.append(district_ids_from_cases[i])
    overallid.append(1)
    cases.append(data_csv[data_csv['District']==district_names_from_cases[i]]['Daily Cases'].sum())
        
overall_df=pd.DataFrame({'districtid':districtid,'overallid':overallid,'cases':cases})
overall_df.sort_values("districtid",axis=0,ascending=True, inplace=True, kind='mergesort')
overall_df.reset_index(inplace=True,drop=True)
overall_df.to_csv('cases-overall.csv',index=False) 

CPU times: user 7.37 s, sys: 0 ns, total: 7.37 s
Wall time: 7.37 s
