In [2]:
# Extract daily new-case counts for three states and restrict them to a fixed date range

import pandas as pd
from collections import defaultdict
import warnings
import numpy as np

# Raw case table; the path is relative to the notebook's working directory
# and uses Windows-style separators.
data = pd.read_csv(r"datasets\Data\india_stats.csv")

# NOTE(review): this blanket filter hides every warning for the rest of the
# session, including pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# Kept as the cell's final expression so the preview is actually rendered;
# a bare expression in the middle of a cell produces no output.
data.head()

In [3]:
# Columns that are not used by the per-state new-case calculation below.
unused_columns = ["ConfirmedIndianNational","ConfirmedForeignNational","Sno","Time","Deaths","Cured"]

# Rebind instead of dropping in place: errors="ignore" keeps this cell
# re-runnable, whereas a second in-place drop raises KeyError because the
# columns are already gone.
data = data.drop(columns=unused_columns, errors="ignore")
data.head()

Unnamed: 0,Date,State/UnionTerritory,Confirmed
0,31-01-2020,Kerala,1
1,01-02-2020,Kerala,2
2,02-02-2020,Kerala,3
3,03-02-2020,Kerala,3
4,04-02-2020,Kerala,3


In [5]:
# Filter the national table into one independent frame per state.
# .copy() detaches each subset from `data`, so adding columns to a subset
# later does not raise SettingWithCopyWarning or depend on pandas' view/copy
# rules.

kldata = data[data["State/UnionTerritory"] == "Kerala"].copy()
tndata = data[data["State/UnionTerritory"] == "Tamil Nadu"].copy()
mhdata = data[data["State/UnionTerritory"] == "Maharashtra"].copy()

In [6]:
# Function to derive daily new cases from cumulative confirmed cases
def new_cases(df):
    '''Return a copy of ``df`` with an integer ``new_cases`` column.

    ``new_cases`` is the row-to-row difference of the ``Confirmed`` column.
    Rows for which no difference can be computed (the first row, or rows
    where ``Confirmed`` is missing) are dropped; NaNs in other columns no
    longer cause a row to be discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Confirmed`` column, ordered by date.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched, so the function is safe to
        call on a slice of another frame and safe to re-run.
    '''
    # assign() builds a new frame, so the caller's object is never mutated.
    with_delta = df.assign(new_cases=df['Confirmed'].diff())

    # Only the rows lacking a difference are removed.
    with_delta = with_delta.dropna(subset=['new_cases'])

    # diff() yields floats because of the leading NaN; counts are integers.
    return with_delta.astype({'new_cases': int})

# Replace each state frame with its new-cases version (Kerala, Tamil Nadu,
# Maharashtra, in that order).
kldata, tndata, mhdata = [
    new_cases(state_frame) for state_frame in (kldata, tndata, mhdata)
]

In [7]:
# Persist the per-state case tables, one CSV per state.
for state_frame, target_path in (
    (kldata, r"datasets\processed\kl_cases.csv"),
    (tndata, r"datasets\processed\tn_cases.csv"),
    (mhdata, r"datasets\processed\mh_cases.csv"),
):
    state_frame.to_csv(target_path)

In [9]:
# Preview the first Tamil Nadu rows to check the added new_cases column.
tndata.head()

Unnamed: 0,Date,State/UnionTerritory,Confirmed,new_cases
66,08-03-2020,Tamil Nadu,1,0
78,09-03-2020,Tamil Nadu,1,0
93,10-03-2020,Tamil Nadu,1,0
104,11-03-2020,Tamil Nadu,1,0
115,12-03-2020,Tamil Nadu,1,0


In [11]:
# Load the raw climate readings for Tamil Nadu, Kerala and Maharashtra.
tnclim, klclim, mhclim = [
    pd.read_csv(climate_path)
    for climate_path in (
        r"datasets\data\tn_climate.csv",
        r"datasets\data\kl_climate.csv",
        r"datasets\data\mh_climate.csv",
    )
]

In [13]:
# Inspect the last raw Tamil Nadu climate rows (timestamp, T, U).
tnclim.tail()

Unnamed: 0,Date,T,U
4156,08.03.2020 14:30,31.6,57.0
4157,08.03.2020 11:30,32.6,58.0
4158,08.03.2020 08:30,29.4,74.0
4159,08.03.2020 05:30,26.8,84.0
4160,08.03.2020 02:30,26.4,87.0


In [12]:
# Function to average temperature and humidity over selected daily readings

def clean(clim, times=("08:30", "11:30", "14:30")):
    """Collapse raw climate readings into one averaged row per day.

    Parameters
    ----------
    clim : pandas.DataFrame
        Raw readings, accessed by position: column 0 is a timestamp string
        whose first 10 characters are the date, column 1 is the temperature
        and column 2 the humidity.
    times : sequence of str, optional
        Time-of-day substrings to keep; a row is used when its timestamp
        contains any of them. Defaults to the three daytime readings.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``T`` (mean temperature, 2 decimals) and ``U``
        (mean humidity, 0 decimals), one row per date. Rows are in the
        reverse of the order in which dates first appear in ``clim``, with a
        fresh 0..n-1 index. An empty frame with those columns is returned
        when no reading matches.

    Notes
    -----
    The mean is taken over the readings actually present for a day instead
    of dividing the sum by a fixed 3, so a day with a missing reading is no
    longer under-estimated. Days whose selected readings are all missing are
    filled with the column mean.
    """
    stamps = clim.iloc[:, 0].astype(str)

    # Mark rows whose timestamp mentions any of the requested times.
    wanted = pd.Series(False, index=stamps.index)
    for slot in times:
        wanted = wanted | stamps.str.contains(slot, regex=False)
    mask = wanted.to_numpy(dtype=bool)

    picked = clim.iloc[mask, :3]
    readings = pd.DataFrame({
        "Date": stamps.iloc[mask].str[:10].to_numpy(),
        "T": picked.iloc[:, 1].to_numpy(),
        "U": picked.iloc[:, 2].to_numpy(),
    })
    if readings.empty:
        return pd.DataFrame(columns=["Date", "T", "U"])

    # sort=False keeps dates in first-appearance order.
    daily = readings.groupby("Date", sort=False, as_index=False)[["T", "U"]].mean()
    daily["T"] = daily["T"].round(2)
    daily["U"] = daily["U"].round(0)

    # Fill days that still have no value with the overall column mean.
    daily = daily.fillna({"T": daily["T"].mean(), "U": daily["U"].mean()})

    # Reverse the first-appearance order of the dates.
    return daily.iloc[::-1].reset_index(drop=True)

# Reduce each raw climate table to daily averages (Tamil Nadu, Kerala,
# Maharashtra, in that order).

climtnclean, climklclean, climmhclean = [
    clean(raw_climate) for raw_climate in (tnclim, klclim, mhclim)
]

In [None]:
# Persist the cleaned daily climate tables, one CSV per state.
for climate_frame, target_path in (
    (climmhclean, r"datasets\processed\mh_clim.csv"),
    (climklclean, r"datasets\processed\kl_clim.csv"),
    (climtnclean, r"datasets\processed\tn_clim.csv"),
):
    climate_frame.to_csv(target_path)

In [15]:
# Adding climate features to the new-cases dataset
def addfeatures(sour, dest):
    """Return a copy of ``dest`` with climate columns ``T`` and ``H`` added.

    Values are taken from ``sour["T"]`` and ``sour["U"]`` and matched to the
    rows of ``dest`` purely by position.
    NOTE(review): this assumes both frames start on the same day and have one
    row per consecutive day; confirm, or join on the date instead.

    Parameters
    ----------
    sour : pandas.DataFrame
        Climate frame with ``T`` and ``U`` columns.
    dest : pandas.DataFrame
        Case frame that receives the features.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dest`` is not modified. When ``sour`` is longer than
        ``dest`` the surplus climate rows are ignored. When it is shorter the
        missing rows are padded with the mean of the available values
        (``T`` rounded to 2 decimals, ``H`` to 0), or NaN when ``sour`` has
        no rows at all.
    """
    wanted = len(dest)
    temps = list(sour["T"])[:wanted]
    humid = list(sour["U"])[:wanted]

    shortfall = wanted - len(temps)
    if shortfall > 0:
        if temps:
            # Computed once: the padding value is the mean of the real data.
            temp_fill = round(sum(temps) / len(temps), 2)
            humid_fill = round(sum(humid) / len(humid), 0)
        else:
            # No climate data at all: avoid dividing by zero.
            temp_fill = humid_fill = float("nan")
        temps.extend([temp_fill] * shortfall)
        humid.extend([humid_fill] * shortfall)

    # Work on a copy so a slice passed in by the caller is never mutated.
    enriched = dest.copy()
    enriched["T"] = temps
    enriched["H"] = humid
    return enriched

# Attach the cleaned climate features to each state's case table
# (Tamil Nadu, Kerala, Maharashtra, in that order).
tnfinal, klfinal, mhfinal = [
    addfeatures(climate_frame, case_frame)
    for climate_frame, case_frame in (
        (climtnclean, tndata),
        (climklclean, kldata),
        (climmhclean, mhdata),
    )
]

In [16]:
# Function to trim records from the cutoff date (01-01-2021) onwards
def trimdate(df, cutoff="01-01-2021"):
    """Keep the rows that precede the cutoff date and normalise the layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first six columns are, by position: date string in
        ``dd-mm-yyyy`` form, state, confirmed, new cases, T, H. Rows are
        expected to be in chronological order.
    cutoff : str, optional
        First date (``dd-mm-yyyy``) to exclude. Defaults to ``01-01-2021``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``State``, ``Confirmed``, ``New Cases``, ``T``,
        ``H`` with a fresh 0..n-1 index; ``-`` in the dates is replaced by
        ``/``. Every row before the first one dated on or after ``cutoff``
        is kept. If no such row exists, all rows are kept; the function
        never returns ``None``.
    """
    dates = df.iloc[:, 0].astype(str)

    # Compare real dates, so a gap on the exact cutoff day still stops the
    # scan; unparseable dates become NaT and never trigger the cutoff.
    parsed = pd.to_datetime(dates, format="%d-%m-%Y", errors="coerce")
    limit = pd.to_datetime(cutoff, format="%d-%m-%Y")
    reached = (parsed >= limit).to_numpy()
    stop = int(reached.argmax()) if reached.any() else len(df)

    trimmed = df.iloc[:stop, :6].copy()
    trimmed.columns = ["Date", "State", "Confirmed", "New Cases", "T", "H"]
    trimmed["Date"] = dates.iloc[:stop].str.replace("-", "/", regex=False).to_numpy()
    return trimmed.reset_index(drop=True)
            
# Trim every state's table first (Tamil Nadu, Maharashtra, Kerala), then
# write the final datasets in the same order.
tntrimmed, mhtrimmed, kltrimmed = [
    trimdate(final_frame) for final_frame in (tnfinal, mhfinal, klfinal)
]

for trimmed_frame, target_path in (
    (tntrimmed, r"datasets\processed\tn.csv"),
    (mhtrimmed, r"datasets\processed\mh.csv"),
    (kltrimmed, r"datasets\processed\kl.csv"),
):
    trimmed_frame.to_csv(target_path)
            