In [7]:
import pandas as pd
from collections import defaultdict
import warnings
import numpy as np

# NOTE(review): this silences every warning for the rest of the session,
# presumably to hide pandas' SettingWithCopyWarning raised when columns are
# assigned onto the filtered per-state frames further down — confirm that is
# the only warning being hidden.
warnings.filterwarnings('ignore')

# Raw national table: one row per date and state/union territory.
data = pd.read_csv(r"../../datasets/Data/india_stats.csv")

# Kept as the last expression so the notebook actually renders the preview;
# in the middle of the cell its result was discarded.
data.head()

In [8]:
# Discard the columns this analysis does not use; Date, State/UnionTerritory
# and Confirmed are what remain.
data = data.drop(
    columns=[
        "ConfirmedIndianNational",
        "ConfirmedForeignNational",
        "Sno",
        "Time",
        "Deaths",
        "Cured",
    ]
)
data.head()

Unnamed: 0,Date,State/UnionTerritory,Confirmed
0,31-01-2020,Kerala,1
1,01-02-2020,Kerala,2
2,02-02-2020,Kerala,3
3,03-02-2020,Kerala,3
4,04-02-2020,Kerala,3


In [9]:
# One frame per state of interest. Later cells assign new columns onto these
# frames, so each is taken as an explicit copy rather than a filtered view of
# `data` (assigning onto the bare slice is what triggers pandas'
# SettingWithCopyWarning).
kldata=data[data["State/UnionTerritory"]=="Kerala"].copy()
tndata=data[data["State/UnionTerritory"]=="Tamil Nadu"].copy()
mhdata=data[data["State/UnionTerritory"]=="Maharashtra"].copy()

In [10]:
def new_cases(df):
    '''Return a new frame with a ``new_cases`` column of row-over-row increases.

    ``new_cases`` is ``Confirmed`` minus the ``Confirmed`` value of the
    previous row. Rows that contain any missing value are dropped afterwards;
    this always includes the first row, which has no previous row to compare
    against.

    The input frame is not modified. Assigning into ``df`` directly would
    alter the caller's object and triggers pandas' SettingWithCopyWarning
    when ``df`` is a filtered slice of a larger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Confirmed`` column, with rows already in the
        order in which they should be differenced.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the dropped rows, original index labels kept,
        plus an integer ``new_cases`` column.
    '''
    with_increase = df.assign(new_cases=df['Confirmed'].diff()).dropna()

    # diff() produces floats because of the leading NaN; once that row is gone
    # the column is cast back to integers (truncating, like int()).
    return with_increase.astype({'new_cases': 'int64'})

# Rebind each state frame to the result of new_cases (Kerala, Tamil Nadu,
# Maharashtra, in that order).
kldata, tndata, mhdata = (new_cases(frame) for frame in (kldata, tndata, mhdata))

In [11]:
# state={"Kerala":kldata,"Tamil Nadu":tndata,"Maharashtra":mhdata}



# for i in state:
#     print(f"length of {i} : {state[i].shape}")
#     print("Dates are unique : ",len(state[i]["Date"].unique())==state[i].shape[0] ) #Checking for duplicates in date 
#     print("")

In [12]:
# Preview the first rows of the Kerala frame now that new_cases has been added.
kldata.head()

Unnamed: 0,Date,State/UnionTerritory,Confirmed,new_cases
1,01-02-2020,Kerala,2,1
2,02-02-2020,Kerala,3,1
3,03-02-2020,Kerala,3,0
4,04-02-2020,Kerala,3,0
5,05-02-2020,Kerala,3,0


In [13]:
# kldata.to_csv(r"../../datasets/processed/kl_cases.csv")
# tndata.to_csv(r"../../datasets/processed/tn_cases.csv")
# mhdata.to_csv(r"../../datasets/processed/mh_cases.csv")

In [14]:
# Preview the first rows of the Maharashtra frame now that new_cases has been added.
mhdata.head()

Unnamed: 0,Date,State/UnionTerritory,Confirmed,new_cases
90,10-03-2020,Maharashtra,5,3
96,11-03-2020,Maharashtra,8,3
119,12-03-2020,Maharashtra,11,3
132,13-03-2020,Maharashtra,14,3
145,14-03-2020,Maharashtra,14,0


In [15]:
# Raw climate tables, one per state. Forward slashes keep these relative paths
# valid on every OS; the backslash form only resolves on Windows.
# NOTE(review): the folder is spelled "data" here but "Data" where
# india_stats.csv is loaded — confirm the real name, since the difference
# matters on case-sensitive filesystems.
tnclim = pd.read_csv(r"../../datasets/data/tn_climate.csv")
klclim = pd.read_csv(r"../../datasets/data/kl_climate.csv")
mhclim = pd.read_csv(r"../../datasets/data/mh_climate.csv")

In [16]:
def clean(clim):
    """Collapse timestamped climate readings into one averaged row per date.

    Only rows whose first-column text contains one of the observation times
    08:30, 17:30 or 23:30 are used. The first ten characters of that text are
    taken as the date key, and the second and third columns are summed per
    date, divided by the number of observation times (3) and rounded to 2 and
    0 decimals respectively. Dates whose result is missing are filled with the
    mean of the other dates, and the rows are returned in reverse order of
    first appearance.

    The original built the frame with ``DataFrame.append`` one row at a time,
    which no longer exists in pandas 2.x and is quadratic; the frame is now
    built once from the collected rows.

    Parameters
    ----------
    clim : pandas.DataFrame
        Column 0 holds the timestamp text, columns 1 and 2 the numeric
        readings (presumably temperature and relative humidity — the output
        names "T"/"U" are taken as-is; confirm against the data source).

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``T`` and ``U`` with a fresh 0..n-1 index. An input
        without any matching row yields an empty frame with those columns.
    """
    time = ("08:30", "17:30", "23:30")
    columns = ["Date", "T", "U"]

    # date -> [sum of column 1, sum of column 2]; dicts keep first-seen order.
    totals = {}
    for stamp, temp, hum in zip(clim.iloc[:, 0], clim.iloc[:, 1], clim.iloc[:, 2]):
        stamp = str(stamp)  # a missing timestamp becomes "nan" and matches nothing
        for j in time:
            if j in stamp:
                day = totals.setdefault(stamp[0:10], [0, 0])
                # A NaN reading makes the whole day's sum NaN; it is filled below.
                day[0] += temp
                day[1] += hum

    if not totals:
        return pd.DataFrame(columns=columns)

    finalclim = pd.DataFrame(
        [(date, sums[0], sums[1]) for date, sums in totals.items()],
        columns=columns,
    )

    # NOTE(review): the divisor is the fixed number of observation times, so a
    # date with fewer (or duplicated) matching rows is still divided by 3 —
    # confirm whether a per-date count is wanted instead.
    readings = len(time)
    finalclim["T"] = (finalclim["T"] / readings).round(2)
    finalclim["U"] = (finalclim["U"] / readings).round(0)

    # Plain assignment instead of fillna(inplace=True) on a column selection,
    # which does not reliably write back to the frame.
    finalclim["T"] = finalclim["T"].fillna(finalclim["T"].mean())
    finalclim["U"] = finalclim["U"].fillna(finalclim["U"].mean())

    return finalclim.iloc[::-1].reset_index(drop=True)

# Daily-averaged climate tables for Tamil Nadu, Kerala and Maharashtra.
climtnclean, climklclean, climmhclean = map(clean, (tnclim, klclim, mhclim))



In [17]:
# Persist the cleaned climate tables. Forward slashes keep the relative paths
# valid on every OS (with backslashes the files only land in the intended
# folder on Windows). index=False is not passed, so the frame index is written
# as the first CSV column.
climmhclean.to_csv(r"../../datasets/processed/mh_clim.csv")
climklclean.to_csv(r"../../datasets/processed/kl_clim.csv")
climtnclean.to_csv(r"../../datasets/processed/tn_clim.csv")

In [18]:
def addfeatures(sour,dest):
    """Attach the climate columns of ``sour`` to ``dest`` by row position.

    The ``T`` and ``U`` values of ``sour`` are copied into plain lists and
    fitted to ``len(dest)``: a list that is too short is extended one value at
    a time with the rounded mean of everything currently in it (2 decimals for
    ``T``, 0 for ``U``); a list that is too long loses its trailing values.
    The lists are then stored on ``dest`` as columns ``T`` and ``H``.

    ``dest`` is modified in place and is also the return value.

    NOTE(review): rows are paired purely by position, never by date — confirm
    that both frames start on the same day.
    """
    wanted = len(dest)
    temps = list(sour["T"])
    humid = list(sour["U"])

    # Pad with the running mean until the lists are as long as dest.
    while len(temps) < wanted:
        temps.append(round(sum(temps)/len(temps),2))
        humid.append(round(sum(humid)/len(humid),0))

    # Drop any surplus values from the end (a no-op after padding).
    del temps[wanted:]
    del humid[wanted:]

    dest["T"] = temps
    dest["H"] = humid
    return dest

In [19]:
# Pair each state's cleaned climate table with its case frame. addfeatures
# writes the T/H columns onto the case frame it is given and returns it.
tnfinal, klfinal, mhfinal = (
    addfeatures(climate, cases)
    for climate, cases in (
        (climtnclean, tndata),
        (climklclean, kldata),
        (climmhclean, mhdata),
    )
)

In [20]:
def trimdate(df):
    """Return a renamed copy of ``df`` whose dates use "/" instead of "-".

    The first six columns of ``df`` are taken by position and renamed to
    ``Date``, ``State``, ``Confirmed``, ``New Cases``, ``T`` and ``H``; any
    further columns are ignored. Every "-" in the ``Date`` text is replaced
    with "/". The input frame is left unchanged.

    The original rebuilt the frame row by row with ``DataFrame.append``, which
    was removed in pandas 2.0 and is quadratic; the same result is produced
    here with column-wise operations.

    Parameters
    ----------
    df : pandas.DataFrame
        At least six columns, the first holding date strings.

    Returns
    -------
    pandas.DataFrame
        Six-column frame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` has fewer than six columns.
    """
    columns = ["Date", "State", "Confirmed", "New Cases", "T", "H"]

    ndf = df.iloc[:, :len(columns)].copy()
    ndf.columns = columns
    # Literal replacement, equivalent to "/".join(date.split("-")).
    ndf["Date"] = ndf["Date"].str.replace("-", "/", regex=False)
    return ndf.reset_index(drop=True)

In [26]:
# Final per-state tables with renamed columns and "/"-separated dates.
tntrimmed, mhtrimmed, kltrimmed = [
    trimdate(final) for final in (tnfinal, mhfinal, klfinal)
]

# tntrimmed.to_csv(r"..\..\datasets\processed\tn.csv",index=False)
# mhtrimmed.to_csv(r"..\..\datasets\processed\mh.csv",index=False)
# kltrimmed.to_csv(r"..\..\datasets\processed\kl.csv",index=False)


In [32]:
def rt_columns(df):
    """Return a two-column frame: ``dates`` (from ``Date``) and ``I`` (from ``New Cases``).

    The result is a new frame that keeps the index of ``df``; ``df`` itself is
    not modified.
    """
    renamed = {"Date": "dates", "New Cases": "I"}
    return df[list(renamed)].rename(columns=renamed)

In [33]:
# Export the dates / I tables for each state. Forward slashes keep the
# relative paths valid on every OS (the escaped-backslash form only resolves
# on Windows).
tnrtdata=rt_columns(tntrimmed)
tnrtdata.to_csv("../../datasets/rt/tnrtdata.csv",index=False)
klrtdata=rt_columns(kltrimmed)
klrtdata.to_csv("../../datasets/rt/klrtdata.csv",index=False)
mhrtdata=rt_columns(mhtrimmed)
mhrtdata.to_csv("../../datasets/rt/mhrtdata.csv",index=False)

In [37]:
# Last rows of the Tamil Nadu frame. T and H are present because addfeatures
# assigned them onto the frame it was given as dest.
tndata.tail()

Unnamed: 0,Date,State/UnionTerritory,Confirmed,new_cases,T,H
17959,07-08-2021,Tamil Nadu,2571383,1985,30.8,72.0
17995,08-08-2021,Tamil Nadu,2573352,1969,31.07,75.0
18031,09-08-2021,Tamil Nadu,2575308,1956,31.07,78.0
18067,10-08-2021,Tamil Nadu,2577237,1929,30.73,72.0
18103,11-08-2021,Tamil Nadu,2579130,1893,30.2,78.0


In [38]:
# Last rows of the Kerala frame. T and H are present because addfeatures
# assigned them onto the frame it was given as dest.
kldata.tail()

Unnamed: 0,Date,State/UnionTerritory,Confirmed,new_cases,T,H
17945,07-08-2021,Kerala,3513551,19948,26.33,93.0
17981,08-08-2021,Kerala,3533918,20367,27.0,91.0
18017,09-08-2021,Kerala,3552525,18607,27.33,89.0
18053,10-08-2021,Kerala,3565574,13049,26.67,89.0
18089,11-08-2021,Kerala,3586693,21119,27.0,89.0


In [39]:
# Last rows of the Maharashtra frame. T and H are present because addfeatures
# assigned them onto the frame it was given as dest.
mhdata.tail()

Unnamed: 0,Date,State/UnionTerritory,Confirmed,new_cases,T,H
17949,07-08-2021,Maharashtra,6341759,5539,29.13,77.0
17985,08-08-2021,Maharashtra,6347820,6061,29.07,80.0
18021,09-08-2021,Maharashtra,6353328,5508,29.33,76.0
18057,10-08-2021,Maharashtra,6357833,4505,29.0,77.0
18093,11-08-2021,Maharashtra,6363442,5609,28.87,75.0


In [31]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.
