![](CRISP_DM.png)

In [None]:
#Importing required packages
import os
import pandas as pd
import numpy as np
from sklearn import linear_model
from scipy import signal
%matplotlib inline
pd.set_option('display.max_rows', 500)

In [None]:
# Set the base path so that the full notebook can be executed with one click.
# When the kernel starts inside the 'notebooks' folder, step up one level to the
# parent directory instead of jumping to a user-specific absolute path, so the
# relative 'data/...' paths used further down resolve on any machine.
# NOTE(review): assumes 'notebooks' sits directly inside the project root - confirm.
if os.path.split(os.getcwd())[-1]=='notebooks':
    os.chdir(os.pardir)

'Your base path for this project is: '+os.path.split(os.getcwd())[-1]

## 3.1 Applying 'groupby' on large relational dataset

In [None]:
# Read the sorted relational data set produced by the last notebook;
# the first column of the ';'-separated file is parsed as dates
pd_JH_data=pd.read_csv(
    'data/processed/COVID_relational_confirmed.csv',
    sep=';',
    parse_dates=[0],
)
# Order the rows by ascending date, then replace the index with a fresh one
pd_JH_data=pd_JH_data.sort_values('date',ascending=True)
pd_JH_data=pd_JH_data.reset_index(drop=True).copy()
pd_JH_data.head()

## 3.2 Applying 'groupby' on Test data

In [None]:
# Test subset: rows of the US and Germany dated after 2020-03-20
test_data=pd_JH_data[
    pd_JH_data['country'].isin(['US','Germany'])
    & (pd_JH_data['date']>'2020-03-20')
]
test_data.tail()

In [None]:
# Use groupby to get, per country, the maximum of every column
# (e.g. the highest number of confirmed cases).
# The string alias 'max' replaces np.max: passing NumPy reducers to agg() is
# deprecated in pandas 2.x, and the alias keeps the groupby max behaviour.
test_data.groupby(['country']).agg('max')

### 3.2.1 Doubling time calculation via regression

In [None]:
reg = linear_model.LinearRegression(fit_intercept=True)
#defining a function for calculating doubling time
def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling time.

    A straight line is fitted through three consecutive values placed at
    x = -1, 0, 1 and the ratio intercept/slope is returned, i.e. the fitted
    value at the centre of the window divided by the fitted change per step.

    Parameters
    ----------
    in_array : array-like
        Exactly three numeric values.

    Returns
    -------
    scalar
        intercept/slope as a single number (not a length-1 array), so it can
        be used directly as the result of a rolling apply. A slope of zero is
        divided as-is and follows NumPy's division rules.

    Raises
    ------
    AssertionError
        If in_array does not hold exactly three values.
    '''
    # validate first, so a wrong window size fails here and not inside the fit
    assert len(in_array)==3

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    reg.fit(X,y)
    intercept=reg.intercept_
    # coef_ holds one entry per feature; pick the single slope so that the
    # function returns a scalar instead of a one-element array
    slope=reg.coef_[0]
    return intercept/slope

### 3.2.3 Rolling regression
#### Theory: understanding the merge concept in Python (pandas)
<font color=green> **Inner Merge / Inner join** – The default Pandas behaviour, only keep rows where the merge “on” value exists in both the left and right dataframes.  
   **Left Merge / Left outer join** – (aka left merge or left join) Keep every row in the left dataframe. Where there are missing values of the “on” variable in the right dataframe, add empty / NaN values in the result.    
   **Right Merge / Right outer join** – (aka right merge or right join) Keep every row in the right dataframe. Where there are missing values of the “on” variable in the left dataframe, add empty / NaN values in the result.  
   **Outer Merge / Full outer join** – A full outer join returns all the rows from the left dataframe, all the rows from the right dataframe, and matches up rows where possible, with NaNs elsewhere.</font>

In [None]:
#Defining function for calculation of rolling regression
def rolling_regression(df_input,col='confirmed'):
    ''' Apply get_doubling_time_via_regression over a rolling window of one column.

    df_input has to be a data frame; col names the column to evaluate.
    The return value is a single series (mandatory for group by apply).
    A result is only computed once a full window of 3 values is available.
    '''
    days_back=3
    # window size and minimum number of observations are both days_back
    windows=df_input[col].rolling(window=days_back,min_periods=days_back)
    return windows.apply(get_doubling_time_via_regression,raw=False)

In [None]:
# Group the test data by the 'state' and 'country' columns and then run the
# rolling regression on 'confirmed' per group (the trailing ';' hides the output)
test_data[['state','country','confirmed']]\
    .groupby(['state','country'])\
    .apply(rolling_regression,'confirmed');

In [None]:
# Run the rolling regression per ('state','country') group of the main data frame;
# reset_index() afterwards turns the index levels of the result into columns
pd_DR_result=pd_JH_data[['state','country','confirmed']]\
    .groupby(['state','country'])\
    .apply(rolling_regression,'confirmed')\
    .reset_index()

In [None]:
# rename the column name for convenience
pd_DR_result=pd_DR_result.rename(columns={'confirmed':'confirmed_DR','level_2':'index'})
# Expose the row index of the main data frame as an 'index' column.
# Guarded so that re-running this cell does not pile up extra 'level_0'
# columns (and eventually fail) once the 'index' column already exists.
if 'index' not in pd_JH_data.columns:
    pd_JH_data=pd_JH_data.reset_index()
pd_JH_data.head()

In [None]:
# Left join: attach the rolling-regression result to the main data frame
# through the column both frames share, 'index'
pd_result_larg=pd_JH_data.merge(
    pd_DR_result[['index','confirmed_DR']],
    on=['index'],
    how='left',
)
pd_result_larg.head()

## 3.3 Applying Filter (savgol) on data

In [None]:
# define savgol_filter for filtering the data with groupby function
def savgol_fil(df_input,column='confirmed',window=5,degree=1):
    ''' Savgol filter which can be used in a groupby apply call; the data structure is kept.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame holding the column to smooth. The filtered column is added to
        this frame in place and the same object is returned.
    column : str
        Name of the column to filter.
    window : int
        Window length handed to scipy.signal.savgol_filter.
    degree : int
        Polynomial order handed to scipy.signal.savgol_filter.

    Returns
    -------
    pandas.DataFrame
        df_input, extended by the column column+'_filtered'.
    '''
    # fill up empty rows with 0 before the values are handed to the filter
    filter_in=df_input[column].fillna(0)
    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree) # order of the fitted polynomial
    df_input[column+'_filtered']=result
    return df_input

In [None]:
# Group by the 'state' and 'country' columns and then apply the Savgol filter per group.
# group_keys=False keeps the original row index on the result: pandas >= 2.0
# otherwise prepends the group keys to the index, and reset_index() then fails
# because 'state' and 'country' already exist as columns. With the original
# index kept, reset_index() exposes it as the 'index' column.
pd_filtered_result=pd_JH_data[['state','country','confirmed']]\
    .groupby(['state','country'],group_keys=False)\
    .apply(savgol_fil)\
    .reset_index()

In [None]:
# Left join: add the filtered values to the large data frame through the shared 'index' column
pd_result_larg=pd_result_larg.merge(
    pd_filtered_result[['index','confirmed_filtered']],
    on=['index'],
    how='left',
)
pd_result_larg.head()

## 3.4 Calculating doubling rate on filtered data

In [None]:
# Doubling rate of the filtered values: rolling regression per ('state','country')
# group, stored in a new data frame
pd_filtered_doubling=pd_result_larg[['state','country','confirmed_filtered']]\
                                .groupby(['state','country'])\
                                .apply(rolling_regression,'confirmed_filtered')
# turn the index levels into columns and give the columns their final names
pd_filtered_doubling=pd_filtered_doubling.reset_index()\
                                .rename(columns={'confirmed_filtered':'confirmed_filtered_DR','level_2':'index'})

In [None]:
# Merge as above: left join on the 'index' column
pd_result_larg=pd_result_larg.merge(
    pd_filtered_doubling[['index','confirmed_filtered_DR']],
    on=['index'],
    how='left',
)
pd_result_larg.head()

#### Masks in python
<font color=green> When working with data arrays, masks can be extremely useful. A mask is an array of boolean values that is True wherever a given condition is met.
    </font>

In [None]:
# Boolean mask: True for rows with more than 100 confirmed cases
df_mask=pd_result_larg['confirmed']>100
# Keep the filtered doubling rate only where the mask is True; every other row becomes NaN
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2.0)
pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(df_mask, other=np.nan)
#checking data for 'Germany' from the end
pd_result_larg[pd_result_larg['country']=='Germany'].tail()

In [None]:
# Write the final data set as a ';'-separated CSV file without the row index
pd_result_larg.to_csv(
    'data/processed/COVID_final_set.csv',
    sep=';',
    index=False,
)