![](CRISP_DM.png)

In [None]:
#Importing required packages
import os
import pandas as pd
import numpy as np
# import package for linear regression
from sklearn import linear_model
from scipy import signal
# for plotting
%matplotlib inline
pd.set_option('display.max_rows', 500)
import plotly.graph_objects as go
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.figsize'] = (16, 10)

In [None]:
# Set the base path so that the whole notebook can be executed with one click.
# When the kernel starts inside the `notebooks` folder, step up one level so the
# relative `data/...` paths used below resolve on any machine (no user-specific
# absolute path).
# NOTE(review): assumes `notebooks` sits directly under the project root - confirm.
if os.path.basename(os.getcwd())=='notebooks':
    os.chdir(os.pardir)

'Your base path for this project is: '+os.path.basename(os.getcwd())

In [None]:
# Load the flat table from the semicolon-separated CSV file
df_analyse=pd.read_csv('data/processed/COVID_small_flat_table.csv',sep=';')
# Every column after the first one is treated as a country
country_list=df_analyse.columns[1:]
# Preview the earliest dates. This is only a sorted view for display (df_analyse
# itself is left untouched); it is the last expression of the cell so that the
# notebook actually renders it.
df_analyse.sort_values('date',ascending=True).head()

## 6.1 Helper functions
* Create a function to plot different values quickly and in a similar format when evaluating a time series dataset.
* In Python, a function is a block of code that runs only when it is called. You can pass data, known as parameters, into a function. A function can return data, draw a plot, or perform another action specified in its body.

In [None]:
# define quick_plotting function to plot time series dataset
def quick_plotting(x_in, df_input,y_scale='log',slider=False):
    """Draw every column of ``df_input`` against ``x_in`` as one Plotly line chart.

    Parameters
    ----------
    x_in : array-like
        Values for the x axis, shared by all traces.
    df_input : pandas.DataFrame
        One trace is added per column; the column name is used as legend entry.
    y_scale : str, default 'log'
        Passed to ``fig.update_yaxes(type=...)``, e.g. 'log' or 'linear'.
        The scale is also named in the y-axis title.
    slider : bool, default False
        When truthy, a range slider is shown on the x axis.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    fig = go.Figure()

    for column in df_input.columns:
        fig.add_trace(go.Scatter(x=x_in, y=df_input[column], name=column, opacity=1.0))

    fig.update_layout(
        autosize=True, width=800, height=800,
        xaxis_title='Timeline in days',
        # The title follows the requested scale instead of always claiming "log-scale"
        yaxis_title='Confirmed infected people (Source:Johns-hopkins CSSE, {}-scale)'.format(y_scale),
        font=dict(family="PT Sans, monospace", size=14, color="#850303"),
    )
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(tickangle=-45, nticks=20, tickfont=dict(size=12, color="#850303"))
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()

In [None]:
# Try the helper: all columns after the first one, linear y axis, with range slider
quick_plotting(
    df_analyse['date'],
    df_analyse.iloc[:, 1:],
    y_scale='linear',
    slider=True,
)

### 6.1.1 Modify dataframe structure

In [None]:
# Case-count threshold: only values above 100 are kept for each country
threshold=100
# For every country collect the values greater than the threshold, so each
# series starts where that country exceeded the threshold instead of at a
# common calendar date
empty_list=[np.array(df_analyse.loc[df_analyse[country]>threshold,country])
            for country in df_analyse.columns[1:]]
# One row per country after construction; transpose so every country becomes a column
pd_sync_timeline=pd.DataFrame(empty_list,index=df_analyse.columns[1:]).T
# Running counter 0, 1, 2, ... stored in a column named 'date'
pd_sync_timeline['date']=np.arange(pd_sync_timeline.shape[0])
pd_sync_timeline.head()

In [None]:
# Plot the synchronised timelines: every column except the last one (the counter), log axis
quick_plotting(
    pd_sync_timeline['date'],
    pd_sync_timeline.iloc[:, :-1],
    y_scale='log',
    slider=True,
)

## 6.2  Doubling Rate
*  Formula: $N(t)=N_0*2^{t/T}$
* The doubling time is the time it takes for a population to double in size/value. It is applied to population growth, inflation, resource extraction, consumption of goods, compound interest, the volume of malignant tumours, and many other things that tend to grow over time. 
<font color=red> (Source: Wikipedia) </font>

In [None]:
# define function for calculating doubling rate
def calc_doubling_rate(N_0,t,T_d):
    """Evaluate the exponential growth curve N(t) = N_0 * 2**(t / T_d).

    Parameters
    ----------
    N_0 : number
        Value at t = 0.
    t : number or array-like
        Point(s) at which the curve is evaluated.
    T_d : number
        Doubling period, in the same unit as ``t``.

    Returns
    -------
    Same shape as ``t``: the value of the curve at each ``t``.
    """
    exponent = t / T_d
    growth_factor = np.power(2, exponent)
    return N_0 * growth_factor

In [None]:
# Maximum number of days (defined here; the reference curves below use 20 points)
max_days=34
# Reference curves starting at 100 and evaluated on 20 points,
# one per doubling period (2, 4 and 10)
norm_slopes={
    label: calc_doubling_rate(100,np.arange(20),doubling_period)
    for label,doubling_period in (
        ('doubling every two days',2),
        ('doubling every 4 days',4),
        ('doubling every 10 days',10),
    )
}

In [None]:
# Place the reference doubling curves (as a DataFrame) in front of the
# synchronised country timelines, side by side
pd_sync_timeline_with_slope=pd.concat(
    [pd.DataFrame(norm_slopes),pd_sync_timeline],
    axis='columns',
)
pd_sync_timeline_with_slope.head()

In [None]:
# Plot the first five columns of the combined table (the three reference
# curves come first) on a log axis
quick_plotting(
    pd_sync_timeline_with_slope['date'],
    pd_sync_timeline_with_slope.iloc[:, :5],
    y_scale='log',
    slider=True,
)

In [None]:
# Write the combined table to disk as a semicolon-separated CSV, without the row index
pd_sync_timeline_with_slope.to_csv(
    'data/processed/COVID_small_sync_timeline_table.csv',
    sep=';',
    index=False,
)

## 6.3 Linear Regression
* In statistics, linear regression is a linear approach to modeling the relationship between a scalar response and one or more explanatory variables. The case of one explanatory variable is called simple linear regression. For more than one explanatory variable, the process is called multiple linear regression.

In [None]:
# Linear model without intercept term, fitted on the log of the US counts
reg = linear_model.LinearRegression(fit_intercept=False)
# we want to try linear regression for the US
l_vec=len(df_analyse['US'])
# The first 5 rows are left out of the fit. X holds the real row position
# (5, 6, ..., l_vec-1) instead of restarting at 0, so that predictions made
# for row positions 0..l_vec-1 line up with the rows the model was trained on.
X=np.arange(5,l_vec).reshape(-1, 1)
y=np.log(np.array(df_analyse['US'].iloc[5:]))
# fit the model
reg.fit(X,y)

In [None]:
# Calculate the prediction for every row position 0..l_vec-1 (single feature column)
X_hat=np.arange(l_vec)[:,np.newaxis]
Y_hat=reg.predict(X_hat)

In [None]:
# New frame with the date and US columns plus the regression output,
# transformed back from log space with exp
LR=df_analyse.loc[:,['date','US']].assign(prediction=np.exp(Y_hat))

In [None]:
# Plot the original US data next to the linear-regression prediction, log axis
quick_plotting(
    LR['date'],
    LR.iloc[:, 1:],
    y_scale='log',
    slider=True,
)

## 6.4 Piecewise Linear Regression
* For a relationship between a response variable (Y) and an explanatory variable (X), different linear relationships may apply for different ranges of X. A single linear model will not provide an adequate description of the relationship. Often a non-linear model will be most appropriate in this situation, but sometimes there is a clear break point demarcating two different linear relationships. Piecewise linear regression is a form of regression that allows multiple linear models to be fitted to the data for different ranges of X.

* The regression function at the breakpoint may be discontinuous, but it is possible to specify the model such that the model is continuous at all points. For such a model the two equations for Y need to be equal at the breakpoint. Non-linear least squares regression techniques can be used to fit the model to the data.

In [None]:
# Linear model that also estimates the intercept term
regression = linear_model.LinearRegression(fit_intercept=True)

In [None]:
# Read the flat table from the CSV file again; all columns after the first form the country list
df_analyse=pd.read_csv(
    'data/processed/COVID_small_flat_table.csv',
    sep=';',
)
country_list=df_analyse.columns[1:]

In [None]:
# Smooth every country series with a Savitzky-Golay filter (window of 5 samples,
# polynomial order 1) and store the result as '<country>_filter'
for each in country_list:
    df_analyse[each+'_filter']=signal.savgol_filter(
        df_analyse[each],
        window_length=5,
        polyorder=1,
    )

In [None]:
# Names of the smoothed columns used in the plots and doubling-time cells below
filter_cols=[country+'_filter' for country in ('Italy','US','Spain','Germany','Korea, South')]

In [None]:
start_pos=5
# Smoothed series from row `start_pos` onwards, log axis
# (alternative column selection noted by the author: ['US','US_filter'])
quick_plotting(
    df_analyse['date'].iloc[start_pos:],
    df_analyse[filter_cols].iloc[start_pos:],
    y_scale='log',
    slider=True,
)

In [None]:
# Preview the first rows of the frame, which now also holds the '<country>_filter' columns
df_analyse.head()

In [None]:
# creating function for calculating doubling rate via regression
def cal_doubling_time_using_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

    A straight line (with intercept) is fitted by least squares through three
    consecutive values placed at x = -1, 0, 1. The intercept is the fitted
    level at the centre point and the slope is the fitted change per step, so
    intercept/slope is the number of steps needed to add the current level
    once more.

    Parameters
    ----------
    in_array : array-like of exactly three numbers

    Returns
    -------
    float
        intercept/slope. ``inf`` (or ``nan`` for an all-zero window) when the
        fitted slope is zero.

    Raises
    ------
    AssertionError
        If ``in_array`` does not hold exactly three values.
    '''
    # Validate the window before doing any work
    assert len(in_array)==3

    y = np.asarray(in_array, dtype=float)
    x = np.arange(-1,2)

    # The fit is computed here instead of through a shared module-level model,
    # so the result cannot depend on how such a model was configured (a model
    # built with fit_intercept=False always reports an intercept of 0).
    # Because x is centred on 0, the least-squares solution reduces to:
    slope = np.dot(x, y) / np.dot(x, x)
    intercept = y.mean()

    # A flat window has slope 0; return inf/nan quietly instead of warning on every window
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(np.divide(intercept, slope))

In [None]:
# creating function for doubling time by using simply numpy package
def cal_doubling_time(in_array):
    ''' Use the classical doubling time formula, see https://en.wikipedia.org/wiki/Doubling_time

    T_d = (t_2 - t_1) * ln(2) / ln(q_2 / q_1)

    with q_1 the first and q_2 the last value of ``in_array``; the values are
    taken to be one time step apart.

    Parameters
    ----------
    in_array : array-like of numbers

    Returns
    -------
    float
        Doubling time in time steps. ``inf`` when first and last value are
        equal; not meaningful when the first value is zero.
    '''
    y = np.asarray(in_array, dtype=float)
    # n values one step apart span n-1 steps between the first and the last one
    elapsed = len(y) - 1
    # Zero or non-growing values divide by zero; return the numpy result without warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        return elapsed*np.log(2)/np.log(y[-1]/y[0])

In [None]:
# Rolling doubling time on the raw country series, stored as '<country>_DR'.
# Each value is computed from the last `days_back` observations, which gives a
# smoothing effect. The window has to stay at 3: the regression helper asserts
# that it receives exactly three values.
days_back = 3
for country in country_list:
    df_analyse[country+'_DR']=(
        df_analyse[country]
        .rolling(window=days_back,min_periods=days_back)
        .apply(cal_doubling_time_using_regression, raw=False)
    )

In [None]:
# Same rolling doubling time, now on the smoothed columns listed in `filter_cols`;
# results are stored as '<country>_filter_DR'
days_back = 3
for country in filter_cols:
    df_analyse[country+'_DR']=(
        df_analyse[country]
        .rolling(window=days_back,min_periods=days_back)
        .apply(cal_doubling_time_using_regression, raw=False)
    )

In [None]:
# Cross check the maths: rolling doubling time for Germany computed with the
# closed-form helper instead of the regression
df_analyse['Germany_DR_cross_check']=(
    df_analyse['Germany']
    .rolling(window=days_back,min_periods=days_back)
    .apply(cal_doubling_time, raw=False)
)

In [None]:
# Rolling doubling time on the smoothed columns listed in `filter_cols`.
# NOTE(review): this repeats the computation of an earlier cell and overwrites
# the same '<country>_filter_DR' columns - candidate for removal.
days_back = 3 # this gives a smoothing effect
for country in filter_cols:
    df_analyse[country+'_DR']=(
        df_analyse[country]
        .rolling(window=days_back,min_periods=days_back)
        .apply(cal_doubling_time_using_regression, raw=False)
    )

In [None]:
# Check the columns: the plots below select columns by position, so the order shown here matters
df_analyse.columns

In [None]:
start_pos=40
# Columns at positions 11 to 14 from row `start_pos` onwards, linear axis
# NOTE(review): presumably the '<country>_DR' columns of the raw series - confirm against df_analyse.columns
quick_plotting(
    df_analyse['date'].iloc[start_pos:],
    df_analyse.iloc[start_pos:, list(range(11, 15))],
    y_scale='linear',
    slider=True,
)

In [None]:
start_pos=40
# Columns at positions 16 to 19 from row `start_pos` onwards, linear axis
# (author's note: positions 12 and 17 give the US comparison)
# NOTE(review): presumably the '<country>_filter_DR' columns - confirm against df_analyse.columns
quick_plotting(
    df_analyse['date'].iloc[start_pos:],
    df_analyse.iloc[start_pos:, list(range(16, 20))],
    y_scale='linear',
    slider=True,
)