## 9.0 One-run full walkthrough
1. Do the full walk through on the large data set
2. Refactor the source code and bring it to individual scripts
3. Ensure a full run with one click

In [None]:
import os
# Set the working directory to the project root so that every relative
# 'data/...' path used by the later cells resolves and the whole notebook can
# be executed with one click. When the notebook is started from inside the
# 'notebooks' folder the project root is its parent directory, so no
# machine-specific absolute path is needed.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir(os.path.dirname(os.getcwd()))

# Displayed as the cell output: name of the directory now used as base path.
'Your base path for this project is: '+os.path.split(os.getcwd())[-1]

## 9.1 Update all datasets

In [None]:
# %load src/data/get_rawdata_from_github.py
#import require packages
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import json
import subprocess

#define a function to gather data from johns hopkins by doing git_pull
def get_johns_hopkins_data():
    """Run ``git pull`` inside the local Johns Hopkins data checkout.

    The command is executed through the shell with ``data/raw/COVID-19`` as
    working directory. Whatever git writes to stderr and stdout is printed;
    nothing is returned.
    """
    repository_dir = os.path.dirname('data/raw/COVID-19/')
    process = subprocess.Popen(
        'git pull',
        cwd=repository_dir,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # communicate() waits for the process to end and returns (stdout, stderr).
    captured_out, captured_err = process.communicate()
    print("Error : " + str(captured_err))
    print("out : " + str(captured_out))

# define a function to gather data for Germany only from the RKI website: just an example, this data is not used in the project
def get_germany_data():
    """Download the RKI feature service data and store it as a CSV file.

    The JSON answer is expected to carry a ``features`` list whose entries
    each hold an ``attributes`` mapping; these mappings become the rows of
    the table written to ``data/raw/NPGEO/Germany_statewise_data.csv``.
    The number of stored rows is printed.
    """
    response = requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')
    payload = json.loads(response.content)

    # One table row per feature, built from its 'attributes' mapping.
    rows = [feature['attributes'] for feature in payload['features']]
    table = pd.DataFrame(rows)

    table.to_csv('data/raw/NPGEO/Germany_statewise_data.csv', sep=';')
    print(' Number of rows data stored (regionwise): ' + str(len(table)))

# Entry point: refresh the Johns Hopkins checkout first, then download the
# RKI data for Germany.
if __name__ == '__main__':
    get_johns_hopkins_data()
    get_germany_data()


## 9.2 Transform the Johns Hopkins dataset into a relational dataset

In [None]:
# %load src\data\preprocess_on_JH_data.py
#importing required packages
import pandas as pd
import numpy as np
from datetime import datetime

# define function for store relational dataframe of Johns Hopkins data
def store_relational_datafrmae_for_JH_data():
    """Reshape the wide Johns Hopkins time series into a relational table.

    Reads the global confirmed-cases CSV, replaces missing state names with
    ``'no'``, drops the coordinate columns and turns the per-date columns into
    rows, giving one row per (date, state, country) with a ``confirmed``
    value. The result is written to
    ``data/processed/COVID_relational_confirmed.csv`` and the number of
    stored rows is printed.
    """
    source_csv = 'data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    wide = pd.read_csv(source_csv)

    wide = wide.rename(columns={'Country/Region': 'country', 'Province/State': 'state'})
    wide['state'] = wide['state'].fillna('no')
    wide = wide.drop(columns=['Lat', 'Long'])

    # Dates become the row index after transposing; stacking both column
    # levels then yields one row per date/state/country combination.
    by_date = wide.set_index(['state', 'country']).T
    stacked = by_date.stack(level=[0, 1]).reset_index()
    relational = stacked.rename(columns={'level_0': 'date', 0: 'confirmed'})

    relational['date'] = relational.date.astype('datetime64[ns]')
    relational.to_csv('data/processed/COVID_relational_confirmed.csv', sep=';', index=False)
    print(' Total number of stored rows are: ' + str(len(relational)))

# Entry point: rebuild the relational confirmed-cases table.
if __name__ == '__main__':
    store_relational_datafrmae_for_JH_data()


## 9.3 Calculation of Filter and Doubling Rate 

In [None]:
# %load src\features\build_features_for_fitler_&_doubling_rate.py
#importing required packageas
import numpy as np
from sklearn import linear_model
# Shared linear regression model (with intercept); it is re-fitted on every
# call of calculate_doubling_time_via_regression.
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd
from scipy import signal

# define helper function to calculate doubling rate via regression
def calculate_doubling_time_via_regression(in_array):
    """Fit a line through three values and return intercept / slope.

    The three values are regressed against the fixed positions -1, 0 and 1
    using the module-level ``reg`` model. ``in_array`` must hold exactly
    three entries (checked with an assert). The quotient is returned as
    computed from ``reg.intercept_`` and ``reg.coef_``.
    """
    targets = np.array(in_array)
    # Single-feature design matrix with the positions -1, 0, 1 as rows.
    positions = np.arange(-1, 2).reshape(-1, 1)
    assert len(in_array) == 3
    reg.fit(positions, targets)
    return reg.intercept_ / reg.coef_

# define helper function to calculate savgol_filter
def savgol_filter(df_input,column='confirmed',window=5):
    """Return a copy of ``df_input`` with a Savitzky-Golay smoothed column.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame that contains the column to smooth. It is not modified.
    column : str
        Name of the column to smooth.
    window : int
        Window length handed to ``scipy.signal.savgol_filter``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_input`` with the extra column ``<column>_filtered``.
    """
    degree = 1  # polynomial order of the local fit
    # Work on a copy so the caller's frame does not silently gain a column.
    df_result = df_input.copy()
    # Missing values are treated as 0 before filtering.
    filter_in = df_input[column].fillna(0)
    result = signal.savgol_filter(np.array(filter_in), window, degree)
    df_result[str(column + '_filtered')] = result
    return df_result

def rolling_reg_func(df_input,col='confirmed'):
    """Apply the regression helper to every rolling 3-row window of ``col``.

    Windows with fewer than three rows produce no value. Returns the
    resulting series.
    """
    window_size = 3
    windows = df_input[col].rolling(window=window_size, min_periods=window_size)
    return windows.apply(calculate_doubling_time_via_regression, raw=False)

# define helper function to get merged DataFrame
def calc_filtered_data(df_input,filter_on='confirmed'):
    """Add a smoothed version of ``filter_on`` to a copy of ``df_input``.

    The smoothing is done per (state, country) group with ``savgol_filter``
    and merged back on the row index.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns 'state', 'country' and ``filter_on``.
    filter_on : str
        Column to smooth; the new column is named ``<filter_on>_filtered``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_input`` extended by the filtered column.
    """
    must_contain = {'state', 'country', filter_on}
    assert must_contain.issubset(set(df_input.columns)), ' Error in calc_filtered_data not all columns in data frame'
    # Work on a copy so the caller's frame is never overwritten.
    df_output = df_input.copy()
    filtered_column = filter_on + '_filtered'

    # group_keys=False keeps the original row index on the result, which the
    # index-based merge below depends on. The column name is forwarded so
    # that values of filter_on other than 'confirmed' are really filtered.
    df_filtered_result = (
        df_output[['state', 'country', filter_on]]
        .groupby(['state', 'country'], group_keys=False)
        .apply(savgol_filter, column=filter_on)
    )
    df_output = pd.merge(df_output, df_filtered_result[[filtered_column]],
                         left_index=True, right_index=True, how='left')
    return df_output

# define a function for calculation of doubling rate
def calc_doubling_rate(df_input,filter_on='confirmed'):
    """Add the rolling regression result of ``filter_on`` as a new column.

    ``rolling_reg_func`` is applied per (state, country) group; its output
    is merged back onto ``df_input`` as ``<filter_on>_DR``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns 'state', 'country' and ``filter_on``.
    filter_on : str
        Column the rolling regression is computed on.

    Returns
    -------
    pandas.DataFrame
        ``df_input`` extended by the column ``<filter_on>_DR``.
    """
    must_contain = {'state', 'country', filter_on}
    assert must_contain.issubset(set(df_input.columns)), ' Error in calc_doubling_rate not all columns in data frame'
    rate_column = filter_on + '_DR'

    df_doubling_rate = df_input.groupby(['state','country']).apply(rolling_reg_func, filter_on).reset_index()
    # 'level_2' is the unnamed third index level after reset_index;
    # presumably the row index of df_input - verify if the index is named.
    df_doubling_rate = df_doubling_rate.rename(columns={filter_on: rate_column,
                                                        'level_2': 'index'})
    # Merge on the index of the big table and the index column recovered
    # from the group-by result.
    df_output = pd.merge(df_input, df_doubling_rate[['index', rate_column]],
                         left_index=True, right_on=['index'], how='left')
    df_output = df_output.drop(columns=['index'])
    return df_output

# Entry point: quick check of the regression helper, then compute the filtered
# series and both doubling-rate columns for the full data set.
if __name__ == '__main__':
    test_data_reg=np.array([2,4,6])
    result=calculate_doubling_time_via_regression(test_data_reg)
    print('The slope of regression plot is: '+str(result))

    df_JH_data=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    df_JH_data=df_JH_data.sort_values('date',ascending=True).copy()

    df_result_large=calc_filtered_data(df_JH_data)
    df_result_large=calc_doubling_rate(df_result_large)
    df_result_large=calc_doubling_rate(df_result_large,'confirmed_filtered')

    # Blank out the filtered doubling rate wherever confirmed is not above 100.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    mask_threshold=df_result_large['confirmed']>100
    df_result_large['confirmed_filtered_DR']=df_result_large['confirmed_filtered_DR'].where(mask_threshold, other=np.nan)
    df_result_large.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print('This is it, Mate!!! --Calculation is done')


## 9.4 Visual Board_1
* Visual board_1 shows the plots for one or multiple selected countries. In addition, you have to select one of the options: timeline confirmed, timeline confirmed filtered, timeline doubling rate or timeline doubling rate filtered.

In [None]:
# %load src\visualization\Dashboard_1.py
#importing required packages
import pandas as pd
import numpy as np
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State
import plotly.graph_objects as go
print('Your current dash board version is:' + dash.__version__)

# Load the processed data set (written with ';' as separator) that feeds
# every plot of this dashboard.
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')

# Empty placeholder figure shown until the callback delivers the first plot.
fig = go.Figure()
# Dash application; the layout below defines all visible components.
app = dash.Dash()
app.layout = html.Div([

    # Introductory text explaining the dashboard and its four plot options.
    dcc.Markdown('''
    #  Data Science Project @ TU_KL on COVID-19 Dataset-Part 1

    * Goal of the project is to learn data science by applying a cross-industry standard process. The default layout
    contains the confirmed infected cases in the log-scale format for options (1 & 2); Approximated doubling rate
    over 3 days for options (3 & 4) on the Y-axis and Timeline in Days on the X-axis.

    ### The first dropdown menu enables selection of one or multiple  countries for visualization. The seconds dropdown menu contains four options:
        1. The ‘Timeline Confirmed’ represents confirmed infected cases along the timeline.
        2. The ‘Timeline Confirmed Filtered’ represents filtered (after applying sav-gol filter) confirmed infected cases along the timeline.
        3. The ‘Timeline Doubling Rate’ represents calculated doubling rate on the infected cases along the timeline from the 1st option.
        4. The ‘Timeline Doubling Rate Filtered’ represents calculated doubling rate on the infected cases along the timeline from the 2nd option.

    '''),

    dcc.Markdown('''
    ## Select the Country for visualization
    '''),

    # Multi-select dropdown offering every country found in the data set.
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['Germany','Italy'], # which are pre-selected in default layout
        multi=True
    ),

    dcc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    # Single-select dropdown; each value is the name of the data column that
    # the callback plots on the Y-axis.
    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',multi=False),dcc.Graph(figure=fig, id='main_window_slope')])

@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure_layout(country_list,show_doubling):
    """Build the figure for the selected countries and plot option.

    Parameters
    ----------
    country_list : list of str or None
        Countries chosen in the first dropdown; an empty or missing
        selection yields a figure without traces.
    show_doubling : str
        Column name chosen in the second dropdown ('confirmed',
        'confirmed_filtered', 'confirmed_DR' or 'confirmed_filtered_DR').

    Returns
    -------
    dict
        Figure description with the keys 'data' and 'layout'.
    """
    is_doubling_rate = 'DR' in show_doubling
    if is_doubling_rate:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
          }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (From johns hopkins csse, log-scale)'}

    # Case counts of the states of one country add up, whereas doubling rates
    # have to be averaged. The previous comparison against
    # 'doubling_rate_filtered' matched no dropdown value, so doubling rates
    # were summed as well.
    aggregation = 'mean' if is_doubling_rate else 'sum'
    # Only numeric columns are aggregated; the text column 'state' is left out.
    value_columns = ['confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR']

    traces = []
    for each in country_list or []:
        df_plot=df_input_large[df_input_large['country']==each]
        df_plot=(df_plot[['country','date']+value_columns]
                 .groupby(['country','date'])
                 .agg(aggregation)
                 .reset_index())

        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',opacity=1.0,name=each))

    return {
            'data': traces,
            'layout': dict (
                width=1000,height=650,
                xaxis={'title':'Timeline in the days','tickangle':-45,'nticks':20,
                        'tickfont':dict(size=14,color="#0c6887"),},yaxis=my_yaxis)}

# Entry point: start the Dash server in debug mode with the reloader disabled.
if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)


## 9.5 Calculation of optimized SIR parameters and fitted values
* Here the optimized parameters beta and gamma are calculated, together with the fitted values used to plot the simulated curve against the original data

In [None]:
# %load src/models/SIR_Calculation.py
# importing required python packages
import pandas as pd
import numpy as np
from datetime import datetime
from scipy import optimize
from scipy import integrate

#importing data frame
data_raw = pd.read_csv('data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
country_list = data_raw['Country/Region'].unique() #making country_list
date = data_raw.columns[4:]  # every column from the fifth on is one date

# Convert data_raw into the format used by the SIR algorithm: one row per
# date, one column per country, with the rows of a country summed up. All
# columns are collected first so the frame is built in a single step instead
# of by one insertion per country.
country_columns = {
    each: np.array(data_raw[data_raw['Country/Region'] == each].iloc[:,4::].sum(axis=0))
    for each in country_list
}
df_dhameli = pd.DataFrame({'Date': date, **country_columns})
df_dhameli.to_csv('data/processed/SIR.csv', sep = ';', index=False)

df_analyse=pd.read_csv('data/processed/SIR.csv',sep=';')

# Initialize parameters
N0 = 1000000
beta = 0.4
gamma = 0.1
I0=df_analyse.Germany[35]
S0=N0-I0
R0=0

# Skip the first 35 rows. The start of the outbreak differs per country; the
# fitting loop handles that by using positive values only. The explicit copy
# makes df_data an independent frame, so adding the fitted columns later does
# not write into a slice of df_analyse.
df_data = df_analyse[35:].copy()
t = np.arange(df_data.shape[0])

# defining SIR function
def cal_SIR_model_t(SIR, t, beta, gamma):
    """Return the time derivatives (dS/dt, dI/dt, dR/dt) of the SIR state.

    ``SIR`` is unpacked into S, I and R; ``t`` is accepted but not used in
    the computation. The module-level ``N0`` is the divisor of the
    infection term.
    """
    S, I, R = SIR
    infection_flow = beta*I*S/N0   # leaves S, enters I
    recovery_flow = gamma*I        # leaves I, enters R
    return -infection_flow, infection_flow - recovery_flow, recovery_flow

# defining fit_odeint_func function for optimize parameters
def fit_odeint_func(x, beta, gamma):
    """Integrate the SIR model over ``x`` and return the infected curve.

    The start state is taken from the module-level ``S0``, ``I0`` and ``R0``
    at call time; the second component of the integrated state is returned.
    """
    start_state = (S0, I0, R0)
    trajectory = integrate.odeint(cal_SIR_model_t, start_state, x, args=(beta, gamma))
    return trajectory[:, 1]

#calculating optimize parameters for every country
fitted_columns = {}
n_rows = df_data.shape[0]
for country in df_data.columns[1:]:
    # Use only values greater than zero, so every country starts at its own
    # first reported case.
    ydata = np.array(df_data[df_data[country]>0][country])
    # Rows without a fitted value stay at zero.
    padded = np.zeros(n_rows)
    # Two parameters are fitted, so at least two data points are required;
    # otherwise the column stays at zero instead of aborting the whole run.
    if len(ydata) >= 2:
        t = np.arange(len(ydata))
        # fit_odeint_func reads these module-level start values.
        I0 = ydata[0]
        S0 = N0 - I0
        R0 = 0
        # NOTE(review): no p0 is passed, so curve_fit starts from its default
        # initial guess - confirm whether [0.4, 0.1] was meant as p0.
        popt, _ = optimize.curve_fit(fit_odeint_func, t, ydata, maxfev=5000)
        val_fitted = fit_odeint_func(t, *popt)
        # Align the fitted values with the end of the table.
        padded[n_rows-len(val_fitted):] = val_fitted
    fitted_columns[country + '_fitted'] = padded

# Attach all fitted columns in one step instead of one assignment per country.
df_data = pd.concat([df_data, pd.DataFrame(fitted_columns, index=df_data.index)], axis=1)
df_data = df_data.reset_index(drop=True)
#save CSV file to local drive for future use
df_data.to_csv('data/processed/SIR_fitted.csv', sep = ';')


## 9.6 Visual board_2
* Visual board_2 shows the real data and the simulated SIR curve for one or multiple countries, according to the user's selection.

In [None]:
# %load src/visualization/Dashboard_2.py
# importing required python packages
import pandas as pd
import numpy as np
import random
from datetime import datetime
import plotly.graph_objects as go
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State
print('Your current dash board version is:' + dash.__version__)

# Load the table holding the real and the fitted curve of every country.
df_analyse=pd.read_csv('data/processed/SIR_fitted.csv',sep=';')
df_data = df_analyse.reset_index(drop = True)

# A country is selectable only if its '<country>_fitted' column exists. This
# keeps helper columns such as 'Date' out of the dropdown and does not depend
# on the number of countries in the file.
selectable_countries = [each for each in df_data.columns
                        if each + '_fitted' in df_data.columns]

# Random colours, so that both curves of one country share a colour; the
# colours change whenever this list is rebuilt.
color_list = ['#%02x%02x%02x'%(random.randint(0,255),random.randint(0,255),random.randint(0,255))
              for _ in range(200)]

# creating dashboard app containing the plots for the whole dataset
fig = go.Figure()
app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''

    #  Data Science Project @ TU_KL on COVID-19 Dataset-Part 2
    ## Real and simulated number of infected people

    * The default layout contains the confirmed infected cases in the log-scale format on the Y-axis
    and Timeline in Days on the X-axis.
    ### The dropdown menu enables selection of one or multiple countries for visualization.

    * This dashboard plots two curves for each country:

    1. The first curve represents the confirmed infected cases along the timeline.
    2. The second curve represents the simulated infected cases after applying the SIR model along the timeline.

    '''),

    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in selectable_countries],
        value=['Germany','France'], # which are pre-selected
        multi=True),dcc.Graph(figure=fig, id='main_window_slope')])

@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value')])
def update_figure_layout(country_list):
    """Build the figure with the real and the simulated curve per country.

    Parameters
    ----------
    country_list : list of str or None
        Countries chosen in the dropdown; an empty or missing selection
        yields a figure without traces.

    Returns
    -------
    dict
        Figure description with the keys 'data' and 'layout'.
    """
    my_yaxis={'type':"log",'title':'Confirmed infected people (From johns hopkins csse, log-scale)'}
    traces = []
    for position, each in enumerate(country_list or []):
        # Both curves of a country share one colour; the modulo keeps the
        # lookup valid if more countries are selected than colours exist.
        colour = color_list[position % len(color_list)]
        # Real data as a plain line ('lines' is the valid scatter mode).
        traces.append(dict(x=df_data['Date'],y=df_data[each],
                                mode='lines', line = dict(color = colour), opacity=1.0,name=each))
        # Simulated curve with markers on top of the line.
        traces.append(dict(x=df_data['Date'],
                                y=df_data[each+'_fitted'],
                                mode='markers+lines',line = dict(color=colour), opacity=1.0,name=each+'_simulated'))

    return {
            'data': traces,
            'layout': dict (
                width=1000,height=650,
                xaxis={'title':'Timeline in Days','tickangle':-45,'nticks':20,
                'tickfont':dict(size=14,color="#0c6887"),},yaxis=my_yaxis)}

# Entry point: start the Dash server in debug mode with the reloader disabled.
if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)
