# One Run Full Walkthrough

- Do the full walkthrough on the large data set
- Refactor the source code and move it into individual scripts
- Ensure a full run with one click

In [None]:
# Check the working directory.
# Depending on where the notebook is launched, relative paths might not work,
# so the notebook server should be started from the base path.
# When the notebook is opened directly, the working directory is typically
# ../ads_covid-19/notebooks; in that case step one level up.
import os

if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir("../")

# Bare expression: displayed as the cell output.
'Your base path is at: ' + os.path.basename(os.getcwd())

# 1 Update All Data

In [None]:
# %load E:/ads_covid-19/IDS_covid-19/src/data/get_data.py
import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json


def get_johns_hopkins(repo_path='E:/ads_covid-19/IDS_covid-19/data/raw/COVID-19/'):
    """Update the local Johns Hopkins COVID-19 git clone with ``git pull``.

    Parameters
    ----------
    repo_path : str, optional
        Directory of the local clone. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    None
        The captured stderr and stdout of git are printed (as bytes reprs).
        A non-zero git exit code does not raise.
    """
    # Argument list instead of a shell string: no shell is spawned.
    git_pull = subprocess.run(
        ['git', 'pull'],
        # normpath accepts the path with or without a trailing slash
        cwd=os.path.normpath(repo_path),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    print("Error : " + str(git_pull.stderr))
    print("out : " + str(git_pull.stdout))


def get_current_data_germany():
    """Download the current German federal-state case data and store it as CSV.

    The ArcGIS feature service is queried as JSON, the ``attributes`` dict of
    every entry in ``features`` becomes one row, and the table is written to
    ``data/raw/NPGEO/GER_state_data.csv`` (``;`` separated). The number of
    rows is printed.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out or the server answers with an HTTP error
        status.
    """
    # Timeout so a stalled connection cannot hang the whole pipeline run.
    data = requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                        timeout=30)
    # Fail on an HTTP error here rather than later while parsing the body.
    data.raise_for_status()

    json_object = json.loads(data.content)
    full_list = [each_dir['attributes'] for each_dir in json_object['features']]

    pd_full_list = pd.DataFrame(full_list)
    pd_full_list.to_csv('E:/ads_covid-19/IDS_covid-19/data/raw/NPGEO/GER_state_data.csv' , sep=';')
    print('Number of regions rows: '+str(pd_full_list.shape[0]))

# Script entry point: refresh both raw data sources.
if __name__ == '__main__':
    get_johns_hopkins()
    get_current_data_germany()


# 2 Process Pipeline 

In [None]:
# %load E:/ads_covid-19/IDS_covid-19/src/data/process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime

def store_relational_JH_data(
        data_path="E:/ads_covid-19/IDS_covid-19/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
        output_path='E:/ads_covid-19/IDS_covid-19/data/processed/COVID_relational_confirmed.csv'):
    """Convert the wide Johns Hopkins time series into a relational (long) table.

    The raw file has one row per state/country and one column per date. The
    result has one row per (date, state, country) with the columns
    ``date``, ``state``, ``country`` and ``confirmed`` and is written as a
    ``;`` separated CSV without the row index.

    Parameters
    ----------
    data_path : str, optional
        Raw CSV with the columns ``Province/State``, ``Country/Region``,
        ``Lat``, ``Long`` and one column per date.
    output_path : str, optional
        Destination of the relational CSV.

    Both parameters default to the locations that were previously hard-coded.
    """
    pd_raw = pd.read_csv(data_path)

    pd_data_base = pd_raw.rename(columns={'Country/Region': 'country',
                                          'Province/State': 'state'})
    # A missing state is replaced by the placeholder 'no'.
    pd_data_base['state'] = pd_data_base['state'].fillna('no')
    pd_data_base = pd_data_base.drop(['Lat', 'Long'], axis=1)

    # Transpose so the dates become the row index, then move both column
    # levels (state, country) into the rows.
    pd_relational_model = (
        pd_data_base.set_index(['state', 'country'])
        .T
        .stack(level=[0, 1])
        .reset_index()
        .rename(columns={'level_0': 'date', 0: 'confirmed'})
    )
    pd_relational_model['date'] = pd_relational_model['date'].astype('datetime64[ns]')
    pd_relational_model.to_csv(output_path, sep=';', index=False)
    print('Number of raws stored: '+str(pd_relational_model.shape[0]))

# Script entry point: build the relational confirmed-cases file.
if __name__ == '__main__':
    store_relational_JH_data()


# 3 Filter and Doubling Rate Calculation

In [None]:
# %load E:/ads_covid-19/IDS_covid-19/src/features/build_features.py
import numpy as np
import pandas as pd
from sklearn import linear_model
# Module-level linear regression model (with intercept); it is re-fitted on
# every call of get_doubling_time_via_regression.
reg = linear_model.LinearRegression(fit_intercept = True)
from scipy import signal

def get_doubling_time_via_regression(in_array):
    """Approximate the doubling time from three consecutive values.

    A straight line is fitted through the three values, placed at
    x = -1, 0, 1, with the module-level ``reg`` model. The returned value is
    ``intercept / slope`` of that line.

    Parameters
    ----------
    in_array : array-like
        Exactly three numeric values.

    Returns
    -------
    float
        ``intercept / slope`` as a scalar, so the function can be used
        directly in ``rolling(...).apply``, which expects a single value.

    Raises
    ------
    AssertionError
        If ``in_array`` does not contain exactly three values.
    """
    assert len(in_array) == 3

    y = np.array(in_array)
    X = np.arange(-1, 2).reshape(-1, 1)

    reg.fit(X, y)
    intercept = reg.intercept_
    # coef_ holds one coefficient per feature; X has a single column, so take
    # that one value instead of returning a 1-element array.
    slope = reg.coef_[0]

    return intercept / slope

def savgol_filter(df_input, column='confirmed', window=5):
    """Savitzky-Golay filter that can be used in a groupby apply function.

    The data structure is kept: the result is a copy of ``df_input`` with one
    additional column ``<column>_filtered``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing ``column``.
    column : str, optional
        Name of the column to filter.
    window : int, optional
        Window size passed to ``scipy.signal.savgol_filter``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_input`` with the filtered column added. The input frame
        itself is not modified.
    """
    degree = 1  # polynomial order of the filter

    # Work on a copy: changing the frame handed in by groupby.apply is not
    # supported by pandas.
    df_result = df_input.copy()

    filter_in = df_input[column].fillna(0)  # attention with the neutral element here

    result = signal.savgol_filter(np.array(filter_in),
                                  window,  # window size used for filtering
                                  degree)
    df_result[column + '_filtered'] = result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    """Apply get_doubling_time_via_regression over a rolling three-row window.

    The input has to be a data frame; the return value is a single series
    (mandatory for group by apply). Rows without a complete window stay NaN.
    """
    days_back = 3
    windowed = df_input[col].rolling(window=days_back, min_periods=days_back)
    return windowed.apply(get_doubling_time_via_regression, raw=False)

def calc_filtered_data(df_input, filter_on = 'confirmed'):
    '''
        Calculates savgol filter and returns merged DataFrame

        The filter is applied separately to every (state, country) group of
        the column filter_on; the result is attached as the column
        filter_on + '_filtered'.

        Parameters:
            df_input : DataFrame with the columns 'state', 'country' and
                       filter_on
            filter_on : name of the column to filter

        Returns:
            DataFrame with all rows and columns of df_input plus the
            filtered column, aligned on the row labels of df_input
    '''
    must_contain = set(['state', 'country', filter_on])
    assert must_contain.issubset(set(df_input.columns))

    filtered_column = filter_on + '_filtered'

    # group_keys=False keeps the original row labels as the result index,
    # whatever the default of the installed pandas version is. filter_on is
    # forwarded so the selected column is the one that is filtered.
    pd_filtered_result = (
        df_input.groupby(['state', 'country'], group_keys=False)[[filter_on]]
        .apply(savgol_filter, filter_on)
    )

    # Align on the row labels themselves, not on a separate 'index' column.
    df_output = pd.merge(df_input, pd_filtered_result[[filtered_column]],
                         left_index=True, right_index=True, how='left')

    return df_output

def calc_doubling_rate(df_input, filter_on = 'confirmed'):
    '''
        Calculates approximated doubling rate and returns merged DataFrame

        rolling_reg is applied separately to every (state, country) group of
        the column filter_on; the result is attached as the column
        filter_on + '_DR'.

        Parameters:
            df_input : DataFrame with the columns 'state', 'country' and
                       filter_on
            filter_on : name of the column the doubling rate is based on

        Returns:
            DataFrame with all rows and columns of df_input plus the
            doubling rate column, aligned on the row labels of df_input
    '''
    must_contain = set(['state', 'country', filter_on])
    assert must_contain.issubset(set(df_input.columns))

    dr_column = filter_on + '_DR'

    # Every group returns a one-column frame that keeps its row labels, so
    # the combined result can be aligned on the index without relying on an
    # auto-generated level name.
    pd_DR_result = (
        df_input.groupby(['state', 'country'], group_keys=False)[[filter_on]]
        .apply(lambda group: rolling_reg(group, filter_on).to_frame(dr_column))
    )

    df_output = pd.merge(df_input, pd_DR_result[[dr_column]],
                         left_index=True, right_index=True, how='left')
    return df_output


# Script entry point: smoke test of the regression helper, then the feature
# pipeline (filtering and doubling rates) on the relational data set.
if __name__ == '__main__':
    # Smoke test on three sample values.
    test_data = np.array([2,4,6])
    result = get_doubling_time_via_regression(test_data)
    print('The test slope is: '+str(result))

    # Column 0 (the date) is parsed as datetime.
    pd_JH_data=pd.read_csv('E:/ads_covid-19/IDS_covid-19/data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    # reset_index() stores the previous row labels in a column named 'index'.
    pd_JH_data = pd_JH_data.sort_values('date',ascending=True).reset_index().copy()

    # Adds confirmed_filtered, confirmed_DR and confirmed_filtered_DR.
    pd_result_large = calc_filtered_data(pd_JH_data)
    pd_result_large = calc_doubling_rate(pd_result_large)
    pd_result_large = calc_doubling_rate(pd_result_large, 'confirmed_filtered')
    # NOTE(review): the result is only printed; nothing in this block writes
    # it to disk. Confirm where data/processed/COVID_final_set.csv (read by
    # the visual board) is produced.
    print(pd_result_large.head())


# 4 Visual Board

In [None]:
# %load E:/ads_covid-19/IDS_covid-19/src/visualization/visualize.py
import numpy as np
import pandas as pd
import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
# Show the working directory, then load the data set the board displays.
print(os.getcwd())
df_input_large=pd.read_csv('E:/ads_covid-19/IDS_covid-19/data/processed/COVID_final_set.csv',sep=';')


# Empty placeholder figure; the callback replaces it.
fig = go.Figure()

app = dash.Dash()
# Page layout: intro text, country selector, measure selector and the graph.
app.layout = html.Div([

    dcc.Markdown('''
    #  Applied Data Science on COVID-19 Data - Part 1

    * Goal of the project is to learn data science by applying a cross-industry standard process.\
        The default layout contains the confirmed infected cases in the log-scale format on the Y-axis\
        and Timeline in Days on the X-axis.

    ### The first dropdown menu enables selection of one or multiple  countries for visualization.
    ### The seconds dropdown menu contains four option:
        1. The ‘Timeline Confirmed’ represents confirmed infected cases along the timeline.
        2. The ‘Timeline Confirmed Filtered’ represents filtered(savgol filter) confirmed infected cases along the timeline.
        3. The ‘Timeline Doubling Rate’ represents the doubling rate of the infected cases along the timeline.
        4. The ‘Timeline Doubling Rate Filtered’ represents the doubling rate of the filtered(savgol filter) infected cases along the timeline.

    '''),

    dcc.Markdown('''
    ### Select country below:
    '''),


    # Multi-select over every distinct value of the 'country' column.
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['US', 'Germany','Italy'], # which are pre-selected
        multi=True
    ),

    dcc.Markdown('''
        ### Select timeline of confirmed COVID-19 cases or the approximated doubling time:
        '''),


    # Single-select; each value is the name of the data column to plot.
    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',
    multi=False
    ),

    # Target of the update_figure callback.
    dcc.Graph(figure=fig, id='main_window_slope')
])



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):
    """Build the figure for the selected countries and the selected measure.

    Parameters
    ----------
    country_list : list of str or None
        Value of the country dropdown.
    show_doubling : str or None
        Value of the second dropdown, i.e. the name of the column to plot:
        'confirmed', 'confirmed_filtered', 'confirmed_DR' or
        'confirmed_filtered_DR'.

    Returns
    -------
    dict
        Figure with one trace per country under 'data' and the axis setup
        under 'layout'. Without a selection the figure has no traces.
    """
    # The checks use the column names that the dropdown actually delivers.
    doubling_rate_columns = ('confirmed_DR', 'confirmed_filtered_DR')

    if show_doubling in doubling_rate_columns:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }

    # States of one country are combined per date: mean for the filtered
    # doubling rate, sum for everything else.
    # NOTE(review): the unfiltered doubling rate is summed over the states as
    # well; confirm whether a mean is wanted there too.
    if show_doubling == 'confirmed_filtered_DR':
        aggregation = 'mean'
    else:
        aggregation = 'sum'

    traces = []
    # A cleared dropdown may deliver None; in that case no trace is drawn.
    selected_countries = (country_list or []) if show_doubling else []
    for each in selected_countries:

        df_plot=df_input_large[df_input_large['country']==each]

        # Only the plotted column is aggregated, so the string column
        # 'state' never reaches sum/mean.
        df_plot=df_plot.groupby(['country','date'])[show_doubling].agg(aggregation).reset_index()

        traces.append(dict(x=df_plot['date'],
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }

# Script entry point: start the Dash server in debug mode without the reloader.
if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)


# 5 SIR Calculation

In [None]:
# %load E:/ads_covid-19/IDS_covid-19/src/models/SIR.py
import pandas as pd
import numpy as np
from datetime import datetime
from scipy import optimize
from scipy import integrate

def SIR_model_t(SIR, t, beta, gamma):
    """Return the time derivatives (dS/dt, dI/dt, dR/dt) of the SIR model.

    SIR is the current state (S, I, R); beta and gamma are the two model
    rates. t is accepted for the ODE solver interface but not used. The
    module-level N0 is the divisor of the infection term.
    """
    susceptible, infected, _recovered = SIR
    new_infections = beta*infected*susceptible/N0
    recoveries = gamma*infected

    return -new_infections, new_infections - recoveries, recoveries

def fit_odeint(x, beta, gamma):
    """Integrate the SIR model over the time points x and return the infected curve.

    The integration starts from the module-level state (S0, I0, R0); column 1
    of the solution (the I component) is returned.
    """
    initial_state = (S0, I0, R0)
    solution = integrate.odeint(SIR_model_t, initial_state, x, args=(beta, gamma))
    return solution[:, 1]


# Script entry point: fit the SIR model to every country column and store the
# measured and the fitted curves side by side.
if __name__ == '__main__':

    df_analyse=pd.read_csv('E:/ads_covid-19/IDS_covid-19/data/raw/COVID-19/csse_covid_19_data/SIR_raw.csv',sep=';')

    # Rows are used in file order; no sorting is applied.
    # NOTE(review): confirm SIR_raw.csv is stored chronologically, the
    # positional slice below relies on it.
    N0 = 1000000  # total population used by the model (S0 = N0 - I0)
    # Consider data from the 35th day on, which is 26th Feb 2020. Explicit
    # copy, because columns are added to this frame below.
    df_data = df_analyse[35:].copy()

    # columns[1:] is evaluated once, so the '_fitted' columns added inside the
    # loop are not iterated.
    for country in df_data.columns[1:]:
        ydata = np.array(df_data[df_data[country]>0][country])

        if len(ydata) == 0:
            # No positive value to start from: keep an all-zero curve so that
            # every country still gets its '_fitted' column.
            df_data[country + '_fitted'] = np.zeros(df_data.shape[0])
            continue

        t = np.arange(len(ydata))
        # Initial state read by fit_odeint through the module-level names.
        I0=ydata[0]
        S0=N0-I0
        R0=0
        # NOTE(review): curve_fit starts from its default initial guess here;
        # consider passing p0 if fits turn out unstable.
        popt, pcov = optimize.curve_fit(fit_odeint, t, ydata, maxfev=5000)
        fitted=fit_odeint(t, *popt)
        # Pad with leading zeros to make the dimensions equal.
        f_padded = np.concatenate((np.zeros(df_data.shape[0]-len(fitted)) ,fitted))
        df_data[country + '_fitted'] = f_padded
    df_data.to_csv("E:/ads_covid-19/IDS_covid-19/data/processed/SIR_calculated.csv", sep = ';', index=False)


# 6 SIR Visual Board

In [None]:
# %load E:/ads_covid-19/IDS_covid-19/src/visualization/SIR_visualize.py
import numpy as np
import pandas as pd
import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State
import random
import plotly.graph_objects as go


# Data produced by the SIR calculation step.
df_dash = pd.read_csv('E:/ads_covid-19/IDS_covid-19/data/processed/SIR_calculated.csv',sep=';')

# Random hex colours; their number is half of the column count without the
# first column.
color_list = [
    '#%02x%02x%02x' % (random.randint(0, 255),random.randint(0, 255), random.randint(0, 255))
    for _ in range(int((df_dash.shape[1]-1)/2))
]

# Colour theme of the page.
colors = {
    'background': '#111111',
    'text': '#7FDBFF'
}

# Empty placeholder figure; the callback replaces it.
fig = go.Figure()
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(external_stylesheets=external_stylesheets)

# Page layout: title, description panel, country selector and the graph.
app.layout = html.Div(style={'backgroundColor': colors['background'],}, children = [

    # Page title.
    dcc.Markdown('''
        #  Data Science Project @ TU_KL on COVID-19 Data - Part 2
        ''',
        style={
            "border":"2px silver solid",
            'textAlign': 'center',
            'color': colors['text']
    }),

    # Description panel, fixed on the left side.
    dcc.Markdown('''
        ##  Plot shows actual number of infected people and simulated number of infected people\
            derived from SIR model for different countries.
        ''',
        style = {
            "border":"2px silver solid",
            'backgroundColor': colors['background'],
            'position' : 'fixed',
            'left' : 7,
            'top' : 83,
            'width' : 500,
            'height' : 1161,
            'textAlign': 'left',
            'color': colors['text']

    }),

    dcc.Markdown('''
        ### Select the country below:
        ''',
        style={
                'textAlign': 'left',
                'color': colors['text'],
                'position':'fixed',
                'top':350,
                'left': 7,
                'width' :500,
    }),


    # Country selector: every data column after the first that is not a
    # '_fitted' result column, so the list follows the data instead of a
    # fixed column range.
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each}
                  for each in df_dash.columns[1:]
                  if not each.endswith('_fitted')],
        value=['Germany'], # Which is pre-selected
        multi=True,
        style={
                'position': 'fixed',
                'left' : 7,
                'top' : 425,
                'textAlign': 'left',
                'color': colors['text'],
                'background-color': '#ededdf',
                'font-size' : 'large',
                'height': 100,
                'width': 500,
    }),


    # Target of the update_figure callback.
    dcc.Graph(
        figure=fig,
        id='SIR',
        style = {
            "border":"2px silver solid",
            'backgroundColor': colors['background'],
            'height' : 1161,
            'textAlign': 'center',
            'align' : 'right',
            'position' : 'fixed',
            'left' : 507,
            'top' : 83,
            'width' : '80%',
            'color': colors['text']

    })
])



@app.callback(
    Output('SIR', 'figure'),
    [Input('country_drop_down', 'value')])
def update_figure(country_list):
    """Build the figure with the measured and the simulated curve per country.

    Parameters
    ----------
    country_list : list of str or None
        Value of the country dropdown.

    Returns
    -------
    dict
        Figure with two traces per selected country under 'data' (the country
        column as a line, the matching '_fitted' column as markers+lines, both
        in the colour at the country's position in the selection) and the
        layout under 'layout'. Without a selection the figure has no traces.
    """
    traces = []
    # A cleared dropdown may deliver None; treat it like an empty selection.
    for pos, each in enumerate(country_list or []):

        # Measured values.
        traces.append(dict(x=df_dash.Date,
                                y=df_dash[each],
                                mode='lines',
                                opacity=0.9,
                                name=each,
                                line = dict(color = color_list[pos])
                        )
                )
        # Values fitted by the SIR model.
        traces.append(dict(x=df_dash.Date,
                                y=df_dash[each+'_fitted'],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each+'_simulated',
                                line = dict(color = color_list[pos])
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1800,
                height=1000,
                plot_bgcolor = colors['background'],
                paper_bgcolor = colors['background'],
                xaxis={'title':'Timeline',
                        'tickangle':-25,
                        'nticks':20,
                        'tickfont':dict(size=18,color=colors['text']),
                        'titlefont': dict(size=22, color=colors['text']),
                      },

                yaxis={'type':"log", 'title':'Number of infected people (log-scale)',
                       'tickfont':dict(size=18,color=colors['text']),
                       'titlefont': dict(size=22, color=colors['text'])
                      },
                title={'text': "Real and Simulated Number of Infected People",
                       'y':0.95,
                       'x':0.5,
                       'xanchor': 'center',
                       'yanchor': 'top',
                       'font': dict(size=22, color=colors['text'])
                      }

            )

}

# Script entry point: start the Dash server in debug mode without the reloader.
if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)
