In [164]:
# Notebook setup: third-party/stdlib imports, then the project helper module and
# the Google credentials object (`creds`) that the later cells reuse.

# Set up modules for Google functionality
from google.cloud import bigquery # To run BQ statements
from google_auth_oauthlib import flow # To authorise as user
from googleapiclient.discovery import build # To pull in from sheets, slides etc. API
from google.auth.transport.requests import Request
from google.cloud.bigquery import magics

# Display
import pprint

# Operating system stuff
import pickle
import os.path
import sys

# Data handling
import json
import requests
from pandas import read_csv
# `pandas.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0, where
# `from pandas import datetime` raises ImportError and aborts this whole cell.
# It was only a re-export of the standard-library class, so import that directly;
# the name `datetime` stays bound to the same class.
from datetime import datetime
import re

# Stats, models, datasheets
import pandas as pd
import pyreadstat
import math

# Visualisation
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib_venn # For venn diagrams
from pandas.plotting import autocorrelation_plot

# Network graphs
import networkx as nx


# Misc
from xlsxwriter.utility import xl_rowcol_to_cell # Used to create cell references
import itertools

# Make the shared helper modules importable. The membership check keeps a re-run
# of this cell from appending the same directory to sys.path again.
REUSABLE_CODE_DIR = r'/home/jupyter/reusable_code'
if REUSABLE_CODE_DIR not in sys.path:
    sys.path.append(REUSABLE_CODE_DIR)

import google_api_functions as gaf
# Same argument as before: the helper directory with a trailing slash.
creds=gaf.Authenticate_Google(REUSABLE_CODE_DIR + '/') # Return logged-in credentials


#sys.path.append(r'/Users/stepwate/Python Codes/Reusable Code')

Tableau resources from Johns Hopkins are available here:
https://www.tableau.com/covid-19-coronavirus-data-resources

Specific Google Sheet holding info is here:
https://docs.google.com/spreadsheets/d/14quQPFErG-hlpsrNgYcX85vW7JMMK5X2vNZrafRcH8c/edit#gid=1815215449

In [189]:

# Route the %%bigquery cell magic through the logged-in credentials.
magics.context.credentials = creds

# BigQuery client "bq" for the analytics dev project, using the same credentials.
bq = bigquery.Client(
    credentials=creds,
    project='itv-bde-analytics-dev',
)


In [190]:
# Read two tabs of the COVID-19 Google Sheet through the shared helper.
# `values` is rebound by each call, so afterwards it holds only what the
# 'COVID-19 Deaths' call returned.
covid_sheet_id = '14quQPFErG-hlpsrNgYcX85vW7JMMK5X2vNZrafRcH8c'

values, confirmed_df = gaf.read_google_sheets_as_rows(
    covid_sheet_id, 'COVID-19 Confirmed', creds, header_row=0
)
values, deaths_df = gaf.read_google_sheets_as_rows(
    covid_sheet_id, 'COVID-19 Deaths', creds, header_row=0
)


In [191]:
# Preview the first rows only, rather than rendering the whole sheet in the output.
deaths_df.head()

In [168]:
# Distinct Province_State values recorded for the United Kingdom.
# `unique` has to be called: without the parentheses the cell only echoed the
# bound method object instead of the list of values.
confirmed_df.loc[confirmed_df['Country_Region'] == 'United Kingdom', 'Province_State'].unique()

In [170]:
# BigQuery client used by the load and query cells below. It is given the same
# logged-in credentials as `bq` and the %%bigquery magic, so every BigQuery call
# in the notebook runs as the same identity instead of this one client silently
# falling back to the environment's default credentials.
client = bigquery.Client(project="itv-bde-analytics-dev", credentials=creds)

In [171]:

# Load the confirmed-cases sheet into the sandbox table `sw_covid_confirmed`.
# DatasetReference replaces the deprecated `client.dataset(...)` helper and yields
# the same table reference.
table_ref = bigquery.DatasetReference(client.project, 'britbox_sandbox').table("sw_covid_confirmed")

# Overwrite the table on each run. A load job appends by default, so re-running
# this cell would stack a second copy of the sheet onto the table and duplicate
# every row for the queries further down.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
job = client.load_table_from_dataframe(
    confirmed_df, table_ref, job_config=job_config, location="EU"
)

job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

In [172]:

# Load the deaths sheet into the sandbox table `sw_covid_deaths`.
# DatasetReference replaces the deprecated `client.dataset(...)` helper and yields
# the same table reference.
table_ref = bigquery.DatasetReference(client.project, 'britbox_sandbox').table("sw_covid_deaths")

# Overwrite the table on each run. A load job appends by default, so re-running
# this cell would stack a second copy of the sheet onto the table and duplicate
# every row for the queries further down.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
job = client.load_table_from_dataframe(
    deaths_df, table_ref, job_config=job_config, location="EU"
)

job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

In [173]:
%%bigquery df
-- Sum of the `Difference` column per Province_State for United Kingdom rows,
-- largest total first. The result is stored in the notebook variable `df`.
SELECT
    Province_State,
    SUM(CAST(Difference AS INT64)) AS count
FROM
    `itv-bde-analytics-dev.britbox_sandbox.sw_covid_confirmed`
WHERE
    Country_Region = 'United Kingdom'
GROUP BY
    Province_State
ORDER BY
    count DESC


In [174]:
%%bigquery df
-- Rebuild `sw_covid_uk`: UK rows (Province_State = 'N/A') from the confirmed and
-- deaths tables, matched on the parsed date. A date present on only one side is
-- kept, with the other side's figures filled in as 0.
CREATE OR REPLACE TABLE `itv-bde-analytics-dev.britbox_sandbox.sw_covid_uk` AS
WITH cases AS (
    SELECT
        CAST(Cases AS INT64) AS cumulative_cases,
        CAST(Difference AS INT64) AS new_daily_cases,
        PARSE_DATE("%m/%d/%Y", date) AS date_value
    FROM `itv-bde-analytics-dev.britbox_sandbox.sw_covid_confirmed`
    WHERE Country_Region = 'United Kingdom'
      AND Province_State = 'N/A'
),
deaths AS (
    SELECT
        CAST(Cases AS INT64) AS cumulative_deaths,
        CAST(Difference AS INT64) AS new_daily_deaths,
        PARSE_DATE("%m/%d/%Y", date) AS date_value
    FROM `itv-bde-analytics-dev.britbox_sandbox.sw_covid_deaths`
    WHERE Country_Region = 'United Kingdom'
      AND Province_State = 'N/A'
)
SELECT
    COALESCE(cases.date_value, deaths.date_value) AS date_value,
    IFNULL(new_daily_cases, 0) AS new_daily_cases,
    IFNULL(cumulative_cases, 0) AS cumulative_cases,
    IFNULL(new_daily_deaths, 0) AS new_daily_deaths,
    IFNULL(cumulative_deaths, 0) AS cumulative_deaths
FROM cases
FULL JOIN deaths
    ON cases.date_value = deaths.date_value



In [183]:
# Fetch the UK daily series (date, new deaths, new cases) into `df`.
# ORDER BY is required: BigQuery does not guarantee row order without it, and the
# line plot in the next cell joins points in row order.
query = """
    SELECT date_value, new_daily_deaths,new_daily_cases
    FROM `itv-bde-analytics-dev.britbox_sandbox.sw_covid_uk`
    ORDER BY date_value
"""
query_job = client.query(
        query
    )  # API request - starts the query

df = query_job.to_dataframe()

In [188]:
# Line chart of every remaining column of `df` against date_value.
df.plot(x='date_value')

## Correlation between daily deaths and the number of daily new cases N days ago

In [175]:
# Correlate each day's new deaths with the new-case count `i` rows (days) earlier,
# for lags 0-49, over the days that recorded at least one death.
#
# Differences from issuing one SQL query per lag:
#   * The table is read once and lagged with Series.shift, instead of starting
#     50 separate BigQuery jobs.
#   * The `new_daily_deaths > 0` filter is applied AFTER lagging. In SQL the WHERE
#     clause runs before window functions, so LAG(..., i) counted back over the
#     surviving rows only. That drops the case counts from before the first death
#     and mis-aligns the lag whenever a zero-death day falls inside the series.
#   * The notebook-level `df` is no longer overwritten on every iteration.
# NOTE(review): like the SQL LAG, this assumes one row per calendar day - confirm.
query = """
    SELECT date_value, new_daily_deaths, new_daily_cases
    FROM `itv-bde-analytics-dev.britbox_sandbox.sw_covid_uk`
    ORDER BY date_value
"""
query_job = client.query(
        query
    )  # API request - starts the query

daily_counts = query_job.to_dataframe()
# Cast to float so the shifted series can hold NaN whatever integer dtype came back.
daily_deaths = daily_counts['new_daily_deaths'].astype('float64')
daily_cases = daily_counts['new_daily_cases'].astype('float64')
has_deaths = daily_deaths > 0

death_lag_corr=[]
for i in range(0,50):
    lagged_cases = daily_cases.shift(i)
    # Series.corr skips pairs where either value is missing, as DataFrame.corr did.
    death_lag_corr.append({'lag':i,'corr':daily_deaths[has_deaths].corr(lagged_cases[has_deaths])})

In [176]:
# One row per lag: columns 'lag' and 'corr'.
correlation_df=pd.DataFrame(death_lag_corr)

# Correlation as a function of lag. The first positional argument of
# DataFrame.plot is `x`, so plot('corr') put the correlation on the x axis and
# drew the lag against it.
correlation_df.plot(x='lag', y='corr')

In [177]:
# Row(s) whose correlation equals the maximum across all lags.
correlation_df.loc[correlation_df['corr'].eq(correlation_df['corr'].max())]

## Correlation between daily cumulative deaths and the number of cumulative cases N days ago

In [80]:
# Correlate cumulative deaths with cumulative cases `i` rows (days) earlier, for
# lags 0-49, over the days where cumulative deaths are above zero.
#
# Differences from issuing one SQL query per lag:
#   * The table is read once and lagged with Series.shift, instead of starting
#     50 separate BigQuery jobs.
#   * The `cumulative_deaths > 0` filter is applied AFTER lagging. In SQL the WHERE
#     clause runs before window functions, so LAG(..., i) could not see any row
#     from before the first death and returned NULL for the first `i` kept rows.
#   * The notebook-level `df` is no longer overwritten on every iteration.
# NOTE(review): like the SQL LAG, this assumes one row per calendar day - confirm.
query = """
    SELECT date_value, cumulative_deaths, cumulative_cases
    FROM `itv-bde-analytics-dev.britbox_sandbox.sw_covid_uk`
    ORDER BY date_value
"""
query_job = client.query(
        query
    )  # API request - starts the query

cumulative_counts = query_job.to_dataframe()
# Cast to float so the shifted series can hold NaN whatever integer dtype came back.
cum_deaths = cumulative_counts['cumulative_deaths'].astype('float64')
cum_cases = cumulative_counts['cumulative_cases'].astype('float64')
any_deaths = cum_deaths > 0

death_lag_corr2=[]
for i in range(0,50):
    lagged_cases = cum_cases.shift(i)
    # Series.corr skips pairs where either value is missing, as DataFrame.corr did.
    death_lag_corr2.append({'lag':i,'corr':cum_deaths[any_deaths].corr(lagged_cases[any_deaths])})


In [81]:
# One row per lag: columns 'lag' and 'corr' for the cumulative series.
correlation_df_cum=pd.DataFrame(death_lag_corr2)

# Correlation as a function of lag. The first positional argument of
# DataFrame.plot is `x`, so plot('corr') put the correlation on the x axis and
# drew the lag against it.
correlation_df_cum.plot(x='lag', y='corr')

# Row(s) with the highest correlation.
correlation_df_cum[correlation_df_cum['corr']==correlation_df_cum['corr'].max()]

In [150]:
# Pull the whole UK table plus two extra columns: new and cumulative cases from
# six rows earlier in date order.
#
# Fixes:
#   * Removed the stray `.format(i)`. The query has no placeholders, and the call
#     only worked because `i` was left over from an earlier loop cell.
#   * Added ORDER BY date_value. BigQuery does not guarantee row order without it,
#     and later cells slice this frame by position ([6:], [-6:], [-10:]).
query = """
    SELECT *
    , lag(new_daily_cases,6) over (order by date_value) as lag6_cases
    , lag(cumulative_cases,6) over (order by date_value) as lag6_cases_cum
    FROM `itv-bde-analytics-dev.britbox_sandbox.sw_covid_uk`
    ORDER BY date_value
"""
query_job = client.query(
        query
    )  # API request - starts the query

full_covid_df = query_job.to_dataframe()

In [151]:
# Peek at the first five rows of the combined frame.
full_covid_df.head(5)

In [154]:

import statsmodels.api as sm

# Regress cumulative deaths on the six-row-lagged cumulative cases.
# The first six rows are skipped - presumably because the lag column is empty
# there; this relies on the frame being in date order.
X = full_covid_df.loc[:, "lag6_cases_cum"].iloc[6:]
y = full_covid_df.loc[:, "cumulative_deaths"].iloc[6:]

# statsmodels takes the response first and the regressor second. No constant
# column is added to X here.
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(exog=X)  # fitted values for the rows used in the fit

# Regression report
model.summary()

In [155]:
# Modelled cumulative deaths = fitted slope x lagged cumulative cases.
# The slope is read from the model fitted in the previous cell rather than
# hand-copied from its summary (previously the literal 0.1741): the copied figure
# goes stale whenever the data is refreshed and the model refitted.
full_covid_df['modelled_deaths']=model.params.iloc[0]*full_covid_df['lag6_cases_cum']

In [156]:
# Actual against modelled cumulative deaths, plotted over the row index.
full_covid_df.loc[:, ['cumulative_deaths', 'modelled_deaths']].plot()

In [129]:
# Ratio of actual to modelled cumulative deaths, plotted over the row index.
(full_covid_df['cumulative_deaths'] / full_covid_df['modelled_deaths']).to_frame().plot()

## pd.DataFrame(np.log(full_covid_df['cumulative_deaths'][7:]),np.log(full_covid_df['modelled_deaths'][7:])).plot()

In [92]:
# Last six rows of the combined frame.
full_covid_df.tail(6)

In [93]:
# Scratch arithmetic: 0.2081 times the difference between two counts.
# NOTE(review): 0.2081 is the multiplier this notebook applies to cumulative_cases
# for `future_deaths_cumulative`; 88621 and 55242 look like cumulative case totals
# read off the table above (latest vs six rows earlier) - confirm.
0.2081*(88621-55242)

In [94]:
# Scratch arithmetic: the same product as the previous cell, plus 11329.
# NOTE(review): 11329 is presumably the current cumulative death count, making
# the result a projected total - confirm.
0.2081*(88621-55242)+11329

In [95]:
# Regress new daily deaths on the six-row-lagged new daily cases.
# The first six rows are skipped - presumably because the lag column is empty
# there; this relies on the frame being in date order.
X = full_covid_df.loc[:, "lag6_cases"].iloc[6:]
y = full_covid_df.loc[:, "new_daily_deaths"].iloc[6:]

# statsmodels takes the response first and the regressor second. No constant
# column is added to X here.
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(exog=X)  # fitted values for the rows used in the fit

# Regression report
model.summary()

In [96]:
# Modelled daily deaths = fitted slope x lagged daily cases.
# The slope is read from the model fitted in the previous cell rather than
# hand-copied from its summary (previously the literal 0.2000): the copied figure
# goes stale whenever the data is refreshed and the model refitted.
full_covid_df['modelled_deaths_daily']=model.params.iloc[0]*full_covid_df['lag6_cases']

In [118]:
# Actual against modelled daily deaths, plotted over the row index.
full_covid_df.loc[:, ['new_daily_deaths', 'modelled_deaths_daily']].plot()

In [123]:
from datetime import timedelta

# Projection columns: each row's date moved six days forward, with that row's
# case counts scaled by fixed multipliers.
# NOTE(review): 0.2000 and 0.2081 are hard-coded; they look like slopes copied
# from the regression summaries - confirm they match the current fits.
six_days = timedelta(days=6)
full_covid_df['future_date'] = full_covid_df["date_value"] + six_days
full_covid_df['future_deaths_daily'] = full_covid_df['new_daily_cases'] * 0.2000
full_covid_df['future_deaths_cumulative'] = full_covid_df['cumulative_cases'] * 0.2081

# Last ten rows, including the new columns.
full_covid_df.tail(10)

In [124]:
# Cumulative cases against date.
full_covid_df.set_index('date_value')['cumulative_cases'].plot.line()

In [108]:
# New daily cases against date.
full_covid_df.set_index('date_value')['new_daily_cases'].plot.line()

In [110]:
import scipy
from sklearn.preprocessing import StandardScaler
import scipy.stats

# Standardise cumulative cases with StandardScaler, ready for the
# distribution-fitting cells below.
y=full_covid_df['cumulative_cases'].to_numpy()
sc=StandardScaler()

# The scaler is given the series as a single column (reshape(-1, 1)) and the
# result is flattened back to 1-D. fit_transform replaces the separate fit and
# transform calls, and no temporary 2-D array needs deleting afterwards.
y_std = sc.fit_transform(y.reshape(-1,1)).flatten()

# Display the result. This has to be the last expression of the cell; a bare
# `y_std` followed by another statement shows nothing.
y_std

In [112]:
# Fit an exponential distribution to the standardised series; `param` holds the
# fitted parameters returned by scipy.
dist = scipy.stats.expon
param = dist.fit(y_std)
    

In [115]:
import numpy as np

# Fit each candidate scipy.stats distribution to the standardised series and
# record the p-value of a Kolmogorov-Smirnov test against the fitted distribution.
# See https://docs.scipy.org/doc/scipy/reference/stats.html for more distributions

dist_names = ['beta',
              'expon',
              'gamma',
              'lognorm',
              'norm',
              'pearson3',
              'triang',
              'uniform',
              'weibull_min', 
              'weibull_max']

# One p-value per candidate, in the same order as dist_names
p_values = []

for distribution in dist_names:
    # Look the distribution up by name and fit its parameters to the data
    dist = getattr(scipy.stats, distribution)
    param = dist.fit(y_std)
    
    # KS test p-value (element [1] of the result), rounded to 5 decimal places
    p = scipy.stats.kstest(y_std, distribution, args=param)[1]
    p = np.around(p, 5)
    p_values.append(p)    

# Collate the results and actually sort them, best fit first. The heading printed
# below promises a sorted table, but the frame used to be left in dist_names
# order. A larger KS p-value means less evidence against the fit.
results = pd.DataFrame({'Distribution': dist_names, 'p_value': p_values})
results = results.sort_values('p_value', ascending=False)

# Report results

print ('\nDistributions sorted by goodness of fit:')
print ('----------------------------------------')
print (results)

In [117]:
# Compare candidate distributions against the standardised series in two ways:
# a Kolmogorov-Smirnov p-value, and a chi-square-style statistic computed on
# cumulative counts over 50 percentile bins. Results are sorted by that statistic.
# See https://docs.scipy.org/doc/scipy/reference/stats.html for more distributions
x = np.arange(len(y))
size = len(y)

dist_names = ['beta',
              'expon',
              'gamma',
              'lognorm',
              'norm',
              'pearson3',
              'triang',
              'uniform',
              'weibull_min', 
              'weibull_max']

# Set up empty lists to store results
chi_square = []
p_values = []

# Set up 50 bins for the chi-square test.
# Bin edges are percentiles of the data, so the observed data will be
# approximately evenly distributed across all bins.
percentile_bins = np.linspace(0,100,51)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = (np.histogram(y_std, bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)

# Loop through candidate distributions

for distribution in dist_names:
    # Look the distribution up by name and fit its parameters to the data
    dist = getattr(scipy.stats, distribution)
    param = dist.fit(y_std)
    
    # KS test p-value (element [1] of the result), rounded to 5 decimal places
    p = scipy.stats.kstest(y_std, distribution, args=param)[1]
    p = np.around(p, 5)
    p_values.append(p)    
    
    # Fitted cumulative distribution function (cdf) evaluated at the bin edges.
    # The last two fitted parameters are passed as loc and scale; anything before
    # them is passed positionally as shape parameters.
    cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2], loc=param[-2], 
                          scale=param[-1])

    # Expected count per bin = difference of the cdf between consecutive edges,
    # scaled by the sample size. np.diff replaces the hand-written loop, whose
    # loop variable `bin` shadowed the builtin of the same name.
    expected_frequency = np.diff(cdf_fitted) * size
    
    # Chi-square-style statistic on the cumulative counts
    cum_expected_frequency = np.cumsum(expected_frequency)
    ss = np.sum(((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
    chi_square.append(ss)
        
# Collate results and sort by goodness of fit (smallest statistic at top)

results = pd.DataFrame({'Distribution': dist_names,
                        'chi_square': chi_square,
                        'p_value': p_values})
results = results.sort_values(['chi_square'])
    
# Report results

print ('\nDistributions sorted by goodness of fit:')
print ('----------------------------------------')
print (results)