In [44]:
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr
from statsmodels.stats.multitest import multipletests

In [45]:
corona_df=pd.read_csv('../data/raw/corona/de_corona.csv',sep='\t')
with open('../data/raw/metadata/de_metadata.json','r', encoding="utf8") as f:
    country_metadata=json.load(f)

region_map = {country_metadata["country_metadata"][i]["covid_region_code"]: 
    country_metadata["country_metadata"][i]["iso3166-2_code"] for i in range(len(country_metadata["country_metadata"]))}
corona_df["iso3166-2"] = corona_df["region_code"].map(region_map)

corona_df

Unnamed: 0,date,region_code,confirmed_addition,deceased_addition,iso3166-2
0,2020-01-02,Nordrhein-Westfalen,1,0,DE-NW
1,2020-01-07,Nordrhein-Westfalen,1,0,DE-NW
2,2020-01-09,Nordrhein-Westfalen,1,1,DE-NW
3,2020-01-12,Nordrhein-Westfalen,1,0,DE-NW
4,2020-01-14,Nordrhein-Westfalen,1,0,DE-NW
...,...,...,...,...,...
5597,2021-02-21,Rheinland-Pfalz,120,0,DE-RP
5598,2021-02-21,Sachsen,68,0,DE-SN
5599,2021-02-21,Sachsen-Anhalt,84,0,DE-ST
5600,2021-02-21,Schleswig-Holstein,113,0,DE-SH


In [46]:
weather_df = pd.read_csv("../data/raw/weather/weather.csv")

weather_df["TemperatureAboveGround"] = weather_df["TemperatureAboveGround"] - 273.15
weather_df = weather_df[weather_df["iso3166-2"].str.startswith("DE")]

weather_df

Unnamed: 0,date,iso3166-2,RelativeHumiditySurface,SolarRadiation,Surfacepressure,TemperatureAboveGround,Totalprecipitation,UVIndex,WindSpeed
0,2020-02-13,DE-BB,76.337444,1.824290e+06,2.403341e+06,3.401573,0.003355,2.777806,4.542822
1,2020-02-13,DE-BE,76.065297,1.786373e+06,2.408182e+06,3.694633,0.003523,4.671329,4.761509
2,2020-02-13,DE-BW,80.113988,1.505760e+06,2.290158e+06,3.077143,0.008013,4.268546,4.467024
3,2020-02-13,DE-BY,81.554346,2.363013e+06,2.275361e+06,2.433053,0.005227,4.417797,3.677414
4,2020-02-13,DE-HB,87.167414,8.389756e+03,2.406940e+06,3.087452,0.007715,1.794872,4.699573
...,...,...,...,...,...,...,...,...,...
20182,2021-02-21,DE-SH,78.077610,5.698494e+06,2.431487e+06,8.793486,0.000000,4.474084,2.532838
20183,2021-02-21,DE-SL,60.483775,8.762859e+06,2.350492e+06,10.366066,0.000000,8.000000,3.247090
20184,2021-02-21,DE-SN,74.638170,6.687276e+06,2.363996e+06,6.127820,0.000008,5.409593,4.648033
20185,2021-02-21,DE-ST,69.587170,7.276057e+06,2.408002e+06,9.268775,0.000000,5.273630,2.972845


In [47]:
merged_df = corona_df.merge(weather_df)

merged_df

Unnamed: 0,date,region_code,confirmed_addition,deceased_addition,iso3166-2,RelativeHumiditySurface,SolarRadiation,Surfacepressure,TemperatureAboveGround,Totalprecipitation,UVIndex,WindSpeed
0,2020-02-16,Mecklenburg-Vorpommern,1,0,DE-MV,69.818665,4.562424e+04,2.392858e+06,11.144049,0.004641,0.211216,7.888859
1,2020-02-17,Brandenburg,1,0,DE-BB,64.189273,9.007504e+05,2.409194e+06,9.162595,0.001005,2.631685,6.319781
2,2020-02-18,Brandenburg,1,0,DE-BB,71.904630,4.042940e+06,2.421421e+06,6.529468,0.002388,4.037911,6.150824
3,2020-02-20,Nordrhein-Westfalen,1,0,DE-NW,81.738535,1.064802e+06,2.383996e+06,6.513867,0.006802,3.524003,6.477330
4,2020-02-24,Baden-Württemberg,1,0,DE-BW,80.314208,6.604918e+05,2.308155e+06,8.744530,0.005284,5.613589,4.663207
...,...,...,...,...,...,...,...,...,...,...,...,...
5575,2021-02-21,Rheinland-Pfalz,120,0,DE-RP,65.642820,8.457394e+06,2.348996e+06,8.973657,0.000000,7.879810,3.009041
5576,2021-02-21,Sachsen,68,0,DE-SN,74.638170,6.687276e+06,2.363996e+06,6.127820,0.000008,5.409593,4.648033
5577,2021-02-21,Sachsen-Anhalt,84,0,DE-ST,69.587170,7.276057e+06,2.408002e+06,9.268775,0.000000,5.273630,2.972845
5578,2021-02-21,Schleswig-Holstein,113,0,DE-SH,78.077610,5.698494e+06,2.431487e+06,8.793486,0.000000,4.474084,2.532838


In [48]:
weather_lost = weather_df.shape[0]-merged_df.shape[0]
corona_lost = corona_df.shape[0]-merged_df.shape[0]
print(f"We lost {weather_lost} rows from the weather dataset.")
print(f"We lost {corona_lost} rows from the corona datset")
print(f"We lost {weather_lost+corona_lost} rows in total, because presumeably the one of the datasets does not have the same amount of dates and so some rows are dropped")


We lost 420 rows from the weather dataset.
We lost 22 rows from the corona datset
We lost 442 rows in total, because presumeably the one of the datasets does not have the same amount of dates and so some rows are dropped


In [49]:
Xs = list(merged_df.columns[6:12])

significance_threshold = 0.005

for var in Xs:
    corr, pvalue = pearsonr(merged_df["confirmed_addition"], merged_df[var])
    print(f"{var}\n{corr:.3f}\t{pvalue}\t{pvalue < significance_threshold}\n")


SolarRadiation
-0.346	4.9463537017058293e-157	True

Surfacepressure
-0.254	3.724268829474388e-83	True

TemperatureAboveGround
-0.364	2.2266597734583237e-174	True

Totalprecipitation
0.046	0.0005757858841694551	True

UVIndex
-0.424	6.229321949127303e-243	True

WindSpeed
-0.033	0.0125469030820965	False



In [50]:
for var in Xs:
    corr, pvalue = spearmanr(merged_df["confirmed_addition"], merged_df[var])
    print(f"{var}\n{corr:.3f}\t{pvalue}\t{pvalue < significance_threshold}\n")

SolarRadiation
-0.516	0.0	True

Surfacepressure
-0.239	4.681330884969123e-73	True

TemperatureAboveGround
-0.578	0.0	True

Totalprecipitation
0.072	6.176316724575035e-08	True

UVIndex
-0.655	0.0	True

WindSpeed
-0.014	0.29525564389645925	False



In [51]:
for var in Xs:
    corr, pvalue = pearsonr(np.log(merged_df["confirmed_addition"] + 1), merged_df[var])
    print(f"{var}\n{corr:.3f}\t{pvalue}\t{pvalue < significance_threshold}\n")

SolarRadiation
-0.469	1.403889609593433e-303	True

Surfacepressure
-0.251	1.0650823309103934e-80	True

TemperatureAboveGround
-0.550	0.0	True

Totalprecipitation
0.022	0.09897196360742531	False

UVIndex
-0.623	0.0	True

WindSpeed
-0.017	0.2149067812046524	False



In [56]:
pvalues = []
tests = ("Linear", "Spearman", "Log")

for var in Xs:
    corr, pvalue = pearsonr(merged_df["confirmed_addition"], merged_df[var])
    pvalues.append(pvalue)

for var in Xs:
    corr, pvalue = spearmanr(merged_df["confirmed_addition"], merged_df[var])
    pvalues.append(pvalue)

for var in Xs:
    corr, pvalue = pearsonr(np.log(merged_df["confirmed_addition"]), merged_df[var])
    pvalues.append(pvalue)


In [57]:
significant, pholmcorrected, _, _ = multipletests(pvalues, alpha = 0.005, method = "holm")

for i in range(len(tests)):
    for j in range(len(Xs)):
        print(f"{tests[i]}\t{Xs[j]}\t{significant[i + j]}")

Linear	SolarRadiation	True
Linear	Surfacepressure	True
Linear	TemperatureAboveGround	True
Linear	Totalprecipitation	True
Linear	UVIndex	True
Linear	WindSpeed	False
Spearman	SolarRadiation	True
Spearman	Surfacepressure	True
Spearman	TemperatureAboveGround	True
Spearman	Totalprecipitation	True
Spearman	UVIndex	False
Spearman	WindSpeed	True
Log	SolarRadiation	True
Log	Surfacepressure	True
Log	TemperatureAboveGround	True
Log	Totalprecipitation	False
Log	UVIndex	True
Log	WindSpeed	True
