In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
import requests
from pandas_datareader import data

  from pandas.util.testing import assert_frame_equal


In [5]:
csv = '../Gareth Hughes/owid-covid-data.csv'

In [6]:
df = pd.read_csv(csv)

## PANDAS DataAnalysis with WHO COVID19 Data

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
pd.to_datetime(df['date'])

In [None]:
df['iso_code'].unique()

In [None]:
df['iso_code'].unique().size

In [None]:
df['location'].unique()

In [None]:
# Earliest reported date in the database.
first_reported = df[df['date'] == df['date'].min()]

first_reported[first_reported['total_cases'] > 0]

In [None]:
first_reported.location.unique().size

In [None]:
latest_report = df[df['date'] == df['date'].max()]
print(latest_report.location.unique().size)
print(df['date'].min())
print(df['date'].max())

In [None]:
#df.isna
#df = df.fillna(0)

In [None]:
df['total_tests_per_thousand'].mode()

### Tests per thousand, best country.

In [10]:
idx = df['total_tests_per_thousand'] == df['total_tests_per_thousand'].max()
df[['location','total_tests_per_thousand']][idx]

Unnamed: 0,location,total_tests_per_thousand
5470,Iceland,111.308


### Highest new deaths.

In [11]:
idx = df['new_deaths'] == df['new_deaths'].max()
df[['location','new_deaths']][idx]

Unnamed: 0,location,new_deaths
11001,United States,4928


### Highest total deaths.

In [12]:
idx = df['total_deaths'] == df['total_deaths'].max()
df[['location','total_deaths']][idx]

Unnamed: 0,location,total_deaths
11002,United States,33284


### Highest total cases.

In [13]:
idx = df['total_cases'] == df['total_cases'].max()
df[['location','total_cases']][idx]

Unnamed: 0,location,total_cases
11002,United States,671331


## Subsetting based on a specific date to get monthly updates.

In [None]:
april = df[df.date == '2020-04-17']

In [None]:
april_tc = april[['date','location','total_cases','total_deaths','total_tests']]
april_tc
april_tc.columns

In [None]:
march = df[df.date == '2020-03-31']

In [None]:
feb = df[df.date == '2020-02-29']

In [None]:
jan = df[df.date == '2020-01-31']

In [None]:
first_reported = df[df.date == '2019-12-31']

In [None]:
march_tc = march[['date','location','total_cases','total_deaths','total_tests']]

### Month_tc = Dataframes whereby Total_Cases, Total_Deaths and Total_Tests are subset based from a specific date.

In [None]:
# here are the total number of cases as March 31st 2020.
march_tc

In [None]:
# here are the total number of cases as Feb 29th 2020.
feb_tc = feb[['date','location','total_cases','total_deaths','total_tests']]

In [None]:
feb_tc

In [None]:
jan_tc = jan[['date','location','total_cases','total_deaths','total_tests']]
jan_tc

In [None]:
first_reported_tc = first_reported[['date','location','total_cases','total_deaths','total_tests']]
first_reported_tc

In [None]:
first_reported_tc[first_reported_tc['total_cases'] > 0 ]

### Subsetting of Spain from overall Dataframe, useful for specific market analysis?

In [None]:
spain = df[df['location'] == 'Spain']

In [None]:
spain

## Subsetting Spain dates into one DataFrame

In [None]:
spain_jan = spain[spain['date'] == '2020-01-31']
spain_feb = spain[spain['date'] == '2020-02-29']
spain_march = spain[spain['date'] == '2020-03-31']
spain_april = spain[spain['date'] == '2020-04-15']

In [None]:
spain_sets = spain[(spain['date'] == '2020-01-31')
                   & (spain['date'] == '2020-02-28')
                  & (spain['date'] == '2020-03-31')
                   & (spain['date'] == '2020-04-15')]
            

In [None]:
spain_sets

In [None]:
sort = df.sort_values(by = 'total_cases', ascending = False)

In [None]:
april_top10 = sort[sort['date'] == '2020-04-17'].head(10)

## COVID19 Plots with Time

In [None]:
april_top10.plot(x = 'location', y = 'total_cases', kind = 'bar', legend = None)
plt.gca().invert_xaxis()
plt.title("Total Coronavirus Cases as of 2020-04-17")
plt.xlabel("Location")
plt.ylabel("Cases")
plt.gcf().set_size_inches(20,10)
plt.savefig('2020-04-17.png', bbox_inches='tight')
plt.show()

In [None]:
earlyapril_top10 = sort[sort['date'] == '2020-04-08'].head(10)

In [None]:
earlyapril_top10.plot(x = 'location', y = 'total_cases', kind = 'bar', legend = None)
plt.gca().invert_xaxis()
plt.title("Total Coronavirus Cases as of 2020-04-08")
plt.xlabel("Location")
plt.ylabel("Cases")
plt.gcf().set_size_inches(20,10)
plt.savefig('2020-04-08.png', bbox_inches='tight')
plt.show()

In [None]:
endmarch_top10 = sort[sort['date'] == '2020-03-31'].head(10)

In [None]:
endmarch_top10.plot(x = 'location', y = 'total_cases', kind = 'bar', legend = None)
plt.gca().invert_xaxis()
plt.title("Total Coronavirus Cases as of 2020-03-31")
plt.xlabel("Location")
plt.ylabel("Cases")
plt.gcf().set_size_inches(20,10)
plt.savefig('2020-03-31.png', bbox_inches='tight')
plt.show()

In [None]:
midmarch_top10 = sort[sort['date'] == '2020-03-16'].head(10)

In [None]:
midmarch_top10.plot(x = 'location', y = 'total_cases', kind = 'bar', legend = None)
plt.gca().invert_xaxis()
plt.title("Total Coronavirus Cases as of 2020-03-16")
plt.xlabel("Location")
plt.ylabel("Cases")
plt.gcf().set_size_inches(20,10)
plt.savefig('2020-03-16.png', bbox_inches='tight')
plt.show()

In [None]:
startmarch_top10 = sort[sort['date'] == '2020-03-08'].head(10)

In [None]:
startmarch_top10.plot(x = 'location', y = 'total_cases', kind = 'bar', legend = None)
plt.gca().invert_xaxis()
plt.title("Total Coronavirus Cases as of 2020-03-08")
plt.xlabel("Location")
plt.ylabel("Cases")
plt.gcf().set_size_inches(20,10)
plt.savefig('2020-03-08.png', bbox_inches='tight')
plt.show()

In [None]:
feb_top10 = sort[sort['date'] == '2020-02-29'].head(10)

In [None]:
feb_top10.plot(x = 'location', y = 'total_cases', kind = 'bar', legend = None)
plt.gca().invert_xaxis()
plt.title("Total Coronavirus Cases as of 2020-02-29")
plt.xlabel("Location")
plt.ylabel("Cases")
plt.gcf().set_size_inches(20,10)
plt.savefig('2020-02-29.png', bbox_inches='tight')
plt.show()

In [None]:
jan_top10 = sort[sort['date'] == '2020-01-30'].head(3)
jan_top10.plot(x = 'location', y = 'total_cases', kind = 'bar', legend = None)
plt.gca().invert_xaxis()
plt.title("Total Coronavirus Cases as of 2020-01-30")
plt.ylabel("Cases")
plt.gcf().set_size_inches(10,5)
plt.show()

### Testing of Stockmarket Module of Pandas

In [None]:
start_date = '2020-02-01'
end_date = '2020-04-17'
panel_data = data.DataReader('^IBEX', 'yahoo', start_date, end_date)
panel_data.shape

In [None]:
panel_data.head(10)

In [None]:
panel_data.tail(10)

In [None]:
# subset Spain total cases against market!
spaTotCases = df[['date','total_cases']][df['location'] == 'Spain']
spaTotCases = spaTotCases[spaTotCases.total_cases > 0].reset_index()
print(spaTotCases.shape)
spaTotCases.head()

In [None]:
# stock prices doesn't have w-ends!
#pd.to_datetime(spaTotCases.date[2]) == panel_data.index[0]
boolDate  = pd.to_datetime(spaTotCases.date) == panel_data.index[0]
for date in panel_data.index:
    boolDate = boolDate | (pd.to_datetime(spaTotCases.date) == date)

In [None]:
# initialize stock column with NaN
stockMean = (panel_data.High + panel_data.Low)/2
spaTotCases['stock'] = stockMean
# then point properly
spaTotCases['stock'].loc[boolDate] = stockMean.values

In [None]:
spaTotCases.head(10)

In [None]:
spaTotCases.tail(10)

In [None]:
spaTotCases.to_csv('spainTotCaseStock.csv')

## Italy and its market (FTSE MIB)

In [6]:
# subset Italy total cases against market!
itaTotCases = df[['date','total_cases']][df['location'] == 'Italy']
itaTotCases = itaTotCases[itaTotCases.total_cases > 0].reset_index()
print(itaTotCases.shape)
itaTotCases.head()

(78, 3)


Unnamed: 0,index,date,total_cases
0,5612,2020-01-31,3
1,5613,2020-02-01,3
2,5614,2020-02-02,3
3,5615,2020-02-03,3
4,5616,2020-02-04,3


In [31]:
start_date = '2020-01-31'
end_date = '2020-04-17'
panel_data = data.DataReader('IMIB.MI', 'yahoo', start_date, end_date)
panel_data.shape
#ETFMIB.MI
# CSMIB.MI
# LEVMIB.MI
# IMIB.MI
#FTSEMIB.MI
#ITLMS.MI
#FTSEMIBN.MI

(47, 6)

In [37]:
panel_data.tail(10)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-24,10.264,9.743,9.865,10.264,164563.0,10.264
2020-03-25,10.662,9.958,10.406,10.276,173912.0,10.276
2020-03-26,10.44,10.076,10.17,10.44,101973.0,10.44
2020-03-27,10.296,9.99,10.244,10.092,125078.0,10.092
2020-03-30,10.112,9.798,10.092,10.11,168518.0,10.11
2020-03-31,10.358,10.0,10.21,10.196,224916.0,10.196
2020-04-01,10.06,9.914,9.998,9.914,93261.0,9.914
2020-04-02,10.064,9.88,10.056,10.038,45493.0,10.038
2020-04-03,10.03,9.87,10.03,9.878,73858.0,9.878
2020-04-06,10.214,10.08,10.146,10.208,50438.0,10.208


In [33]:
# stock prices doesn't have w-ends!
#pd.to_datetime(itaTotCases.date[2]) == panel_data.index[0]
boolDate  = pd.to_datetime(itaTotCases.date) == panel_data.index[0]
for date in panel_data.index:
    boolDate = boolDate | (pd.to_datetime(itaTotCases.date) == date)

In [34]:
# initialize stock column with NaN
stockMean = (panel_data.High + panel_data.Low)/2
itaTotCases['stock'] = stockMean
# then point properly
itaTotCases['stock'].loc[boolDate] = stockMean.values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [39]:
itaTotCases.tail(20)

Unnamed: 0,index,date,total_cases,stock
58,5670,2020-03-29,92472,
59,5671,2020-03-30,97689,9.955
60,5672,2020-03-31,101739,10.179
61,5673,2020-04-01,105792,9.987
62,5674,2020-04-02,110574,9.972
63,5675,2020-04-03,115242,9.95
64,5676,2020-04-04,119827,
65,5677,2020-04-05,124632,
66,5678,2020-04-06,128948,10.147
67,5679,2020-04-07,132547,


In [40]:
itaTotCases.to_csv('italyTotCaseStock.csv')

## Germany and its market (DAX 30)

In [43]:
# subset Germany total cases against market!
gerTotCases = df[['date','total_cases']][df['location'] == 'Germany']
gerTotCases = gerTotCases[gerTotCases.total_cases > 0].reset_index()
print(gerTotCases.shape)
gerTotCases.head()

(81, 3)


Unnamed: 0,index,date,total_cases
0,2634,2020-01-28,1
1,2635,2020-01-29,4
2,2636,2020-01-30,4
3,2637,2020-01-31,5
4,2638,2020-02-01,7


In [45]:
start_date = '2020-01-28'
end_date = '2020-04-17'
panel_data = data.DataReader('^GDAXI', 'yahoo', start_date, end_date)
panel_data.shape
# 3DEL.DE

(57, 6)

In [51]:
panel_data.tail(10)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-02,9650.269531,9337.019531,9587.450195,9570.820312,144225800,9570.820312
2020-04-03,9626.719727,9470.200195,9535.269531,9525.769531,116245300,9525.769531
2020-04-06,10097.30957,9841.490234,9889.030273,10075.169922,126692200,10075.169922
2020-04-07,10590.410156,10225.019531,10464.110352,10356.700195,149513100,10356.700195
2020-04-08,10340.790039,10198.219727,10301.55957,10332.889648,102430700,10332.889648
2020-04-09,10649.790039,10311.700195,10490.650391,10564.740234,134477100,10564.740234
2020-04-14,10820.169922,10658.959961,10733.969727,10696.55957,106043700,10696.55957
2020-04-15,10678.19043,10243.110352,10678.19043,10279.759766,133088500,10279.759766
2020-04-16,10438.370117,10236.240234,10420.259766,10301.540039,106131500,10301.540039
2020-04-17,10756.780273,10542.709961,10607.349609,10625.780273,143341400,10625.780273


In [47]:
# stock prices doesn't have w-ends!
#pd.to_datetime(gerTotCases.date[2]) == panel_data.index[0]
boolDate  = pd.to_datetime(gerTotCases.date) == panel_data.index[0]
for date in panel_data.index:
    boolDate = boolDate | (pd.to_datetime(gerTotCases.date) == date)

In [48]:
# initialize stock column with NaN
stockMean = (panel_data.High + panel_data.Low)/2
gerTotCases['stock'] = stockMean
# then point properly
gerTotCases['stock'].loc[boolDate] = stockMean.values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [52]:
gerTotCases.tail(20)

Unnamed: 0,index,date,total_cases,stock
61,2695,2020-03-29,52547,
62,2696,2020-03-30,57298,9635.649902
63,2697,2020-03-31,61913,9900.149902
64,2698,2020-04-01,67366,9592.254883
65,2699,2020-04-02,73522,9493.644531
66,2700,2020-04-03,79696,9548.459961
67,2701,2020-04-04,85778,
68,2702,2020-04-05,91714,
69,2703,2020-04-06,95391,9969.399902
70,2704,2020-04-07,99225,10407.714844


In [53]:
gerTotCases.to_csv('germanyTotCaseStock.csv')

## US covid and stock market

In [56]:
# subset US total cases against market!
usTotCases = df[['date','total_cases']][df['location'] == 'United States']
usTotCases = usTotCases[usTotCases.total_cases > 0].reset_index()
print(usTotCases.shape)
usTotCases.tail()

(88, 3)


Unnamed: 0,index,date,total_cases
83,10998,2020-04-13,557571
84,10999,2020-04-14,582594
85,11000,2020-04-15,609516
86,11001,2020-04-16,639664
87,11002,2020-04-17,671331


In [3]:
start_date = '2020-01-21'
end_date = '2020-04-17'
panel_data = data.DataReader('^DJI', 'yahoo', start_date, end_date)
panel_data.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-21,29341.210938,29146.470703,29269.050781,29196.039062,320640000,29196.039062
2020-01-22,29320.199219,29172.259766,29263.630859,29186.269531,283440000,29186.269531
2020-01-23,29190.470703,28966.980469,29111.019531,29160.089844,307060000,29160.089844
2020-01-24,29288.789062,28843.310547,29230.390625,28989.730469,380010000,28989.730469
2020-01-27,28671.789062,28440.470703,28542.490234,28535.800781,337270000,28535.800781


In [62]:
panel_data.tail(10)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-03,21447.810547,20863.089844,21285.929688,21052.529297,450010000,21052.529297
2020-04-06,22783.449219,21693.630859,21693.630859,22679.990234,610760000,22679.990234
2020-04-07,23617.240234,22634.449219,23537.439453,22653.859375,594660000,22653.859375
2020-04-08,23513.400391,22682.990234,22893.470703,23433.570312,472740000,23433.570312
2020-04-09,24008.990234,23504.089844,23690.660156,23719.369141,566400000,23719.369141
2020-04-13,23698.929688,23095.349609,23698.929688,23390.769531,394320000,23390.769531
2020-04-14,24040.580078,23683.439453,23690.570312,23949.759766,485910000,23949.759766
2020-04-15,23649.720703,23233.320312,23600.720703,23504.349609,437630000,23504.349609
2020-04-16,23598.080078,23211.380859,23543.660156,23537.679688,471050000,23537.679688
2020-04-17,24264.210938,23817.150391,23817.150391,24242.490234,525950000,24242.490234


In [59]:
# stock prices doesn't have w-ends!
#pd.to_datetime(usTotCases.date[2]) == panel_data.index[0]
boolDate  = pd.to_datetime(usTotCases.date) == panel_data.index[0]
for date in panel_data.index:
    boolDate = boolDate | (pd.to_datetime(usTotCases.date) == date)

In [60]:
# initialize stock column with NaN
stockMean = (panel_data.High + panel_data.Low)/2
usTotCases['stock'] = stockMean
# then point properly
usTotCases['stock'].loc[boolDate] = stockMean.values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [63]:
usTotCases.tail(20)

Unnamed: 0,index,date,total_cases,stock
68,10983,2020-03-29,124665,
69,10984,2020-03-30,143025,21950.084961
70,10985,2020-03-31,164620,22166.224609
71,10986,2020-04-01,189618,21135.834961
72,10987,2020-04-02,216721,21106.394531
73,10988,2020-04-03,245540,21155.450195
74,10989,2020-04-04,277965,
75,10990,2020-04-05,312237,
76,10991,2020-04-06,337635,22238.540039
77,10992,2020-04-07,368196,23125.844727


In [64]:
usTotCases.to_csv('usTotCaseStock.csv')

## china covid and market

In [67]:
# subset china total cases against market!
chinaTotCases = df[['date','total_cases']][df['location'] == 'China']
chinaTotCases = chinaTotCases[chinaTotCases.total_cases > 0].reset_index()
print(chinaTotCases.shape)
chinaTotCases.head()

(109, 3)


Unnamed: 0,index,date,total_cases
0,2014,2019-12-31,27
1,2015,2020-01-01,27
2,2016,2020-01-02,27
3,2017,2020-01-03,44
4,2018,2020-01-04,44


In [76]:
start_date = '2019-12-31'
end_date = '2020-04-17'
panel_data = data.DataReader('000001.SS', 'yahoo', start_date, end_date)
panel_data.shape
# ^SSEC

(71, 6)

In [83]:
panel_data.head(10)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-31,3051.677002,3030.511963,3036.385986,3050.124023,217400,3050.124023
2020-01-02,3098.100098,3066.335938,3066.335938,3085.197998,292500,3085.197998
2020-01-03,3093.819092,3074.518066,3089.021973,3083.785889,261500,3083.785889
2020-01-06,3107.202881,3065.309082,3070.908936,3083.407959,312600,3083.407959
2020-01-07,3105.450928,3084.329102,3085.488037,3104.802002,276600,3104.802002
2020-01-08,3094.239014,3059.131104,3094.239014,3066.893066,297900,3066.893066
2020-01-09,3097.329102,3080.131104,3082.639893,3094.88208,243400,3094.88208
2020-01-10,3105.225098,3081.395996,3102.293945,3092.291016,210400,3092.291016
2020-01-13,3115.570068,3075.384033,3091.49292,3115.570068,210600,3115.570068
2020-01-14,3127.168945,3105.60498,3120.666992,3106.820068,230000,3106.820068


In [78]:
# stock prices doesn't have w-ends!
#pd.to_datetime(chinaTotCases.date[2]) == panel_data.index[0]
boolDate  = pd.to_datetime(chinaTotCases.date) == panel_data.index[0]
for date in panel_data.index:
    boolDate = boolDate | (pd.to_datetime(chinaTotCases.date) == date)

In [79]:
# initialize stock column with NaN
stockMean = (panel_data.High + panel_data.Low)/2
chinaTotCases['stock'] = stockMean
# then point properly
chinaTotCases['stock'].loc[boolDate] = stockMean.values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [82]:
chinaTotCases.head(20)

Unnamed: 0,index,date,total_cases,stock
0,2014,2019-12-31,27,3041.094482
1,2015,2020-01-01,27,
2,2016,2020-01-02,27,3082.218018
3,2017,2020-01-03,44,3084.168579
4,2018,2020-01-04,44,
5,2019,2020-01-05,59,
6,2020,2020-01-06,59,3086.255981
7,2021,2020-01-07,59,3094.890015
8,2022,2020-01-08,59,3076.685059
9,2023,2020-01-09,59,3088.730103


In [84]:
chinaTotCases.to_csv('chinaTotCaseStock.csv')