In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Load the cleaned COVID figures from the local CSV export
# (semicolon-delimited, header on the first row, 'Date' parsed as datetime).
df_covid = pd.read_csv(
    "data/covid_clean_data.csv",
    sep=";",
    header=0,
    parse_dates=['Date'],
)

In [19]:
# Alternative source: read the same COVID CSV from HDFS instead of local disk.
from hdfs import InsecureClient

# Client for the HDFS HTTP endpoint on localhost:9870; the later HDFS cells reuse it.
hdfs_client = InsecureClient('http://localhost:9870')

# Open the remote file as a UTF-8 text stream and let pandas parse it directly.
with hdfs_client.read('/project/covid_clean_data.csv', encoding='utf-8') as reader:
    df_covid = pd.read_csv(
        reader,
        sep=';',
        header=0,
        parse_dates=['Date'],
    )

In [24]:
# Preview the COVID frame; as the cell's last expression it is rendered as the cell output.
df_covid

Unnamed: 0,Date,hospitalized,in intensive care,returning home,deceased
0,2020-03-18,2972,771,816,218
1,2020-03-19,4073,1002,1180,327
2,2020-03-20,5226,1297,1587,450
3,2020-03-21,5900,1453,1811,525
4,2020-03-22,6954,1674,2117,632
...,...,...,...,...,...
56,2020-05-13,21009,2385,58664,17082
57,2020-05-15,19801,2162,60439,17323
58,2020-05-16,19372,2091,61057,17393
59,2020-05-17,19302,2047,61204,17447


In [25]:
# Load the cleaned CAC 40 data from the local CSV export
# (semicolon-delimited, header on the first row, 'Date' parsed as datetime).
df_bourse = pd.read_csv(
    "data/cac40_clean_data.csv",
    sep=";",
    header=0,
    parse_dates=['Date'],
)

In [17]:
# Alternative source: read the same CAC 40 CSV from HDFS via the shared hdfs_client.
with hdfs_client.read('/project/cac40_clean_data.csv', encoding='utf-8') as reader:
    df_bourse = pd.read_csv(
        reader,
        sep=';',
        header=0,
        parse_dates=['Date'],
    )

In [26]:
# Preview the CAC 40 frame; as the cell's last expression it is rendered as the cell output.
df_bourse

Unnamed: 0,Date,Adj Close
0,2020-02-03,5832.509766
1,2020-02-04,5935.049805
2,2020-02-05,5985.399902
3,2020-02-06,6038.180176
4,2020-02-07,6029.750000
...,...,...
67,2020-05-11,4490.220215
68,2020-05-12,4472.500000
69,2020-05-13,4344.950195
70,2020-05-14,4273.129883


In [27]:
# Inner-join the CAC 40 frame with the COVID frame on 'Date', so only the
# dates present in both frames are kept.
df_merge = df_bourse.merge(df_covid, how='inner', on='Date')
df_merge

Unnamed: 0,Date,Adj Close,hospitalized,in intensive care,returning home,deceased
0,2020-03-18,3754.840088,2972,771,816,218
1,2020-03-19,3855.5,4073,1002,1180,327
2,2020-03-20,4048.800049,5226,1297,1587,450
3,2020-03-23,3914.310059,8673,2080,2567,860
4,2020-03-24,4242.700195,10176,2516,3281,1100
5,2020-03-25,4432.299805,12072,2935,4085,1388
6,2020-03-26,4543.580078,13879,3351,4947,1696
7,2020-03-27,4351.490234,15701,3758,5698,1995
8,2020-03-30,4378.509766,20946,5056,7923,3024
9,2020-03-31,4396.120117,22672,5496,9443,3523


In [None]:
# of the 60 covid rows and 71 bourse rows,
# only 39 share the same dates

In [7]:
# Summarise the column dtypes and non-null counts of the joined frame.
df_merge.info()
# No null values found: every column's non-null count equals the number of entries.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39 entries, 0 to 38
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               39 non-null     datetime64[ns]
 1   Adj Close          39 non-null     float64       
 2   hospitalized       39 non-null     int64         
 3   in intensive care  39 non-null     int64         
 4   returning home     39 non-null     int64         
 5   deceased           39 non-null     int64         
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 2.1 KB


In [36]:
# Persist the joined frame locally as a semicolon-separated UTF-8 CSV,
# leaving out the index column.
df_merge.to_csv(
    'data/joined_cac40_covid_data.csv',
    index=False,
    sep=';',
    encoding='utf-8',
)

In [28]:
# Write the joined frame to HDFS as a semicolon-separated CSV without the index column.
# overwrite=True is required for re-runs: the hdfs client's write() refuses to
# replace an existing path by default, so without it this cell fails as soon as
# the file exists (the local to_csv export already replaces its file on each run).
with hdfs_client.write(
    '/project/joined_cac40_covid_data.csv',
    encoding='utf-8',
    overwrite=True,
) as writer:
    df_merge.to_csv(writer, sep=';', index=False)

In [31]:
# Round-trip check: load the joined CSV stored on HDFS back into a new frame.
with hdfs_client.read('/project/joined_cac40_covid_data.csv', encoding='utf-8') as reader:
    df_join = pd.read_csv(
        reader,
        sep=';',
        header=0,
        parse_dates=['Date'],
    )

In [32]:
# Display the frame read back from HDFS so it can be compared with df_merge.
df_join

Unnamed: 0,Date,Adj Close,hospitalized,in intensive care,returning home,deceased
0,2020-03-18,3754.840088,2972,771,816,218
1,2020-03-19,3855.5,4073,1002,1180,327
2,2020-03-20,4048.800049,5226,1297,1587,450
3,2020-03-23,3914.310059,8673,2080,2567,860
4,2020-03-24,4242.700195,10176,2516,3281,1100
5,2020-03-25,4432.299805,12072,2935,4085,1388
6,2020-03-26,4543.580078,13879,3351,4947,1696
7,2020-03-27,4351.490234,15701,3758,5698,1995
8,2020-03-30,4378.509766,20946,5056,7923,3024
9,2020-03-31,4396.120117,22672,5496,9443,3523
