In [2]:
import pandas as pd
import calendar
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import hvplot.pandas
import numpy as np
from pathlib import Path

# DATA IMPORT & CLEAN
Bringing in the 3 different disease summary statistics that we found, as well as the SPY, Gold, and Treasury indexes.

---

### DISEASE STATS FIRST

In [3]:
# Locations of the per-disease summary-statistics CSVs.
ebola_path = Path('../Matthew_Richards/ebola_summ_stats_MSR.csv')
sars_path = Path('../Matthew_Richards/sars_summ_stats_MSR.csv')
zika_path = Path('../Matthew_Richards/zika_summ_stats_MSR.csv')

# Load each CSV and append a parsed datetime "Date" column next to the raw
# date column. The Zika file names its raw column "report_date", not "date".
ebola_df = pd.read_csv(ebola_path).assign(
    Date=lambda frame: pd.to_datetime(frame["date"])
)
sars_df = pd.read_csv(sars_path).assign(
    Date=lambda frame: pd.to_datetime(frame["date"])
)
zika_df = pd.read_csv(zika_path).assign(
    Date=lambda frame: pd.to_datetime(frame["report_date"])
)

In [4]:
# Index the Ebola statistics by the parsed "Date" column and discard the raw
# string "date" column, leaving only the numeric statistics as columns.
ebola = ebola_df.drop(columns=['date']).set_index('Date')

In [5]:
# ebola

In [6]:
# Index the SARS statistics by the parsed "Date" column and discard the raw
# string "date" column, leaving only the numeric statistics as columns.
sars = sars_df.drop(columns=['date']).set_index('Date')

In [7]:
# sars.head()

In [8]:
# Index the Zika statistics by the parsed "Date" column.
# NOTE(review): the raw "report_date" column is kept as a regular column here,
# whereas the Ebola/SARS cells drop their raw "date" column -- confirm intended.
zika = zika_df.set_index('Date')

In [9]:
# zika.head()

---
### Looks good, now onto news data.
Going to pull in the news code from Travis's work. The goal is to create a dataframe with news and disease information. 
It is possible that we don't pull this information into a single dataframe, and just plot separate dataframes. 
As long as the dates line up, we should be able to see visual effects in a plot. 

However, if we are trying to do some math on the datasets, we would want them combined.

## HERE'S WHAT I DID:
OK, so we know that on some days of the news cycle, a bunch of articles came out, and I wanted to capture that accurately. 
This way we can work with the data in two ways. 1: We can scale the velocity/position/acceleration of the illness/death figures by the number of news articles about the disease that came out that day. 2: We can just use it as a yes/no value, as we did in Travis's previous code. 
Either way, I concatenated the data. It was taking WAY too long to write a for loop. 

I also put everything into a function because I wanted to learn how to do it, and it'll take less space. Cheers!

---
# NEWS DATA IMPORT AND CLEAN

In [10]:
# Locations of the per-disease news-article CSVs.
ebola_news_path = Path('../Travis_Smith/Ebola_edit.csv')
sars_news_path = Path('../Travis_Smith/Sars_edit.csv')
zika_news_path = Path('../Travis_Smith/Zika_edit.csv')

# Load each CSV and append a parsed datetime "Date" column. The raw date
# column is "Date_format" for Ebola and Zika but "Format_date" for SARS.
ebola_news_df = pd.read_csv(ebola_news_path).assign(
    Date=lambda frame: pd.to_datetime(frame["Date_format"])
)
sars_news_df = pd.read_csv(sars_news_path).assign(
    Date=lambda frame: pd.to_datetime(frame["Format_date"])
)
zika_news_df = pd.read_csv(zika_news_path).assign(
    Date=lambda frame: pd.to_datetime(frame["Date_format"])
)

In [11]:
#

In [12]:
def news_count(df):
    """Count how many news rows fall on each date.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per news article. Must contain a ``Date`` column; every other
        column (e.g. ``dates`` or ``Unnamed: 0``) is ignored and optional.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` in ascending order, one row per distinct date,
        with a single ``news_count`` column holding the number of rows in
        ``df`` for that date. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``Date`` column.
    """
    # size() counts rows per group regardless of the other columns, and
    # groupby already returns its keys sorted, so no column drop or pre-sort
    # is needed. Skipping the drop also avoids a KeyError on frames that lack
    # the helper columns.
    return df.groupby('Date').size().to_frame('news_count')

In [13]:
# ebola_news = news_count(ebola_news_df)
# ebola_news

In [14]:
def concat_df(df1, disease):
    """Join a disease frame and another frame side by side on their shared index.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame whose columns are placed to the right (the notebook passes the
        per-date news counts here).
    disease : pandas.DataFrame
        Frame whose columns are placed to the left (the notebook passes the
        disease statistics here).

    Returns
    -------
    pandas.DataFrame
        Columns of ``disease`` followed by columns of ``df1``, restricted to
        index labels present in both inputs (inner join).
    """
    # An inner join keeps only the labels present in both frames.
    return pd.concat([disease, df1], axis='columns', join='inner')

In [15]:
# Count Ebola articles per date, then attach the counts to the Ebola
# statistics for the dates the two frames share.
ebola_news = news_count(df=ebola_news_df)
ebola_con = concat_df(df1=ebola_news, disease=ebola)

In [16]:
# Count SARS articles per date, then attach the counts to the SARS
# statistics for the dates the two frames share.
sars_news = news_count(df=sars_news_df)
sars_con = concat_df(df1=sars_news, disease=sars)

In [17]:
# Count Zika articles per date, then attach the counts to the Zika
# statistics for the dates the two frames share.
zika_news = news_count(df=zika_news_df)
zika_con = concat_df(df1=zika_news, disease=zika)

In [18]:
# Preview the first five rows of the combined Ebola + news-count frame.
ebola_con.head(n=5)

Unnamed: 0_level_0,death,illness,death_pos,death_v,death_a,illness_pos,illness_v,illness_a,news_count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-10-01,3338.0,7178.0,49.4,-7.52,-4.9,120.8,-6.94,3.01,3
2014-10-03,3439.0,7492.0,50.5,0.55,4.04,157.0,18.1,12.52,1
2014-10-08,3865.0,8033.0,85.2,6.94,1.28,108.2,-9.76,-5.57,3
2014-10-10,4032.0,8399.0,83.5,-0.85,-3.9,183.0,37.4,23.58,5
2014-10-15,4493.0,8997.0,92.2,1.74,0.52,119.6,-12.68,-10.02,2


In [19]:
# Preview the first five rows of the combined SARS + news-count frame.
sars_con.head(n=5)

Unnamed: 0_level_0,illness,death,illness_pos,illness_v,illness_a,death_pos,death_v,death_a,news_count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2003-03-17,167,4,0.0,0.0,0.0,0.0,0.0,0.0,1
2003-03-18,219,4,52.0,52.0,52.0,0.0,0.0,0.0,3
2003-03-19,264,9,45.0,-7.0,-59.0,5.0,5.0,5.0,3
2003-03-20,306,10,42.0,-3.0,4.0,1.0,-4.0,-9.0,6
2003-03-21,350,10,44.0,2.0,5.0,0.0,-1.0,3.0,1


In [20]:
# Preview the first five rows of the combined Zika + news-count frame.
zika_con.head(n=5)

Unnamed: 0_level_0,report_date,ill_pos,illness,illness_v,illness_a,news_count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-02-16,2016-02-16,122,122,0.0,0.0,1
2016-02-23,2016-02-23,198,320,10.86,1.55,1
2016-03-29,2016-03-29,154,1072,-4.86,-0.69,1
2016-04-05,2016-04-05,296,1368,20.29,3.59,1
2016-04-19,2016-04-19,790,2512,62.29,7.71,1


---
### LOOKING GOOD. NEXT, ON TO MARKETS ANALYSIS
HAVEN'T MESSED WITH OR FINISHED THIS YET. 

In [1]:
# ebola_spy_path = Path('../Andres_Mejia/SPY_Sectors_Ebola_AM.csv')
# sars_spy_path = Path('../Andres_Mejia/SPY_Sectors_SARS_AM.csv')
# zika_spy_path = Path('../Andres_Mejia/SPY_Sectors_Zika_AM.csv')

# #READ IN DATA, DATE-TIME FORMAT.
# ebola_spy_df = pd.read_csv(ebola_spy_path)
# ebola_spy_df["Date"]=pd.to_datetime(ebola_spy_df["Date"])

# sars_spy_df = pd.read_csv(sars_spy_path)
# sars_spy_df["Date"]=pd.to_datetime(sars_spy_df["Date"])

# zika_spy_df = pd.read_csv(zika_spy_path)
# zika_spy_df["Date"]=pd.to_datetime(zika_spy_df["Date"])

In [3]:
# ebola_spy.head()

In [4]:
# sars_spy.head()

In [5]:
# zika_spy.head()

---
That looks good, but there are a lot of columns.

In [6]:
# sars_spy.columns.unique()

It'd be useful to write a function that pulls out a specific one, or a group, of these columns, but in the meantime I'm only going to check against the SP500 $returns$.

---

# DATA COMBINATIONS
Need to only pull market information for the data ranges that I have in hand. 
First, I need to drop unnecessary columns of information for my analysis. 
Going to concatenate the dataframes together and plot them. 

In [7]:
# ebola_spy_con = pd.DataFrame()
# sars_spy_l = pd.DataFrame()
# zika_spy_l = pd.DataFrame()

# def index_returns(df,index_name):
#     ebola_spy_l = pd.DataFrame()
#     ebola_spy_l = pd.DataFrame(data=df[index_name],
#                                    index=df.index,
#                                    )
#     ebola_spy_l = ebola_spy_l.pct_change().dropna()
#     ebola_spy_con = pd.concat([df,ebola_spy_l], axis="columns", join="inner", sort=False)
#     return ebola_spy_con
# # -------------------------------------------------------------------------- #    
# #     if df == ebola_sp:
# #         ebola_spy_l = pd.DataFrame(data={disease}_spy['index_name'],
# #                                    index={disease}_spy.index,
# #                                    columns = 'index_name',)
# # -------------------------------------------------------------------------- #        
# #     {disease}_spy_l = pd.DataFrame(data={disease}_spy['index_name'],
# #                                    index={disease}_spy.index,
# #                                    columns = 'index_name',)
# #     {disease}_spy_l = {disease}_spy_l.pct_change()
# #     return {disease}_spy_l

In [9]:
# ebola_spy_l = index_returns(ebola_spy,index_name='SP500_Close')

In [10]:
# ebola_spy_l