# ETL_EDA
This file records the process of acquiring raw data, traforming them, and loading them into a MongoDB. The data are store (almost) in their raw form.

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from mongodb import *
import json

In [2]:
# handle covid data in covid database with two collections named "cases" and "states"
dbname = 'covid'

date_str_to_num = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                   'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                   'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}

def get_cases_data(db=None):
    url = "https://covidtracking.com/data/national/cases"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    data = []
    for tr in soup.body.div.div.main.find_all('tr')[1:]:
        raw = tr.find_all('span')
        date_str, case, new_case = raw[1].contents[0], raw[3].contents[0], raw[5].contents[0]
        month_str, day, year = date_str.replace(",", "").split(" ")
        month = date_str_to_num[month_str]
        date = "{}-{}-{}".format(year, month, day)
        data.append([date, case, new_case])
    df = pd.DataFrame(data=data, columns=['date','case','new_case'])
    df['case'] = [int(df['case'][i].replace(',','')) for i in range(0,len(df))]
    df['new_case'] = [int(df['new_case'][i].replace(',','')) for i in range(0,len(df))]
    if db is not None:
        col = create_collection(db, 'cases')
        insert(df, col)
    return df


def get_states_data(db=None):
    url = "https://data.cdc.gov/resource/9mfq-cb36.json"
    response = requests.get(url)
    res = json.loads(response.text)
    df = pd.DataFrame(res)
    dates = [df['submission_date'].loc[i].split('T')[0] for i in range(len(df['submission_date']))]
    df['date'] = dates
    if db is not None:
        col = create_collection(db, 'states')
        insert(df, col)
    return df


def create_collection(handle, collection_name):
    collection = handle[collection_name]
    collection.drop()
    print('create collection {}'.format(collection_name))
    return collection


def insert(data, handle):
    records = json.loads(data.T.to_json()).values()
    handle.insert(records)



db = connect(dbname)
cases_data = get_cases_data(db)
states_data = get_states_data(db)

create collection cases


  handle.insert(records)


create collection states


In [3]:
# handle stock price data in stock database with only one collection named "historical"

dbname = "stock"
collection_name = "historical"

url = 'https://query1.finance.yahoo.com/v8/finance/chart/{}'

codes = ['AAPL', 'MSFT', 'GOOG', 'FB', 'AMZN', 'WMT', 'GE', 'MMM', 'AMT', 'JNJ', 'PFE', 'JPM', 'V', 'XOM',
         '^GSPC', '^DJI', 'GC=F', 'CL=F']


def get_stock_data():
    all = pd.DataFrame()
    for code in codes:
        period = "1y"
        interval = "1d"
        params = {}
        params["range"] = period
        params["interval"] = interval.lower()
        res = requests.get(url.format(code), params=params)
        res_json = json.loads(res.text)
        data = res_json['chart']['result'][0]
        timestamps = data['timestamp']
        date = [pd.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d') for timestamp in timestamps]
        indicators_data = data['indicators']['quote'][0]
        volume = indicators_data['volume']
        close = indicators_data['close']
        open = indicators_data['open']
        high = indicators_data['high']
        low = indicators_data['low']
        df = pd.DataFrame({'date':date, 'volume':volume, 'close':close, 'open':open, 'high': high, 'low':low, 'code': [code for _ in volume]})
        all = all.append(df, ignore_index=True)
    return all


def create_collection(handle):
    collection = handle[collection_name]
    collection.drop()
    print('create collection {}'.format(collection_name))
    return collection


def insert(data, handle):
    records = json.loads(data.T.to_json()).values()
    handle.insert(records)



data = get_stock_data()

# only focus on stock data during pandemic
data = data[data['date'] >= '2020-01-22']
db = connect(dbname)
col = create_collection(db)
insert(data, col)

  date = [pd.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d') for timestamp in timestamps]


create collection historical


  handle.insert(records)
