## This notebook explains the data extraction and shows what the data looks like after each transformation

### Importing libraries

In [73]:
import pandas as pd
import yfinance as yf
import numpy as np

### Extracting data from Yahoo Finance API

Here, we need to specify:
- company_symbol: the ticker symbol of the company (or a list of symbols for multiple companies)
- start_date: the first date to extract data for (inclusive)
- end_date: the date on which extraction stops. This date is exclusive: the last data returned is from the day before it

In [75]:
# Tickers to download and the date window to cover.
# `end_date` is exclusive: the last bars returned are from the day before it.
company_symbol = ['AAPL', 'AMZN', 'GOOGL', 'MSFT']
start_date = '2023-08-01'
end_date = '2023-09-01'

# NOTE(review): Yahoo reportedly serves hourly bars only for roughly the last 730 days,
# so this fixed 2023 window may come back empty on a fresh run -- confirm and shift
# the dates if needed.
df = yf.download(
    tickers=company_symbol,
    start=start_date,
    end=end_date,
    interval='1h',
    # Pinned explicitly: newer yfinance releases default to auto_adjust=True, which
    # drops the 'Adj Close' column that the rest of this notebook displays.
    auto_adjust=False,
)
df.head(3)

[*********************100%%**********************]  4 of 4 completed


Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Close,Close,Close,Close,High,High,...,Low,Low,Open,Open,Open,Open,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AAPL,AMZN,GOOGL,MSFT,AAPL,AMZN,GOOGL,MSFT,AAPL,AMZN,...,GOOGL,MSFT,AAPL,AMZN,GOOGL,MSFT,AAPL,AMZN,GOOGL,MSFT
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2023-08-01 09:30:00-04:00,195.475006,131.949997,131.990005,335.720001,195.475006,131.949997,131.990005,335.720001,196.729996,133.320007,...,130.679993,333.700012,196.235001,133.110001,130.774994,335.190002,9552101,12831826,7684072,4617472
2023-08-01 10:30:00-04:00,196.115005,132.619995,132.160004,337.713287,196.115005,132.619995,132.160004,337.713287,196.179993,132.830002,...,131.929993,335.619995,195.479996,131.940002,131.979996,335.757996,4389743,5598671,2815607,3152370
2023-08-01 11:30:00-04:00,195.497299,132.179993,131.714996,337.234985,195.497299,132.179993,131.714996,337.234985,196.145004,132.955002,...,131.690002,337.0,196.110001,132.609894,132.149994,337.700012,2991657,4845837,1910273,1669609


### Reshaping DataFrame to have more readable column indices

In [76]:
# Move the ticker level of the column MultiIndex into the row index so that each row
# is one (timestamp, company) observation.
stacked = df.stack(level=1)

# Name the new (innermost) index level explicitly instead of renaming the
# auto-generated 'level_1' column after reset_index(): that rename silently does
# nothing when the ticker level already carries a name (e.g. 'Ticker' in newer
# yfinance releases), leaving the frame without a 'company' column.
stacked.index = stacked.index.set_names('company', level=-1)

df = stacked.reset_index()
df.head()

Unnamed: 0,Datetime,company,Adj Close,Close,High,Low,Open,Volume
0,2023-08-01 09:30:00-04:00,AAPL,195.475006,195.475006,196.729996,195.279999,196.235001,9552101
1,2023-08-01 09:30:00-04:00,AMZN,131.949997,131.949997,133.320007,131.619904,133.110001,12831826
2,2023-08-01 09:30:00-04:00,GOOGL,131.990005,131.990005,132.350006,130.679993,130.774994,7684072
3,2023-08-01 09:30:00-04:00,MSFT,335.720001,335.720001,335.820007,333.700012,335.190002,4617472
4,2023-08-01 10:30:00-04:00,AAPL,196.115005,196.115005,196.179993,195.330002,195.479996,4389743


### Standardizing column names and rounding float numbers to 2 decimal places

In [77]:
# Normalise the headers to snake_case, e.g. 'Adj Close' -> 'adj_close'.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

# Round every numeric column to 2 decimal places; the other columns are not touched.
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].round(decimals=2)
df.head()

Unnamed: 0,datetime,company,adj_close,close,high,low,open,volume
0,2023-08-01 09:30:00-04:00,AAPL,195.48,195.48,196.73,195.28,196.24,9552101
1,2023-08-01 09:30:00-04:00,AMZN,131.95,131.95,133.32,131.62,133.11,12831826
2,2023-08-01 09:30:00-04:00,GOOGL,131.99,131.99,132.35,130.68,130.77,7684072
3,2023-08-01 09:30:00-04:00,MSFT,335.72,335.72,335.82,333.7,335.19,4617472
4,2023-08-01 10:30:00-04:00,AAPL,196.12,196.12,196.18,195.33,195.48,4389743


### Verifying that variable types are correct

In [78]:
# Display the dtype pandas holds for each column of the reshaped frame.
df.dtypes

datetime     datetime64[ns, America/New_York]
company                                object
adj_close                             float64
close                                 float64
high                                  float64
low                                   float64
open                                  float64
volume                                  int64
dtype: object