# Stock VWAP data crawler (Polygon.io)

 - vwap
 - [docs](https://polygon.io/docs/stocks/getting-started)

In [1]:
# hide output
%%capture output

!pip install setuptools
!pip install -U polygon-api-client

In [2]:
from polygon import RESTClient
import os
import numpy as np
import pandas as pd
from google.colab import drive


# Access drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Sentimental/'


# get stock list
stocks = np.load(os.path.join(path, 'data', 'stock_list.npy'))


# Connect to the Polygon.io REST API. The key file keeps its historical
# "yahoo" name, but the key it holds is handed to polygon's RESTClient.
with open(os.path.join(path, 'data', 'api_key_yahoo.txt'), 'r') as file:
    # Strip surrounding whitespace: a trailing newline left by a text editor
    # would otherwise be sent as part of the key.
    API = file.read().strip()
api_key_yahoo = API
client = RESTClient(API)  # the key is passed explicitly, not read from the environment
# Cheap request issued right after constructing the client, so a bad key or
# connection problem surfaces here rather than in the crawl below.
result = client.get_market_status()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# data source test

### *Polygon.io API*

In [3]:
# Pull the grouped daily aggregates for a single date.
request = client.get_grouped_daily_aggs("2023-02-03")

# Show the VWAP only for tickers that appear in our stock list.
for agg in request:
    if agg.ticker not in stocks:
        continue
    print(agg.ticker, agg.vwap)

AAPL 154.2437
TSLA 193.3842


## *get trading dates*

In [4]:
from datetime import date

# Crawl window; both strings are parsed with pd.Timestamp wherever they are used.
start_date = '01/01/2022'
end_date = '01/01/2024'

# Daily bars of the first ticker in the list act as the reference calendar.
# NOTE(review): the displayed result starts at 2021-12-31, one session before
# start_date -- presumably a timezone effect of passing naive timestamps; confirm.
window_start = pd.Timestamp(start_date)
window_end = pd.Timestamp(end_date)
data_tmp = client.get_aggs(stocks[0], 1, 'day', window_start, window_end)

# Bar timestamps are interpreted as epoch milliseconds and reduced to calendar dates.
bar_timestamps = [bar.timestamp for bar in data_tmp]
times = pd.DataFrame(bar_timestamps, columns=['time'])
bar_days = pd.to_datetime(times['time'], unit='ms').dt.date
trading_dates = pd.DatetimeIndex(bar_days)
trading_dates

DatetimeIndex(['2021-12-31', '2022-01-03', '2022-01-04', '2022-01-05',
               '2022-01-06', '2022-01-07', '2022-01-10', '2022-01-11',
               '2022-01-12', '2022-01-13',
               ...
               '2023-12-15', '2023-12-18', '2023-12-19', '2023-12-20',
               '2023-12-21', '2023-12-22', '2023-12-26', '2023-12-27',
               '2023-12-28', '2023-12-29'],
              dtype='datetime64[ns]', name='time', length=502, freq=None)

## *get vwap*

In [5]:
# One row per reference trading date, one column per ticker.
df = pd.DataFrame(index = trading_dates, columns = stocks)

for stock in stocks:
    data = client.get_aggs(stock, 1, 'day', pd.Timestamp(start_date), pd.Timestamp(end_date))

    # Index every VWAP by the calendar date of its own bar (timestamps are
    # epoch milliseconds) instead of relying on list position.
    bar_dates = pd.to_datetime([row.timestamp for row in data], unit='ms').normalize()
    vwap = pd.Series([row.vwap for row in data], index=bar_dates, dtype=float)

    # Align on the reference calendar: a ticker with missing sessions gets NaN
    # on those dates rather than raising a length mismatch or shifting values
    # onto the wrong rows.
    df[stock] = vwap.reindex(trading_dates)

df

Unnamed: 0_level_0,AAPL,TSLA
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-31,177.8004,355.7282
2022-01-03,181.4156,390.2519
2022-01-04,180.5574,387.3708
2022-01-05,177.2884,375.5189
2022-01-06,173.0800,353.4321
...,...,...
2023-12-22,194.1013,255.1771
2023-12-26,193.1713,256.3719
2023-12-27,192.5679,261.7265
2023-12-28,193.9222,258.4918


## *Save*

In [6]:
# Write the VWAP table to Drive, then read it straight back to check the file.
vwap_csv = os.path.join(path, 'data', 'vwap_2022_2023.csv')
df.to_csv(vwap_csv)

tmp = pd.read_csv(vwap_csv, index_col = 'time')
tmp

Unnamed: 0_level_0,AAPL,TSLA
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-31,177.8004,355.7282
2022-01-03,181.4156,390.2519
2022-01-04,180.5574,387.3708
2022-01-05,177.2884,375.5189
2022-01-06,173.0800,353.4321
...,...,...
2023-12-22,194.1013,255.1771
2023-12-26,193.1713,256.3719
2023-12-27,192.5679,261.7265
2023-12-28,193.9222,258.4918


In [7]:
def _strip_exchange_suffix(stock):
    """Return ``stock`` without a trailing exchange suffix such as ``.US``."""
    base, dot, suffix = stock.rpartition('.')
    # Only a multi-letter alphabetic tail counts as an exchange code, so a
    # single-letter tail (e.g. 'BRK.B') is left untouched.
    if dot and base and len(suffix) >= 2 and suffix.isalpha():
        return base
    return stock


def get_vwap(stock, date, client=None):
    """Return the VWAP of the first daily bar dated strictly after ``date``.

    Parameters
    ----------
    stock : str
        Ticker, optionally carrying an exchange suffix (``'AAPL.US'``). The
        suffix is removed before querying, whatever the ticker length.
    date : str or datetime-like
        Reference date; anything ``pd.Timestamp`` accepts.
    client : optional
        Object exposing ``get_aggs(ticker, multiplier, timespan, from_, to)``.
        When omitted, a ``RESTClient`` is built from ``api_key_yahoo``.

    Returns
    -------
    The ``vwap`` attribute of the matching bar, or ``-1`` when the response
    holds no bar dated after ``date`` (including an empty response).
    """
    if client is None:
        client = RESTClient(api_key_yahoo)

    date = pd.Timestamp(date)
    start_date = date - pd.Timedelta(7, unit='D')
    end_date = pd.Timestamp('01/01/2030')

    data = client.get_aggs(_strip_exchange_suffix(stock), 1, 'day', start_date, end_date)

    for row in data:
        # Bar timestamps are epoch milliseconds; compare on the calendar date.
        bar_date = pd.to_datetime(row.timestamp, unit='ms').normalize()
        if bar_date > date:
            return row.vwap

    # No later session in the response: report the sentinel instead of
    # falling back to the last bar that was seen.
    return -1

In [9]:
# Example lookup: VWAP for AAPL relative to 3 January 2024.
ans = get_vwap(stock='AAPL.US', date='01/03/2024')
ans

182.0183