# References

https://github.com/ranaroussi/yfinance/tree/master/yfinance

https://github.com/peterlacour/finpie

https://github.com/mariostoev/finviz

https://github.com/lit26/finvizfinance

https://stackoverflow.com/questions/61154530/calling-back-end-api-of-cnbc-in-python

https://github.com/wilsonfreitas/awesome-quant#python

# Update Datasets

Outline:

- Pre-Requisites
- Ticker List
- Static Data:
  - Quotes
  - Description
  - Fundamental
- Live Data:
  - Intraday Quotes
  - News

## Pre-Requisites

### G-Drive Connect
Authorise Google Colaboratory to access Google Drive.

In [1]:
# Mount Google Drive at /content/drive; every dataset path used in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')
from IPython.display import clear_output 
# Wipe the cell output produced by the mount step.
clear_output()

### Requirements

In [2]:
# Install the third-party market-data packages imported in the next cell.
!pip install yfinance finvizfinance finviz finpie yahoo_fin
# Wipe the installer's output from the cell.
clear_output()

### Import dependencies

In [3]:
import pandas as pd
import finpie
import yfinance as yf
import finviz
import finvizfinance
import time
import progressbar
from finvizfinance.screener.overview import Overview
from yahoo_fin import stock_info as si
import requests
from pandas_datareader import data as pdr
import warnings
warnings.filterwarnings("ignore")
clear_output()

## Static Data

The static data records the historic price values for the equities. This should be updated after the markets close at 8 pm EST.

### Ticker List
The ticker symbol list needs to be refreshed regularly, because symbols are listed and delisted every day.

In [4]:
def update_ticker_list():
  """Refresh the three ticker-list CSV files read by the update functions.

  Writes, under ``stocks_ticker_list_latest``:
    * ``symbols.csv``  - the ``Symbol`` column of ``finpie.nasdaq_tickers()``
    * ``S&P500.csv``   - the ``Symbol`` column of ``si.tickers_sp500(...)``
    * ``my_list.csv``  - a fixed, hand-picked list of symbols

  Every file is written without an index column.
  """
  # Latest Nasdaq listing, reduced to its Symbol column.
  nasdaq_symbols=finpie.nasdaq_tickers()['Symbol']
  nasdaq_symbols.to_csv('/content/drive/My Drive/Master Project/Dataset/stocks_ticker_list_latest/symbols.csv',index=None)

  # Current S&P 500 constituents, reduced to their Symbol column.
  sp500_symbols=si.tickers_sp500(include_company_data=True)['Symbol']
  sp500_symbols.to_csv('/content/drive/My Drive/Master Project/Dataset/stocks_ticker_list_latest/S&P500.csv',index=None)

  # Hand-picked list, built directly as a one-column DataFrame.
  watch_list=pd.DataFrame(
      {"Symbol":["AAL","AAPL","AMZN","BBBY","DAL","EBAY","GME","GOOG","KO","MRNA","MSFT","MVIS","NFLX","NIO","NNDM","OCGN","PEP","SNAP","SNDL","TLRY","TSLA","TWTR","UBER","ZM"]}
  )
  watch_list.to_csv('/content/drive/My Drive/Master Project/Dataset/stocks_ticker_list_latest/my_list.csv',index=None)

### Stock Data

Downloading the complete historic data for all 10,000+ symbols (warrants included) takes about 3 hours. To save time, narrow the ticker selection passed to the update functions, for example `"sp500"`, `"my_list"`, or a symbol prefix such as `"W"`.

In [5]:
def set_status(ticker_number,fname):
  """Store a progress counter in the log file belonging to ``fname``.

  The file ``logs/<fname>_data_update_stat.txt`` is opened in write mode, so
  any previous content is replaced by the text form of ``ticker_number``.
  """
  log_path=f'/content/drive/My Drive/Master Project/Dataset/logs/{fname}_data_update_stat.txt'
  with open(log_path, 'w') as log_file:
    log_file.write(str(ticker_number))

In [6]:
def get_status(fname):
  """Read the progress counter stored in the log file belonging to ``fname``.

  Args:
    fname: Name of the list/run; selects
      ``logs/<fname>_data_update_stat.txt``.

  Returns:
    The integer stored in the log file, or 0 when the file cannot be opened
    or its content is not an integer.
  """
  log_path=f'/content/drive/My Drive/Master Project/Dataset/logs/{fname}_data_update_stat.txt'
  try:
    with open(log_path, 'r') as logfile:
      return int(logfile.read())
  except (OSError, ValueError):
    # Missing/unreadable log or non-numeric content: start from the
    # beginning. Only these two cases are handled so that Ctrl-C and
    # programming errors are no longer silently swallowed.
    return 0

In [7]:
def fetching_news(ticker,url="https://api.queryly.com/cnbc/json.aspx",timeout=30):
  """Download search results for ``ticker`` from the CNBC search endpoint.

  The endpoint is queried in batches of 100, advancing ``endindex`` by 100 per
  request, until a batch comes back empty or 1000 requests have been made.
  The collected rows are written to ``news_data/<ticker>.csv`` (without an
  index column) and also returned.

  Args:
    ticker: Symbol used verbatim as the search query and as the file name.
    url: Search endpoint to query.
    timeout: Seconds to wait for each HTTP response. Without a timeout a
      single stalled request would block the caller indefinitely.

  Returns:
    DataFrame with the columns ``Title``, ``Date``, ``Url``, ``Description``.

  Raises:
    requests.RequestException: On network failure, timeout or an HTTP error
      status.
    KeyError: If the response has no ``results`` entry or a result lacks one
      of the expected fields.
  """
  # Query parameters are sent as found in the original request; apart from
  # "query" and "endindex" their meaning is defined by the endpoint.
  params = {
      "queryly_key": "31a35d40a9a64ab3",
      "query": ticker,
      "endindex": "0",
      "batchsize": "100",
      "callback": "",
      "showfaceted": "true",
      "timezoneoffset": "-120",
      "facetedfields": "formats",
      "facetedkey": "formats|",
      "facetedvalue":
      "!Press Release|",
      "needtoptickers": "1",
      "additionalindexes": "4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28"
  }
  # Fields copied from every result, in the order of the output columns.
  header = ["cn:title", "_pubDate", "cn:liveURL", "description"]
  allin = []
  with requests.Session() as req:
    for start_index in range(0, 100000, 100):
      params["endindex"] = start_index
      response = req.get(url, params=params, timeout=timeout)
      # Fail with a clear HTTP error instead of an opaque JSON decode error.
      response.raise_for_status()
      batch = response.json()['results']
      if not batch:
        # An empty batch marks the end of the available results.
        break
      allin.extend([entry[field] for field in header] for entry in batch)
  result = pd.DataFrame(allin, columns=["Title", "Date", "Url", "Description"])
  result.to_csv('/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/news_data/{}.csv'.format(ticker),index=False)
  return result

In [8]:
def fetch_option_chain(ticker):
  """Save the option chain of ``ticker`` as two CSV files.

  ``finpie.yahoo_option_chain`` is expected to return a (calls, puts) pair;
  each half is written to ``option_chain/<ticker>_calls.csv`` and
  ``option_chain/<ticker>_puts.csv`` respectively.
  """
  calls_path='/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/option_chain/{}_calls.csv'.format(ticker)
  puts_path='/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/option_chain/{}_puts.csv'.format(ticker)
  call_table,put_table=finpie.yahoo_option_chain(ticker)
  call_table.to_csv(calls_path)
  put_table.to_csv(puts_path)

In [9]:
def fetch_historic_price(ticker):
  """Save the result of ``finpie.historical_prices(ticker)`` to
  ``market_historic/<ticker>.csv``."""
  destination='/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/market_historic/{}.csv'.format(ticker)
  finpie.historical_prices(ticker).to_csv(destination)

In [10]:
def fetch_ticker_profile(ticker):
  """Save the profile of ``ticker``, joined with its key metrics, to
  ``stock_profile/<ticker>.csv``."""
  fundamentals=finpie.Fundamentals(ticker)
  profile=fundamentals.profile()
  # Key metrics are joined onto the profile on the frames' indexes.
  profile_with_metrics=profile.join(fundamentals.key_metrics())
  destination='/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/stock_profile/{}.csv'.format(ticker)
  profile_with_metrics.to_csv(destination)

In [11]:
def combine_dfs(ticker):
  """Join the saved price history of ``ticker`` with its saved news headlines.

  Reads ``market_historic/<ticker>.csv`` and ``news_data/<ticker>.csv``,
  merges all headlines published on the same calendar day into one string,
  left-joins them onto the price rows by date, drops rows with any missing
  value and writes the result to ``combined/<ticker>.csv``.

  Two columns are added to the price data:
    * ``headline``     - the day's headlines joined with single spaces
    * ``headline_vec`` - the UTF-8 bytes of ``headline`` read as one
      big-endian integer

  Args:
    ticker: Symbol whose two CSV files are combined.

  Raises:
    OSError: If either input file cannot be read.
    KeyError: If an expected column (``date``, ``Date``, ``Title``, ``Url``,
      ``Description``) is missing.
  """
  main=pd.read_csv(f'/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/market_historic/{ticker}.csv')
  news=pd.read_csv(f'/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/news_data/{ticker}.csv')
  # read_csv returns the date column as text; parse it so both join keys are
  # datetimes and can actually match.
  # NOTE(review): assumes the price file holds one row per calendar day -
  # confirm against finpie.historical_prices output.
  main['date']=pd.to_datetime(main['date'])
  main=main.set_index('date')
  # Truncate the publication timestamp to its calendar day.
  news['Date']=pd.to_datetime(news['Date']).dt.normalize()
  news=news.drop(columns=["Url","Description"])
  news=news.rename(columns={"Date":"date","Title":"headline"})
  news=news.groupby('date').agg({'headline': ' '.join})
  # Build the whole column at once instead of assigning element by element
  # through a chained, positional lookup. dtype=object keeps the arbitrarily
  # large Python integers intact.
  news['headline_vec']=pd.Series(
      [int.from_bytes(bytes=text.encode(),byteorder='big') for text in news['headline']],
      index=news.index,
      dtype=object,
  )
  combine=main.join(news,sort=True)
  combine=combine.dropna()
  combine.to_csv(f'/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/combined/{ticker}.csv')

In [12]:
def update_historical_data(letter="sp500"):
  """Download profile, price history, news and option chain for many symbols.

  Progress is checkpointed after every symbol under the name ``letter`` so an
  interrupted run resumes where it stopped; the checkpoint is reset to 0 once
  the whole list has been processed.

  Args:
    letter: Selects the symbols to update:
      ``"all"``     - every symbol in ``symbols.csv``
      ``"sp500"``   - the symbols in ``S&P500.csv``
      ``"my_list"`` - the symbols in ``my_list.csv``
      anything else - the symbols in ``symbols.csv`` that start with this
      text (a full ticker such as ``"MSFT"`` works as well).

  A symbol for which any download step fails is skipped (the remaining steps
  for that symbol are not attempted); the skipped symbols are printed at the
  end. Ctrl-C stops the run and leaves the checkpoint in place.
  """
  list_dir='/content/drive/My Drive/Master Project/Dataset/stocks_ticker_list_latest/'
  named_lists={"all":"symbols.csv","sp500":"S&P500.csv","my_list":"my_list.csv"}
  if letter in named_lists:
    symbols=pd.read_csv(list_dir+named_lists[letter])
  else:
    symbols=pd.read_csv(list_dir+'symbols.csv')
    # Symbols read back as missing values are not strings; leave them out
    # instead of failing on .startswith.
    symbols=symbols[[isinstance(x,str) and x.startswith(letter) for x in symbols['Symbol']]]

  bar = progressbar.ProgressBar(
      widgets=[
          progressbar.Counter(format='%(value)5d/%(max_value)d | %(dynamic_messages)6s | '),
          progressbar.Timer(format= 'Elapsed: %(elapsed)s '),
          progressbar.Bar('*'),
          ' ',
          progressbar.AdaptiveETA(),
      ],
  )
  log_start=get_status(letter)
  # Positional slice: the checkpoint counts rows, not index labels.
  pending=symbols['Symbol'].iloc[log_start:]
  failed=[]
  for offset,symbol in enumerate(bar(pending),start=1):
    try:
      bar.dynamic_messages=symbol
      fetch_ticker_profile(symbol)
      fetch_historic_price(symbol)
      fetching_news(symbol)
      fetch_option_chain(symbol)
      time.sleep(1)
    except Exception:
      # Best effort: one symbol failing must not abort the batch. Only
      # Exception is caught so that Ctrl-C still interrupts the loop.
      failed.append(symbol)
    # Checkpoint by position (handled symbols), so a failure neither blocks
    # nor shifts the resume point.
    set_status(log_start+offset,letter)
  set_status(0,letter)
  if failed:
    print(f"\n{len(failed)} symbol(s) could not be fully updated: {', '.join(map(str,failed))}")

## Latest Intraday (2 years)

In [13]:
def fetch_intraday_price(data,symbols):
  """Split a multi-ticker intraday download into one CSV file per symbol.

  Args:
    data: Downloaded price table from which ``data[symbol]`` selects the
      rows/columns of a single symbol.
    symbols: DataFrame with a ``Symbol`` column listing the symbols to save.

  For every symbol the rows containing missing values are dropped and the
  rest is written to ``market_intraday/<symbol>.csv``. Progress is
  checkpointed under the name ``"intraday"`` and reset to 0 when the list is
  finished. Symbols that cannot be saved (for example because they are absent
  from ``data``) are skipped and printed at the end.
  """
  list_type="intraday"
  bar = progressbar.ProgressBar(
      widgets=[
          progressbar.Counter(format='Saving to File | %(value)5d/%(max_value)d | %(dynamic_messages)6s | '),
          progressbar.Timer(format= 'Elapsed: %(elapsed)s '),
          progressbar.Bar('*'),
          ' ',
          progressbar.AdaptiveETA(),
      ],
  )
  log_start=get_status(list_type)
  if log_start>=len(symbols):
    # The "intraday" checkpoint is shared by every ticker list; a value left
    # behind by a longer list must not make this run skip everything.
    log_start=0
  pending=symbols['Symbol'].iloc[log_start:]
  failed=[]
  for offset,symbol in enumerate(bar(pending),start=1):
    try:
      bar.dynamic_messages=symbol
      # dropna() returns a new frame, so the shared download is not modified.
      stock=data[symbol].dropna()
      stock.to_csv(f'/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/market_intraday/{symbol}.csv')
    except Exception:
      # Best effort per symbol; Ctrl-C is deliberately not caught.
      failed.append(symbol)
    # Record progress under list_type, by position in the list.
    set_status(log_start+offset,list_type)
  set_status(0,list_type)
  if failed:
    print(f"\n{len(failed)} symbol(s) could not be saved: {', '.join(map(str,failed))}")

In [14]:
def update_intraday_master(letter="sp500"):
  """Download two years of 60-minute prices and save them to CSV.

  All selected symbols are requested in one call; the combined table is
  written to ``market_intraday/__master_dataset_<letter>__.csv`` and then
  passed to ``fetch_intraday_price`` to be split into per-symbol files.

  Args:
    letter: Selects the symbols to download:
      ``"all"``     - every symbol in ``symbols.csv``
      ``"sp500"``   - the symbols in ``S&P500.csv``
      ``"my_list"`` - the symbols in ``my_list.csv``
      anything else - the symbols in ``symbols.csv`` that start with this
      text.
  """
  list_dir='/content/drive/My Drive/Master Project/Dataset/stocks_ticker_list_latest/'
  named_lists={"all":"symbols.csv","sp500":"S&P500.csv","my_list":"my_list.csv"}
  if letter in named_lists:
    symbols=pd.read_csv(list_dir+named_lists[letter])
  else:
    symbols=pd.read_csv(list_dir+'symbols.csv')
    symbols=symbols[[isinstance(x,str) and x.startswith(letter) for x in symbols['Symbol']]]
  # Symbols read back from CSV as missing values cannot be requested or used
  # as file names; leave them out.
  symbols=symbols.dropna(subset=['Symbol'])
  # Space-separated ticker string, as passed to the download call below.
  stock_list=" ".join(symbols['Symbol'])
  yf.pdr_override()
  data=pdr.get_data_yahoo(stock_list,interval='60m',period='2y',group_by='ticker')
  data.to_csv(f'/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/market_intraday/__master_dataset_{letter}__.csv')
  clear_output()
  fetch_intraday_price(data,symbols)

In [15]:
#update_intraday_master()

In [16]:
#data['AAPL'][data['AAPL']['Volume']<10000]

In [17]:
'''
symbols=pd.read_csv('/content/drive/My Drive/Master Project/Dataset/stocks_ticker_list_latest/S&P500.csv')
stock_list=""
i=1
for x in symbols['Symbol']:
  if i==1:
    stock_list+=x
    i=2
  else:
    stock_list+=" "+x  
'''

'\nsymbols=pd.read_csv(\'/content/drive/My Drive/Master Project/Dataset/stocks_ticker_list_latest/S&P500.csv\')\nstock_list=""\ni=1\nfor x in symbols[\'Symbol\']:\n  if i==1:\n    stock_list+=x\n    i=2\n  else:\n    stock_list+=" "+x  \n'

In [18]:
#data=yf.download(stock_list,interval='60m',period='2y',group_by='ticker')
#data=pdr.get_data_yahoo(stock_list,period='2y',group_by='ticker')

In [19]:
#data.to_csv(f'/content/drive/My Drive/Master Project/Dataset/stock_market_datasets/market_historic/__master_dataset_sp500__.csv')

# Main

Use the functions defined above to update the ticker lists and the intraday and historical datasets.

In [20]:
update_ticker_list()

In [21]:
update_intraday_master("W")

Saving to File |   202/202 |    WYY | Elapsed: 0:00:24 |*******| Time:  0:00:24


In [22]:
update_historical_data("MSFT")

    1/1 |   MSFT | Elapsed: 0:00:44 |**************************| Time:  0:00:44


In [21]:
print("Sample News Data")
fetching_news("MSFT")

Sample News Data


Unnamed: 0,Title,Date,Url,Description
0,"Einhorn takes aim at Chamath Palihapitiya, Elo...",4/15/2021 8:44:45 PM,https://www.cnbc.com/video/2021/04/15/einhorn-...,CNBC's Leslie Picker reports on how Greenlight...
1,GameStop is 'doing so many things right' — Cra...,3/24/2021 4:09:51 PM,https://www.cnbc.com/video/2021/03/24/gamestop...,GameStop missed on the top and bottom lines of...
2,"Given GME volatility, size investments accordi...",2/25/2021 10:33:01 PM,https://www.cnbc.com/video/2021/02/25/given-gm...,"CNBC's ""Closing Bell"" team breaks down GameSto..."
3,Reddit trader 'Roaring Kitty' explains his GME...,2/18/2021 9:33:21 PM,https://www.cnbc.com/video/2021/02/18/reddit-t...,The House Committee on Financial Services hold...
4,Here's the difference between Reddit-fueled GM...,2/11/2021 10:34:13 PM,https://www.cnbc.com/video/2021/02/11/heres-th...,"CNBC's ""Closing Bell"" team discusses Reddit tr..."
...,...,...,...,...
935,"The Word on Merger Mania, Specialty Retail and...",5/26/2007 2:16:53 AM,https://www.cnbc.com/id/18866000,"Day Off, Deals On?: The headline: Will The Lon..."
936,"Silly Dylan, ""Picks"" Are For Kids!",4/27/2007 3:01:20 AM,https://www.cnbc.com/id/18337627,It seems like traders are getting younger and ...
937,No. 3 - Wii Are The World,4/12/2007 2:49:54 AM,https://www.cnbc.com/id/18063563,Underdog Nintendo Wii Runaway Winner of Consol...
938,"The Word on Tiffany's, GameStop & More...",3/27/2007 3:16:42 AM,https://www.cnbc.com/id/17803172,DIAMOND IN THE ROUGH:The news: Upscale jewelry...


In [23]:
update_intraday_master("my_list")

Saving to File |    24/24 |     ZM | Elapsed: 0:00:09 |********| Time:  0:00:09


In [24]:
update_historical_data("my_list")

   24/24 |     ZM | Elapsed: 0:04:04 |*************************| Time:  0:04:04


In [22]:
sp500_ticker_list = si.tickers_sp500(include_company_data=True)
sp500_ticker_list

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie Inc.,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...,...
500,YUM,Yum! Brands Inc,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
502,ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
503,ZION,Zions Bancorp,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [25]:
update_intraday_master("sp500")

Saving to File |   505/505 |    ZTS | Elapsed: 0:03:28 |*******| Time:  0:03:28


In [26]:
update_historical_data("sp500")

  379/379 |    ZTS | Elapsed: 0:29:18 |************************| Time:  0:29:18


In [23]:
all_ticker_list=finpie.nasdaq_tickers()
all_ticker_list

Unnamed: 0,Symbol,Security Name
0,AACG,ATA Creativity Global - American Depositary Sh...
1,AACQ,Artius Acquisition Inc. - Class A Common Stock
2,AACQU,Artius Acquisition Inc. - Unit consisting of o...
3,AACQW,Artius Acquisition Inc. - Warrant
4,AAL,"American Airlines Group, Inc. - Common Stock"
...,...,...
10542,ZTO,ZTO Express (Cayman) Inc. American Depositary ...
10543,ZTR,Virtus Total Return Fund Inc.
10544,ZTS,Zoetis Inc. Class A Common Stock
10545,ZUO,"Zuora, Inc. Class A Common Stock"


In [None]:
update_historical_data("all")

   79/10547 |   ADMS | Elapsed: 0:03:39 |                      | ETA:   5:50:56

In [None]:
update_intraday_master("all")

                                                                               Saving to File |     0/10547 |     {} | Elapsed: 0:00:00 |     | ETA:  --:--:--