<a href="https://colab.research.google.com/github/yorkjong/stock-reports/blob/main/notebooks/stock_list.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Utilities

In [1]:
# @title GitHub
import base64
import requests
import pandas as pd
from io import StringIO


class GitHub:
    """Small client for the GitHub REST "repository contents" endpoints.

    Every path given to a method is resolved relative to the ``dir`` passed
    to the constructor, and every read or write targets ``branch``.

    Failures are reported with ``print`` and signalled through the return
    value (``None``, ``[]`` or an empty DataFrame) rather than by raising.
    """

    def __init__(self, repo_owner, repo_name, token, dir='', branch='main'):
        """
        Args:
            repo_owner (str): Owner (user or organization) of the repository.
            repo_name (str): Name of the repository.
            token (str): Token sent in the ``Authorization`` header.
            dir (str): Directory inside the repository that all paths are
                relative to. Leading/trailing slashes are ignored.
            branch (str): Branch to read from and commit to.
        """
        base = 'https://api.github.com/repos'
        dir = dir.strip('/')
        if dir:
            self.base_url = f'{base}/{repo_owner}/{repo_name}/contents/{dir}'
        else:
            self.base_url = f'{base}/{repo_owner}/{repo_name}/contents'
        self.branch = branch
        self.token = token

    def _headers(self):
        """Return the authentication/accept headers used by every API call."""
        return {
            'Authorization': f'token {self.token}',
            'Accept': 'application/vnd.github.v3+json',
        }

    def _url(self, path=''):
        """Return the contents URL for *path* (relative to the base dir).

        An empty path maps to the base directory itself, without a trailing
        slash.
        """
        path = path.strip('/')
        return f'{self.base_url}/{path}' if path else self.base_url

    @staticmethod
    def _report_failure(response):
        """Print a failed response; tolerate error bodies that are not JSON."""
        try:
            detail = response.json()
        except ValueError:
            # Proxies and gateways may answer with HTML or an empty body.
            detail = response.text
        print(f"Request failed: {response.status_code} - {detail}")

    def _request(self, method, url, headers=None, params=None, json=None):
        """Send a request and return the decoded JSON body.

        Returns:
            The parsed JSON on status 200/201, otherwise ``None``. A 404 is
            silent (callers use it to mean "not found"); any other status is
            printed.
        """
        response = requests.request(method, url, headers=headers,
                                    params=params, json=json)
        if response.status_code in [200, 201]:
            return response.json()
        if response.status_code != 404:
            self._report_failure(response)
        return None

    def file_exists(self, file_path):
        """Tell whether *file_path* exists on the configured branch.

        Returns:
            ``True`` on status 200, ``False`` on 404, and ``None`` (after
            printing the failure) for any other status.
        """
        response = requests.get(self._url(file_path), headers=self._headers(),
                                params={'ref': self.branch})
        if response.status_code == 200:
            return True
        elif response.status_code == 404:
            return False
        else:
            self._report_failure(response)
            return None

    def list_filenames(self, dir_path=''):
        """List the entry names found under *dir_path*.

        Returns:
            list: Names of the entries, or ``[]`` if the request failed.
        """
        response = requests.get(self._url(dir_path), headers=self._headers(),
                                params={'ref': self.branch})
        if response.status_code == 200:
            entries = response.json()
            # The API answers with a single object (not a list) when the
            # path points at a file instead of a directory.
            if isinstance(entries, dict):
                entries = [entries]
            return [item['name'] for item in entries]
        else:
            self._report_failure(response)
            return []

    def download_file(self, file_path):
        """Download *file_path* and return its text wrapped in a StringIO.

        Returns:
            io.StringIO: The file content, or ``None`` if the file does not
            exist, is not a regular file, or the download failed.
        """
        file_info = self._request('GET', self._url(file_path),
                                  headers=self._headers(),
                                  params={'ref': self.branch})
        if not file_info:
            print(f"File '{file_path}' does not exist. Cannot download.")
            return None

        # A directory listing comes back as a list and has no download URL.
        download_url = (file_info.get('download_url')
                        if isinstance(file_info, dict) else None)
        if not download_url:
            print(f"'{file_path}' is not a downloadable file.")
            return None

        response = requests.get(download_url)
        if response.status_code == 200:
            return StringIO(response.text)
        print(f"Failed to download file: "
              f"{response.status_code} - {response.text}")
        return None

    def download_csv(self, file_path):
        """Download a CSV file and parse it into a DataFrame.

        Returns:
            pandas.DataFrame: The parsed CSV, or an empty DataFrame when the
            file could not be downloaded.
        """
        file_content = self.download_file(file_path)
        # A StringIO object is always truthy, so test for None explicitly.
        if file_content is not None:
            return pd.read_csv(file_content)
        return pd.DataFrame()

    def upload_file(self, file_path, content):
        """Create or overwrite *file_path* with *content*.

        Args:
            file_path (str): Destination path, relative to the base dir.
            content (str | bytes): Data to store. Text is encoded with the
                default (UTF-8) codec before being base64-encoded.
        """
        url = self._url(file_path)
        headers = self._headers()

        raw = content.encode() if isinstance(content, str) else bytes(content)
        payload = {
            'message': 'Uploading file',
            'content': base64.b64encode(raw).decode(),
            'branch': self.branch
        }

        # Updating an existing file requires the SHA of the blob being
        # replaced, looked up on the same branch the commit goes to.
        file_info = self._request('GET', url, headers=headers,
                                  params={'ref': self.branch})
        if file_info:
            payload['sha'] = file_info.get('sha')

        # PUT creates the file, or updates it when 'sha' is present.
        self._request('PUT', url, headers=headers, json=payload)

    def upload_df_as_csv(self, file_path, df):
        """Upload a DataFrame to a CSV file.

        ``.csv`` is appended to *file_path* when it is missing, and the
        DataFrame index is not written.
        """
        if not file_path.endswith('.csv'):
            file_path += '.csv'
        csv_content = df.to_csv(index=False)
        self.upload_file(file_path, csv_content)

    def remove_file(self, file_path):
        """Delete *file_path* from the configured branch, if it exists."""
        url = self._url(file_path)
        headers = self._headers()

        # Look the file up on the target branch: the SHA sent with DELETE
        # must be the one of the blob on that branch, not on the
        # repository's default branch.
        file_info = self._request('GET', url, headers=headers,
                                  params={'ref': self.branch})
        if not file_info:
            print(f"File '{file_path}' does not exist. Skipping deletion.")
            return

        payload = {
            'message': 'Deleting file',
            'sha': file_info['sha'],
            'branch': self.branch
        }
        self._request('DELETE', url, headers=headers, json=payload)

#-------------------------------------------------------------------------------

from google.colab import userdata

# Shared client used by the upload cells further down. All paths are relative
# to the repository's `stock_list` directory and, since no branch is given,
# the constructor's default branch ('main') is used.
# The token is read from the Colab secret named 'GithubToken.stock-reports'.
github = GitHub(
    repo_owner='YorkJong',
    repo_name='stock-reports',
    token=userdata.get('GithubToken.stock-reports'),
    dir='stock_list',
)

In [2]:
# @title From Wikipedia

import functools
from io import StringIO

import requests
import pandas as pd
from bs4 import BeautifulSoup


def table_from_wikipedia(article,
                         class_='wikitable sortable', id='constituents'):
    """
    Fetches a table from a Wikipedia article and returns it as a pandas
    DataFrame.

    The table is first looked up by the exact ``class`` attribute string and
    ``id``. If that finds nothing, the first table that carries *at least*
    the requested classes (plus the requested id) is used, so the lookup
    keeps working when the page adds extra classes to the table.

    Args:
        article (str): The name of the Wikipedia article, as it appears in
            the URL (already percent-encoded where necessary).
        class_ (str): The class attribute of the table to retrieve. A falsy
            value disables filtering by class.
        id (str): The id attribute of the table to retrieve. A falsy value
            disables filtering by id.

    Returns:
        pandas.DataFrame: The retrieved table.

    Raises:
        requests.HTTPError: If the article could not be fetched.
        ValueError: If no matching table exists in the article.
    """
    url = f"https://en.wikipedia.org/wiki/{article}"
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    attrs = {}
    if class_:
        attrs['class'] = class_
    if id:
        attrs['id'] = id
    tag = soup.find('table', attrs=attrs)

    if tag is None and class_:
        # A multi-class string only matches the exact attribute value, so
        # retry treating the requested classes as a required subset.
        wanted = set(class_.split())

        def has_wanted_classes(candidate):
            if candidate.name != 'table':
                return False
            if id and candidate.get('id') != id:
                return False
            return wanted <= set(candidate.get('class') or [])

        tag = soup.find(has_wanted_classes)

    if tag is None:
        # Without this check str(None) would be handed to pandas, which
        # fails with an unhelpful "No tables found" error.
        raise ValueError(
            f"No table with class={class_!r} and id={id!r} found in "
            f"Wikipedia article {article!r}")
    return pd.read_html(StringIO(str(tag)))[0]


def symbols_from_wikipedia_table(article,
                                 class_='wikitable sortable',
                                 id='constituents'):
    """
    Returns the ticker column of a table in a Wikipedia article.

    The table is obtained through ``table_from_wikipedia``. The 'Symbol'
    column is preferred; 'Ticker' is used when 'Symbol' is absent.

    Args:
        article (str): The name of the Wikipedia article.
        class_ (str, optional): The class attribute of the table. Defaults to
            'wikitable sortable'.
        id (str, optional): The id attribute of the table. Defaults to
            'constituents'.

    Returns:
        list: The values of the ticker column, or an empty list when the
            table has neither a 'Symbol' nor a 'Ticker' column.
    """
    table = table_from_wikipedia(article, class_, id)
    # Candidate column names, in order of preference.
    for column in ('Symbol', 'Ticker'):
        if column in table.columns:
            return table[column].tolist()
    return []


# Zero-argument ticker getters, one per index, each bound to the Wikipedia
# article that lists the index constituents. Article names are inserted into
# the URL verbatim, hence the percent-encoded '&' (%26) in the S&P 500 one.
spx_tickers = functools.partial(
        symbols_from_wikipedia_table, 'List_of_S%26P_500_companies')
djia_tickers = functools.partial(
        symbols_from_wikipedia_table, 'Dow_Jones_Industrial_Average')
ndx_tickers = functools.partial(
        symbols_from_wikipedia_table, 'Nasdaq-100')
# id=None: the Russell 1000 table is selected by its class only.
rui_tickers = functools.partial(
        symbols_from_wikipedia_table, 'Russell_1000_Index', id=None)


In [3]:
# @title From BullishBears Website

import functools

import pandas as pd


def table_from_bullishbears(article):
    """
    Returns the first HTML table of a Bullish Bears page as a DataFrame.

    Args:
        article (str): Path of the page relative to
            ``https://bullishbears.com/``.

    Returns:
        pd.DataFrame: The first table that pandas finds on the page.
    """
    page_url = f'https://bullishbears.com/{article}'
    # pandas downloads the page itself and returns one DataFrame per table.
    tables = pd.read_html(page_url)
    return tables[0]


def symbols_from_bullishbears_table(article):
    """
    Returns the stock symbols listed in a Bullish Bears table.

    Only string entries of the "Symbol" column are kept; anything else
    (for example the NaN of an empty cell) is dropped.

    Args:
        article (str): Path of the page relative to the Bullish Bears
            website root.

    Returns:
        list: The string values of the table's "Symbol" column.

    Examples:
        >>> symbols = symbols_from_bullishbears_table('sp500-stocks-list')
        >>> len(symbols)
        503
        >>> symbols = symbols_from_bullishbears_table('dow-jones-stocks-list')
        >>> len(symbols)
        30
        >>> symbols = symbols_from_bullishbears_table('nasdaq-stocks-list')
        >>> len(symbols)
        100
        >>> symbols = symbols_from_bullishbears_table(
        ...     'russell-2000-stocks-list')
        >>> 'SMCI' in symbols
        True
        >>> len(symbols) > 1990
        True
    """
    table = table_from_bullishbears(article)
    return [value for value in table['Symbol'] if isinstance(value, str)]


# Zero-argument getter for the symbols on the Bullish Bears
# 'russell-2000-stocks-list' page.
rut_tickers = functools.partial(
        symbols_from_bullishbears_table, 'russell-2000-stocks-list')

In [4]:
# @title From StatementDog Website

import requests
from bs4 import BeautifulSoup
import pandas as pd


def us_stock_table_from_statementdog():
    """
    Retrieves the stock symbols and company names listed on the StatementDog
    US stock list page.

    The page is downloaded and parsed, and one row is produced for every
    ``<a class="us-stock-company">`` element that contains a
    ``<span class="us-stock-company-ticker">``. Anchors without such a span
    are skipped.

    Returns:
        pd.DataFrame: A DataFrame with the columns 'Symbol' and 'Name'.
            An empty DataFrame (no columns) is returned if the page could
            not be retrieved.

    Example:
        >>> df = us_stock_table_from_statementdog()
        >>> list(df.columns)
        ['Symbol', 'Name']
    """
    url = 'https://statementdog.com/us-stock-list'

    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve the page.")
        return pd.DataFrame()  # Return an empty DataFrame on failure

    soup = BeautifulSoup(response.content, 'html.parser')

    symbols = []
    names = []
    for a_tag in soup.find_all('a', class_='us-stock-company'):
        ticker_span = a_tag.find('span', class_='us-stock-company-ticker')
        if ticker_span is None:
            # No ticker inside this link: skip it instead of failing with
            # an AttributeError on None.
            continue

        symbols.append(ticker_span.text)
        # The name is whatever follows the last ')' in the link text.
        # NOTE(review): presumably the ticker is rendered in parentheses
        # before the name; confirm against the live markup.
        names.append(a_tag.text.split(')')[-1].strip())

    return pd.DataFrame({
        'Symbol': symbols,
        'Name': names
    })


def w5000_tickers():
    """
    Returns every stock symbol found on the StatementDog US stock list page.

    The list is used here as a stand-in for the constituents of the
    Wilshire 5000 index. Symbols are taken from all
    ``<span class="us-stock-company-ticker">`` elements of the page.

    Returns:
        list: The text of every ticker span on the page, or an empty list
            if the page could not be retrieved.

    Example:
        >>> tickers = w5000_tickers()
        >>> len(tickers) > 5000
        True
        >>> 'AAPL' in tickers
        True
    """
    page = requests.get('https://statementdog.com/us-stock-list')

    # Anything but a plain 200 is treated as a failed download.
    if page.status_code != 200:
        print("Failed to retrieve the page.")
        return []

    document = BeautifulSoup(page.content, 'html.parser')
    ticker_spans = document.find_all('span', class_='us-stock-company-ticker')
    return [span.text for span in ticker_spans]


In [34]:
# @title From MoneyDJ Website
import requests

def symbols_from_moneydj(index_name='DJI'):
    """
    Get a list of symbols from MoneyDJ for a given index.

    Args:
        index_name: str
            The name of the index. Possible values are:
            - DJI: Dow Jones Industrial Average
            - GSPC: S&P 500
            - SOXX: iShares Semiconductor
            - IWB: iShares Russell 1000
            - IWM: iShares Russell 2000
            - IWV: iShares Russell 3000
            - MDY: MidCap 400
            - IJR: iShares Core S&P Small-Cap
            - IYY: iShares U.S. Total Market
            - IYW: iShares U.S. Technology

    Returns:
        list: The 'id' field of every item in the response, or an empty
            list if the request failed or the body was not valid JSON.

    Examples:
        >>> soxx = symbols_from_moneydj('SOXX')
        >>> len(soxx) == 30
        True
        >>> 'NVDA' in soxx
        True
        >>> 'AVGO' in soxx
        True
        >>> 'TSM' in soxx
        True
    """
    url = f'https://www.moneydj.com/us/rest/list0003a2/{index_name}'
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return []
    except ValueError as e:
        # Depending on the requests version, a body that is not JSON is
        # reported as a plain ValueError rather than a RequestException.
        print(f"Request failed: {e}")
        return []

    return [item['id'] for item in data]

# Zero-argument ticker getters backed by MoneyDJ, one per index code.
# The DJI variant stays disabled: enabling it would rebind `djia_tickers`,
# which is already defined from the Wikipedia table in an earlier cell.
#djia_tickers = functools.partial(symbols_from_moneydj, 'DJI')
gspc_tickers = functools.partial(symbols_from_moneydj, 'GSPC')
soxx_tickers = functools.partial(symbols_from_moneydj, 'SOXX')
iwb_tickers = functools.partial(symbols_from_moneydj, 'IWB')
iwm_tickers = functools.partial(symbols_from_moneydj, 'IWM')
iwv_tickers = functools.partial(symbols_from_moneydj, 'IWV')
iyy_tickers = functools.partial(symbols_from_moneydj, 'IYY')
iyw_tickers = functools.partial(symbols_from_moneydj, 'IYW')

In [5]:
# @title Manually keyed stock list

def sox_tickers():
    """
    Return the tickers of the SOX (PHLX Semiconductor) index members.

    The symbols are hard-coded in this function and are not fetched from any
    data source, so the list has to be revised by hand whenever the index
    composition changes.

    Returns:
        list: A new list holding the 30 SOX tickers.

    Examples:
        >>> tickers = sox_tickers()
        >>> len(tickers) == 30
        True
        >>> 'NVDA' in tickers
        True
        >>> 'AVGO' in tickers
        True
        >>> 'TSM' in tickers
        True
    """
    # Whitespace-separated to keep the table compact; split() builds a fresh
    # list on every call.
    return (
        'AMD ADI AMAT ASML AZTA AVGO COHR ENTG GFS '
        'INTC IPGP KLAC LRCX LSCC MRVL MCHP MU MPWR '
        'NOVT NVDA NXPI ON QRVO QCOM SWKS SYNA TSM '
        'TER TXN WOLF'
    ).split()


### Examples

In [20]:
# Print the ticker list produced by each getter. Apart from sox_tickers(),
# which returns a hard-coded list, every call downloads a web page.
print('DJIA:', djia_tickers())
print('NDX:', ndx_tickers())
print('SPX:', spx_tickers())
print('RUI:', rui_tickers())
print('RUT:', rut_tickers())
print('W5000', w5000_tickers())
print('SOX:', sox_tickers())

DJIA: ['MMM', 'AXP', 'AMGN', 'AMZN', 'AAPL', 'BA', 'CAT', 'CVX', 'CSCO', 'KO', 'DIS', 'DOW', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'MCD', 'MRK', 'MSFT', 'NKE', 'PG', 'CRM', 'TRV', 'UNH', 'VZ', 'V', 'WMT']


  o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)


NDX: ['ADBE', 'AMD', 'ABNB', 'GOOGL', 'GOOG', 'AMZN', 'AEP', 'AMGN', 'ADI', 'ANSS', 'AAPL', 'AMAT', 'ARM', 'ASML', 'AZN', 'TEAM', 'ADSK', 'ADP', 'BKR', 'BIIB', 'BKNG', 'AVGO', 'CDNS', 'CDW', 'CHTR', 'CTAS', 'CSCO', 'CCEP', 'CTSH', 'CMCSA', 'CEG', 'CPRT', 'CSGP', 'COST', 'CRWD', 'CSX', 'DDOG', 'DXCM', 'FANG', 'DLTR', 'DASH', 'EA', 'EXC', 'FAST', 'FTNT', 'GEHC', 'GILD', 'GFS', 'HON', 'IDXX', 'ILMN', 'INTC', 'INTU', 'ISRG', 'KDP', 'KLAC', 'KHC', 'LRCX', 'LIN', 'LULU', 'MAR', 'MRVL', 'MELI', 'META', 'MCHP', 'MU', 'MSFT', 'MRNA', 'MDLZ', 'MDB', 'MNST', 'NFLX', 'NVDA', 'NXPI', 'ORLY', 'ODFL', 'ON', 'PCAR', 'PANW', 'PAYX', 'PYPL', 'PDD', 'PEP', 'QCOM', 'REGN', 'ROP', 'ROST', 'SBUX', 'SMCI', 'SNPS', 'TTWO', 'TMUS', 'TSLA', 'TXN', 'TTD', 'VRSK', 'VRTX', 'WBD', 'WDAY', 'XEL', 'ZS']
SPX: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A', 'APD', 'ABNB', 'AKAM', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AEE', 'AAL', 'AEP', 'AXP', 'AIG', 

In [32]:
# Ticker sets used by the comparison cells below.
s5 = set(w5000_tickers())   # StatementDog US stock list
s1 = set(rui_tickers())     # Russell 1000 (Wikipedia)
s2 = set(rut_tickers())     # Russell 2000 (Bullish Bears)
s3 = s1 | s2                # union of the two Russell lists
# Union of the S&P 500, DJIA, Nasdaq-100 and SOX lists.
s4 = set(spx_tickers()) | set(djia_tickers()) | set(ndx_tickers()) | set(sox_tickers())

# MoneyDJ constituent lists, one set per index code.
s_soxx = set(soxx_tickers())
s_iwb = set(iwb_tickers())
s_iwm = set(iwm_tickers())
s_iwv = set(iwv_tickers())
s_iyy = set(iyy_tickers())
s_iyw = set(iyw_tickers())

In [49]:
# Tickers in the SPX/DJIA/NDX/SOX union that appear in neither Russell list.
s4 - s3

{'ARM', 'ASML', 'AZN', 'CCEP', 'MELI', 'NXPI', 'PDD', 'STX', 'TEL', 'TSM'}

In [33]:
# Compare the size of the combined Russell 1000 + Russell 2000 set with the
# sizes of the MoneyDJ lists.
print(len(s3))
print(len(s_soxx))
print(len(s_iwv))
# Sum of the two set sizes, not the size of their union.
print(len(s_iwm) + len(s_iwb))
print(len(s_iyy))

2917
30
2654
2968
1074


In [35]:
# Download each source table and store it as a CSV file through the shared
# `github` client. File names follow the pattern <index>_<source>.csv.

# Wikipedia tables
df = table_from_wikipedia('List_of_S%26P_500_companies')
github.upload_df_as_csv('sp500_Wikipedia.csv', df)
df = table_from_wikipedia('Dow_Jones_Industrial_Average')
github.upload_df_as_csv('djia_Wikipedia.csv', df)
df = table_from_wikipedia('Nasdaq-100')
github.upload_df_as_csv('ndx_Wikipedia.csv', df)
df = table_from_wikipedia('Russell_1000_Index', id=None)
github.upload_df_as_csv('rui_Wikipedia.csv', df)

# Bullish Bears tables
df = table_from_bullishbears('sp500-stocks-list')
github.upload_df_as_csv('sp500_BullishBears.csv', df)
df = table_from_bullishbears('dow-jones-stocks-list')
github.upload_df_as_csv('djia_BullishBears.csv', df)
df = table_from_bullishbears('nasdaq-stocks-list')
github.upload_df_as_csv('ndx_BullishBears.csv', df)
df = table_from_bullishbears('russell-2000-stocks-list')
# NOTE(review): this stores the Russell 2000 table under the 'rui' prefix,
# which the Wikipedia upload above uses for the Russell 1000. It looks like
# 'rut_BullishBears.csv' was intended -- confirm that no consumer depends on
# the current file name before renaming.
github.upload_df_as_csv('rui_BullishBears.csv', df)

# StatementDog US stock list
df = us_stock_table_from_statementdog()
github.upload_df_as_csv('us_listed_stocks_StatementDog.csv', df)