<a href="https://colab.research.google.com/github/themoonwalker1/quantcap-options/blob/main/options_data_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Quant Cap Options Trading 23/24

In [27]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import httpx
import csv
import io

In [29]:
class GatherData():
    """Fetches raw options, sentiment, and reference datasets as pandas DataFrames.

    Only get_historical_data and get_sentiment_data are implemented; the
    remaining getters are placeholders that currently return None.

    The network calls rely on the module-level ``httpx`` and ``requests``
    imports.
    """

    # Base URL of the ThetaData REST service (served on localhost).
    THETADATA_BASE_URL = "http://127.0.0.1:25510/v2"

    # Query used by get_historical_data when the caller supplies no
    # parameters. It is the sample request that used to be hard-coded in the
    # method body (and that silently replaced whatever the caller passed).
    EXAMPLE_HISTORICAL_PARAMS = {
        "root": "AAPL",
        "exp": "20250117",
        "right": "C",
        "strike": "225000",
        "start_date": "20241107",
        "end_date": "20241112",
        "use_csv": "true",
        "ivl": "60000",
    }

    # Columns that may identify a row across the ThetaData responses; only
    # those present in BOTH frames are used as join keys.
    # NOTE(review): the contract column names are an assumption about the
    # response schema; only `date` is confirmed by the sample EOD output.
    _JOIN_KEY_CANDIDATES = ("root", "expiration", "strike", "right", "date")

    # StockGeist API token. Leave empty to read the STOCKGEIST_API_KEY
    # environment variable instead of committing a secret to the notebook.
    STOCKGEIST_API_KEY = ""

    # Weights of emotional / non-emotional messages in the message score.
    EM_WEIGHT = 1.5
    NEM_WEIGHT = 1.0

    # Weights of the message and article scores in the blended score.
    MESSAGE_WEIGHT = 0.3
    ARTICLE_WEIGHT = 0.7

    # Numeric value assigned to each article title sentiment label.
    _TITLE_SENTIMENT_SCORES = {"neutral": 0, "positive": 1, "negative": -1}

    def get_historical_data(self, params):
        """Pull EOD option, EOD Greeks, and open-interest data from ThetaData.

        Queries the historical option EOD, bulk historical Greeks EOD, and
        bulk open-interest endpoints with the same query parameters, follows
        pagination on each, and left-joins the Greeks and open-interest
        frames onto the EOD frame.

        Args:
            params: A dictionary of query parameters. It is not mutated.
                When None or empty, EXAMPLE_HISTORICAL_PARAMS is used.
                ``use_csv`` is always forced to "true" because the responses
                are parsed as CSV.

        Returns:
            df: A pandas DataFrame containing every EOD row, plus the Greeks
            and open-interest columns wherever a join key is shared. Columns
            whose names collide with an EOD column get the suffix
            ``_greeks`` or ``_oi``.

        Raises:
            HTTPError: If an API request fails (httpx.HTTPStatusError).
            ValueError: If a response cannot be parsed as CSV
                (pandas.errors.ParserError is a ValueError).
        """
        # Copy so that neither the caller's dict nor the class-level sample
        # is modified when use_csv is forced below.
        query = dict(params) if params else dict(self.EXAMPLE_HISTORICAL_PARAMS)
        query["use_csv"] = "true"

        base_url = self.THETADATA_BASE_URL
        df_eod = self._fetch_paged_csv(base_url + "/hist/option/eod", query)
        df_greeks = self._fetch_paged_csv(
            base_url + "/bulk_hist/option/eod_greeks", query
        )
        df_oi = self._fetch_paged_csv(
            base_url + "/bulk_hist/option/open_interest", query
        )

        combined = self._merge_on_shared_keys(df_eod, df_greeks, "_greeks")
        return self._merge_on_shared_keys(combined, df_oi, "_oi")

    @staticmethod
    def _fetch_paged_csv(url, params):
        """Download every page of a CSV endpoint and parse it into one DataFrame."""
        lines = []
        next_url = url
        next_params = params
        while next_url is not None:
            with httpx.stream("GET", next_url, params=next_params) as response:
                response.raise_for_status()
                page_lines = [line for line in response.iter_lines() if line]
                next_page = response.headers.get("Next-Page")

            # Guard against a page repeating the CSV header row, which would
            # otherwise be parsed as a data row.
            if lines and page_lines and page_lines[0] == lines[0]:
                page_lines = page_lines[1:]
            lines.extend(page_lines)

            if next_page and next_page != "null":
                # The Next-Page URL identifies the follow-up page by itself,
                # so the query parameters are only sent with the first request.
                next_url = next_page
                next_params = None
            else:
                next_url = None

        if not lines:
            # read_csv raises on empty input; an empty frame is easier to join.
            return pd.DataFrame()
        return pd.read_csv(io.StringIO("\n".join(lines)))

    @classmethod
    def _merge_on_shared_keys(cls, left, right, suffix):
        """Left-join `right` onto `left` using the key columns both frames have."""
        if right.empty:
            return left
        keys = [
            column
            for column in cls._JOIN_KEY_CANDIDATES
            if column in left.columns and column in right.columns
        ]
        if not keys:
            # Nothing to align the rows on; keep the left frame untouched.
            return left
        # NOTE(review): if `right` holds several contracts per key (bulk
        # endpoints) while `left` has no contract columns, rows of `left`
        # are repeated once per match — confirm against real responses.
        return left.merge(right, on=keys, how="left", suffixes=("", suffix))

    def get_sentiment_data(
        self,
        symbols: list[str],
        start_date: str,
        end_date: str,
        interval: str = "1d",
    ) -> pd.DataFrame:
        """Fetch sentiment data from the StockGeist API and return it as a DataFrame.

        Requests message metrics and article metrics for US stocks, reduces
        each to one score per (symbol, date), and blends the two scores.

        Args:
            symbols: A list of strings representing the tickers to fetch
                sentiment data for, e.g. ["AAPL", "MSFT", "GOOGL"].
            start_date: A string representing the start date for the sentiment data.
            end_date: A string representing the end date for the sentiment data.
            interval: A string representing the interval for the sentiment data.

        Returns:
            DataFrame: A pandas DataFrame indexed by (symbol, date) with the
            columns ``message_sentiment``, ``article_sentiment`` and
            ``weighted_sentiment``. A score that is missing on one side
            leaves ``weighted_sentiment`` as NaN for that row.

        Raises:
            HTTPError: If the API request fails (requests.HTTPError).
            ValueError: If the response data is not in the expected format.
        """
        import os

        token = self.STOCKGEIST_API_KEY or os.environ.get("STOCKGEIST_API_KEY", "")
        headers = {"token": token}

        # API endpoint
        base_url = "https://api.stockgeist.ai"
        asset_class = "stocks"  # or "crypto"
        location = "us"  # or "global"

        # Messages sentiment data
        messages_url = f"{base_url}/{asset_class}/{location}/hist/message-metrics"
        messages_params = {
            "symbols": ",".join(symbols),
            "start": start_date,
            "end": end_date,
            "timeframe": interval,
        }
        response = requests.get(messages_url, headers=headers, params=messages_params)
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Failed to fetch messages sentiment data: {response.status_code}"
            )
        messages_data = self._require_symbol_mapping(
            response.json().get("data", {}), "messages"
        )
        messages_df = self._build_messages_frame(messages_data, symbols)

        # Articles sentiment data
        articles_url = f"{base_url}/{asset_class}/{location}/hist/article-metrics"
        articles_params = {
            "symbols": ",".join(symbols),
            "start": start_date,
            "end": end_date,
            "timeframe": interval,
            "max_symbol_articles": 200,
            "sort_by": "timestamp",
        }
        response = requests.get(articles_url, headers=headers, params=articles_params)
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Failed to fetch articles sentiment data: {response.status_code}"
            )
        articles_data = self._require_symbol_mapping(
            response.json().get("data", {}), "articles"
        )
        articles_df = self._build_articles_frame(articles_data, symbols)

        return self._combine_sentiment(messages_df, articles_df)

    @staticmethod
    def _require_symbol_mapping(data, source):
        """Return `data` as a symbol -> records mapping, or raise ValueError."""
        if not data:
            return {}
        if not isinstance(data, dict):
            raise ValueError(
                f"Unexpected {source} sentiment payload: expected a mapping of "
                f"symbol to records, got {type(data).__name__}"
            )
        return data

    @classmethod
    def _message_sentiment(cls, day_data):
        """Score one message-metrics record.

        The score is the weighted share of non-neutral (positive + negative)
        messages among emotional and non-emotional messages, so it ranges
        from 0 to EM_WEIGHT + NEM_WEIGHT. A zero total contributes 0 instead
        of dividing by zero.
        """
        # NOTE(review): an earlier comment described this score as
        # "normalized: 0 to 1", which the formula does not produce; divide by
        # (EM_WEIGHT + NEM_WEIGHT) if that range is what is wanted.
        polar_em = (day_data.get("pos_em_count", 0) or 0) + (
            day_data.get("neg_em_count", 0) or 0
        )
        polar_nem = (day_data.get("pos_nem_count", 0) or 0) + (
            day_data.get("neg_nem_count", 0) or 0
        )
        em_total = day_data.get("em_total_count", 0) or 0
        nem_total = day_data.get("nem_total_count", 0) or 0

        emotional = cls.EM_WEIGHT * polar_em / em_total if em_total else 0.0
        non_emotional = cls.NEM_WEIGHT * polar_nem / nem_total if nem_total else 0.0
        return emotional + non_emotional

    @classmethod
    def _build_messages_frame(cls, data, symbols):
        """Build one (symbol, date, message_sentiment) row per message record."""
        today = datetime.now().strftime("%Y-%m-%d")
        records = [
            {
                "symbol": symbol,
                # Records without a date are filed under today's date.
                "date": day_data.get("date", today),
                "message_sentiment": cls._message_sentiment(day_data),
            }
            for symbol in symbols
            for day_data in data.get(symbol) or []
        ]
        return pd.DataFrame(records, columns=["symbol", "date", "message_sentiment"])

    @classmethod
    def _build_articles_frame(cls, data, symbols):
        """Sum title_sentiment * mentions over the articles of each (symbol, date)."""
        today = datetime.now().strftime("%Y-%m-%d")
        totals = {}
        for symbol in symbols:
            for article in data.get(symbol) or []:
                # title, summary, original_url, img_url and sentiment_spans
                # are deliberately ignored.
                # NOTE(review): the article "timestamp" must use the same
                # format as the message "date" for the later join to align
                # — confirm against the API responses.
                date = article.get("timestamp", today)
                label = article.get("title_sentiment", "neutral")
                score = cls._TITLE_SENTIMENT_SCORES.get(label, 0) * (
                    article.get("mentions", 0) or 0
                )
                key = (symbol, date)
                totals[key] = totals.get(key, 0) + score

        records = [
            {"symbol": symbol, "date": date, "article_sentiment": total}
            for (symbol, date), total in totals.items()
        ]
        return pd.DataFrame(records, columns=["symbol", "date", "article_sentiment"])

    @classmethod
    def _combine_sentiment(cls, messages_df, articles_df):
        """Outer-join both score frames and add the weighted blend."""
        sentiment_df = pd.merge(
            messages_df, articles_df, on=["symbol", "date"], how="outer"
        )
        sentiment_df.set_index(["symbol", "date"], inplace=True)

        # Cast so that an empty or all-missing (object dtype) column still
        # supports arithmetic.
        message_part = sentiment_df["message_sentiment"].astype(float)
        article_part = sentiment_df["article_sentiment"].astype(float)
        sentiment_df["weighted_sentiment"] = (
            cls.MESSAGE_WEIGHT * message_part + cls.ARTICLE_WEIGHT * article_part
        )
        return sentiment_df

    def get_fundamental_data(self):
        """Pull data from Tiingo and return a dataframe.

        Not implemented yet; currently returns None.
        """
        pass

    def get_liquidity_data(self):
        """Pull data from NASDAQ and return a dataframe.

        Not implemented yet; currently returns None.
        """
        pass

    def get_earnings_data(self):
        """Pull data from Yahoo Calendar and return a dataframe.

        Not implemented yet; currently returns None.
        """
        pass

    def get_institutional_data(self):
        """Pull data from the 13F-Form dataset and return a dataframe.

        Not implemented yet; currently returns None.
        """
        pass

    def json_to_dtf(self):
        """Parse a response and construct and return a dataframe.

        Not implemented yet; currently returns None. The method does not yet
        take the response as a parameter.
        """
        pass

   ms_of_day  ms_of_day2   open   high   low  close  volume  count  bid_size  \
0   62488618    57597806   9.35  10.50  9.13  10.30    2215    432         1   
1   62261982    57572197  10.20  10.95  9.50   9.96     640    198         7   
2   62559535    57580493   8.68   8.95  6.95   8.35    4426    682        11   
3   62498880    57574993   8.55   8.95  7.85   8.20    1211    423        10   

   bid_exchange   bid  bid_condition  ask_size  ask_exchange    ask  \
0             1  9.40             50        32             7  10.50   
1            47  9.90             50        97             4  10.05   
2            76  8.30             50        70            76   8.40   
3            60  8.15             50        90            76   8.30   

   ask_condition      date  
0             50  20241107  
1             50  20241108  
2             50  20241111  
3             50  20241112  


In [24]:


class ProcessData():
    """Placeholder interface for the dataset preparation steps.

    None of the methods is implemented yet; each one currently returns None.
    """

    def join_all_datasets(self, arr, methods):
        """Join all of the datasets in an array of dataframes.

        Args:
            arr: An array of dataframes to join.
            methods: A string array naming the join method to use.
                Methods include full, left, right, inner.
        """
        pass

    def handle_missing_values(self, approach, dataset):
        """Handle missing NaN values in a dataset.

        Args:
            approach: A string naming the approach to apply. Approaches
                allowed include one-hot encoding, dropping all rows with
                NaN, etc.
            dataset: The dataset to clean.
        """
        pass

    def normalize_features(self, x, approach):
        """Normalize the features of a dataframe using sklearn methods.

        Args:
            x: The dataframe whose features are normalized.
            approach: A string naming the normalization approach.
        """
        pass

    def select_features(self, dataset, feature_list):
        """Select target features of a dataset.

        Args:
            dataset: The dataset to select from.
            feature_list: An array listing the features to select.
        """
        pass

    def split_data(self, dataset, column, test_size, val_size):
        """Split data into training, validation, and testing sets.

        Args:
            dataset: The dataset to split.
            column: Presumably the target column to split on — TODO confirm.
            test_size: Size of the testing split.
            val_size: Size of the validation split.
        """
        pass

    def handle_outliers(self, dataset, threshold):
        """Delete rows of data containing outliers above a certain threshold.

        Args:
            dataset: The dataset to filter.
            threshold: The outlier threshold above which rows are deleted.
        """
        pass

In [None]:
# Smoke-test cell: emit two lines to confirm the kernel is executing code.
for message in ("testing", "hi"):
    print(message)

testing
hi


In [None]:
# NOTE(review): this cell looks like a docstring-format reference rather than
# working code: no statements follow the docstring in the visible source, and
# `smalltable`, `Sequence`, and `Mapping` are not imported in the visible part
# of this file — confirm they are in scope before executing this cell.
def fetch_smalltable_rows(
    table_handle: smalltable.Table,
    keys: Sequence[bytes | str],
    require_all_keys: bool = False,
) -> Mapping[bytes, tuple[str, ...]]:
    """Fetches rows from a Smalltable.

    Retrieves rows pertaining to the given keys from the Table instance
    represented by table_handle.  String keys will be UTF-8 encoded.

    Args:
        table_handle: An open smalltable.Table instance.
        keys: A sequence of bytes or str values, each the key of a table
          row to fetch.  String keys will be UTF-8 encoded.
        require_all_keys: If True only rows with values set for all keys will be
          returned.

    Returns:
        A dict mapping keys to the corresponding table row data
        fetched. Each row is represented as a tuple of strings. For
        example:

        {b'Serak': ('Rigel VII', 'Preparer'),
         b'Zim': ('Irk', 'Invader'),
         b'Lrrr': ('Omicron Persei 8', 'Emperor')}

        Returned keys are always bytes.  If a key from the keys argument is
        missing from the dictionary, then that row was not found in the
        table (and require_all_keys must have been False).

    Raises:
        IOError: An error occurred accessing the smalltable.
    """