<a href="https://colab.research.google.com/github/themoonwalker1/quantcap-options/blob/main/options_data_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [87]:
# Quant Cap Options Trading 23/24

In [91]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import httpx
import csv
import io
import yfinance as yf
from dateutil.relativedelta import relativedelta

In [89]:
class GatherData():
    """Fetches raw datasets (option EOD, sentiment, prices, earnings) as pandas DataFrames."""

    def get_historical_data(self, params):
        """ Pulls EOD Options data from the ThetaData API and returns it as a DataFrame

        Only the historical option EOD endpoint is queried at the moment; the
        historical Greeks EOD and Open Interest endpoints are still to be added.

        Args:
            params: A dictionary of query parameters (e.g. root, exp, right,
                strike, start_date, end_date). ``use_csv`` defaults to "true"
                because the response is parsed as CSV. When ``params`` is
                empty or None, a built-in AAPL sample query is used.

        Returns:
            df: A Pandas DataFrame parsed from the CSV lines of every page
                of the historical option EOD response.

        Raises:
            HTTPError: If the API request fails (httpx.HTTPStatusError).
            ValueError: If the response cannot be parsed as CSV, e.g. when it
                is empty (pandas.errors.EmptyDataError).
        """

        BASE_URL = "http://127.0.0.1:25510/v2"

        # Sample query, used only when the caller supplies no parameters.
        SAMPLE_QUERY = {
            "root": "AAPL",
            "exp": "20250117",
            "right": "C",
            "strike": "225000",
            "start_date": "20241107",
            "end_date": "20241112",
            "ivl": "60000",
        }

        # Honour the caller's parameters; the parsing below needs CSV output,
        # so request it unless the caller explicitly says otherwise.
        query = {"use_csv": "true", **(params or SAMPLE_QUERY)}

        # Historical Data Options EOD
        url = BASE_URL + '/hist/option/eod'

        csv_lines = []
        while url is not None:
            with httpx.stream("GET", url, params=query) as response:
                response.raise_for_status()
                page_lines = [line for line in response.iter_lines() if line]

            # A follow-up page may repeat the CSV header; keep only the first.
            if csv_lines and page_lines and page_lines[0] == csv_lines[0]:
                page_lines = page_lines[1:]
            csv_lines.extend(page_lines)

            next_page = response.headers.get('Next-Page')
            if next_page and next_page != "null":
                url = next_page
                # The Next-Page URL is used as-is; do not re-send the query.
                query = None
            else:
                url = None

        df_eod = pd.read_csv(io.StringIO("\n".join(csv_lines)))

        # TODO: Historical Data Options Greeks (endpoint not implemented yet)
        # TODO: Historical Data Options Open Interest (endpoint not implemented yet)

        return df_eod

    @staticmethod
    def _load_snapshot(path):
        """Return the "data" mapping of a local JSON snapshot, closing the file afterwards."""
        with open(path, encoding="utf-8") as handle:
            return json.load(handle).get("data", {})

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero or missing."""
        return numerator / denominator if denominator else 0.0

    def get_sentiment_data(self, symbols: list[str], start_date: str, end_date: str, interval: str = "1d") -> pd.DataFrame:
        """
        Build a per-symbol, per-day sentiment DataFrame from StockGeist-style data.

        The data is currently read from the local snapshot files
        ``messages.json`` and ``articles.json`` in the working directory; the
        live StockGeist request is not wired in yet, so ``start_date``,
        ``end_date`` and ``interval`` are accepted but not applied.
        Reference: https://docs.stockgeist.ai/

        Args:
            symbols: A list of strings representing the tickers to fetch sentiment data for.
                eg: ["AAPL", "MSFT", "GOOGL"]
            start_date: A string representing the start date for the sentiment data.
            end_date: A string representing the end date for the sentiment data.
            interval: A string representing the interval for the sentiment data.
        Returns:
            DataFrame: A pandas DataFrame with the columns ``symbol``, ``date``,
            ``message_sentiment``, ``article_sentiment`` and
            ``weighted_sentiment``. Rows come from an outer merge, so a
            symbol/date present in only one source has NaN for the other
            source and for ``weighted_sentiment``.

        Raises:
            FileNotFoundError: If a snapshot file is missing.
        """

        # constants
        EM_WEIGHT = 1.5  # weight of emotional sentiment
        NEM_WEIGHT = 1.0  # weight of non-emotional sentiment

        MESSAGE_WEIGHT = 0.3
        ARTICLE_WEIGHT = 0.7

        TITLE_SCORES = {"neutral": 0, "positive": 1, "negative": -1}

        # NOTE: the live request (GET https://api.stockgeist.ai/stock/us/hist/message-metrics
        # with symbols/start/end/timeframe and a "token" header) is disabled;
        # when enabling it, load the token from configuration rather than
        # hard-coding it in the source.
        messages_data = self._load_snapshot("messages.json")

        message_rows = []
        for symbol in symbols:
            for day_data in messages_data.get(symbol, []):
                day = datetime.fromisoformat(day_data.get("timestamp")).date()  # date only
                em_polar = day_data.get("pos_em_count", 0) + day_data.get("neg_em_count", 0)
                nem_polar = day_data.get("pos_nem_count", 0) + day_data.get("neg_nem_count", 0)

                # Custom formula: weighted share of polar (non-neutral)
                # messages. A day without messages of a kind contributes 0
                # instead of dividing by zero.
                message_sentiment = (
                    EM_WEIGHT * self._safe_ratio(em_polar, day_data.get("em_total_count", 0))
                    + NEM_WEIGHT * self._safe_ratio(nem_polar, day_data.get("nem_total_count", 0))
                )
                message_rows.append([symbol, day, message_sentiment])

        messages_df = pd.DataFrame(message_rows, columns=["symbol", "date", "message_sentiment"])

        # NOTE: the live request (…/hist/article-metrics, same parameters plus
        # max_symbol_articles=200 and sort_by="timestamp") is disabled as well.
        articles_data = self._load_snapshot("articles.json")

        article_rows = []
        for symbol in symbols:
            for day_data in articles_data.get(symbol, []):
                day = datetime.fromisoformat(day_data.get("timestamp")).date()  # date only
                title_score = TITLE_SCORES.get(day_data.get("title_sentiment", "neutral"), 0)
                # Custom formula: signed title sentiment scaled by mentions.
                article_rows.append([symbol, day, title_score * day_data.get("mentions", 0)])

        # Several articles can share a symbol/date; sum them so the pair is unique.
        articles_df = (
            pd.DataFrame(article_rows, columns=["symbol", "date", "article_sentiment"])
            .groupby(['symbol', 'date'], as_index=False)['article_sentiment']
            .sum()
        )

        # combine messages and articles dataframes
        sentiment_df = pd.merge(messages_df, articles_df, on=["symbol", "date"], how="outer")

        # calculate weighted sentiment
        sentiment_df["weighted_sentiment"] = (
            MESSAGE_WEIGHT * sentiment_df["message_sentiment"]
            + ARTICLE_WEIGHT * sentiment_df["article_sentiment"]
        )

        return sentiment_df

    @staticmethod
    def _weekly_pct_change(prices):
        """Return a date / pct_change DataFrame comparing each close with the close 7 calendar days later."""
        # Index closes by calendar day (first entry wins) for O(1) lookups.
        close_by_day = {}
        for entry in prices:
            close_by_day.setdefault(entry["date"][:10], entry["close"])

        days = []
        changes = []
        for entry in prices:
            day = entry["date"][:10]
            week_out = (datetime.fromisoformat(day) + timedelta(days=7)).date().isoformat()
            later_close = close_by_day.get(week_out)
            if later_close is None:
                # No price exactly one week later (holiday or end of data).
                continue
            days.append(day)
            changes.append((later_close - entry["close"]) / entry["close"] * 100)

        return pd.DataFrame({"date": days, "pct_change": changes})

    def get_fundamental_data(self, ticker: str, total_years: int):
        """
        Pulls historical stock data from Tiingo using its API and
        preprocesses it by getting the percentage stock increase per
        week from the raw EOD close data

        Args:
            ticker: A string with the specified stock ticker
            total_years: How many years back to get the data from the current date.
            Must be at most 30.
        Returns:
            Pandas Dataframe with two columns: ``date`` (a string in the form
            YYYY-MM-DD) and ``pct_change`` (a float, the percentage change of
            the close 7 calendar days later relative to that date's close).
            Dates without a price exactly 7 days later are omitted.
        Raises:
            ValueError if the total_years inputted are greater than 30.
            HTTPError if the Tiingo request fails (requests.HTTPError).
        """
        if total_years > 30:
            raise ValueError("Total years are greater than 30")

        # SECURITY: credential committed to source; move it to configuration
        # and rotate the key.
        TIINGO_API_KEY = '95b8e93dadad1cceda98479bc2420f9a0bb5556a'
        base_url = "https://api.tiingo.com/tiingo/daily"

        end_day = datetime.now().date()
        start_day = end_day - relativedelta(years=total_years)

        response = requests.get(
            f"{base_url}/{ticker.lower()}/prices",
            params={
                "startDate": str(start_day),
                "endDate": str(end_day),
                "token": TIINGO_API_KEY,
            },
            headers={'Content-Type': 'application/json'},
        )
        # Fail loudly instead of trying to index an error payload below.
        response.raise_for_status()

        return self._weekly_pct_change(response.json())

    def get_liquidity_data(self):
        """ Pulls data from NASDAQ and returns a dataframe (not implemented yet) """
        pass

    def get_earnings_data(self, symbols: list[str], start_date: str, end_date: str):
        """
        Fetch earnings data via yfinance and return it as a DataFrame.

        Reads ``Ticker.earnings_dates`` for every symbol and stacks the
        results. ``start_date`` and ``end_date`` are accepted but not applied
        yet: every earnings date yfinance returns is included.
        Reference:  https://pypi.org/project/yfinance/
                    https://ranaroussi.github.io/yfinance/index.html

        Args:
            symbols: A list of strings representing the tickers to fetch earnings data for.
                eg: ["AAPL", "MSFT", "GOOGL"]
            start_date: A string representing the start date for the earnings data.
            end_date: A string representing the end date for the earnings data.

        Returns:
            DataFrame: A pandas DataFrame with the columns ``symbol``,
            ``earnings_date``, ``estimated_eps`` and ``reported_eps``.
            Columns that are entirely NaN are dropped; symbols for which
            yfinance returns no data are skipped.
        """

        columns = ["symbol", "earnings_date", "estimated_eps", "reported_eps"]

        frames = []
        for symbol in symbols:
            raw = yf.Ticker(symbol).earnings_dates
            if raw is None or raw.empty:
                # Nothing to add for this symbol.
                continue

            # Work on copies so the frame returned by yfinance is not mutated.
            frame = raw.drop(columns=["Surprise(%)"], errors="ignore").reset_index(drop=False)
            # Date index plus the first two value columns; ignore any extras.
            frame = frame.iloc[:, :3].copy()
            frame.columns = ["earnings_date", "estimated_eps", "reported_eps"]
            frame["earnings_date"] = frame["earnings_date"].dt.date
            frame["symbol"] = symbol
            frames.append(frame[columns])

        if not frames:
            return pd.DataFrame(columns=columns)

        earnings_df = pd.concat(frames, ignore_index=True)
        return earnings_df.dropna(axis=1, how='all')

    def get_institutional_data(self):
        """ Pulls data from 13F-Form Dataset and returns a dataframe (not implemented yet) """
        pass

    def json_to_dtf(self):
        """ Parses through response and constructs and returns a dataframe (not implemented yet) """
        pass

In [None]:
# Smoke test: run the sentiment and earnings loaders for two tickers and
# print the first rows of each result.

analyzer = GatherData()
sentiment_df = analyzer.get_sentiment_data(
    symbols=["AAPL", "GOOGL"],
    start_date="2024-01-01",
    end_date="2024-01-02",
    interval="1d",
)
print(sentiment_df.head())
earnings_df = analyzer.get_earnings_data(
    symbols=["AAPL", "GOOGL"],
    start_date="2024-01-01",
    end_date="2024-01-02",
)
print(earnings_df.head())


In [24]:


class ProcessData():
    """Preprocessing steps for the gathered datasets.

    Every method is currently an unimplemented placeholder that returns None.
    """

    def join_all_datasets(self, arr, methods):
        """Join all of the datasets in an array of dataframes.

        Args:
            arr: An array of dataframes to join.
            methods: A string array of join methods; methods include
                full, left, right, inner.
        """
        pass

    def handle_missing_values(self, approach, dataset):
        """Handle missing (NaN) values in ``dataset``.

        Args:
            approach: A string naming the approach to use. Approaches allowed
                include one-hot encoding, dropping all rows with NaN, etc.
            dataset: The dataset whose missing values are handled.
        """
        pass

    def normalize_features(self, x, approach):
        """Normalize the features of dataframe ``x`` using sklearn methods.

        Args:
            x: The dataframe whose features are normalized.
            approach: A string naming the normalization approach to use.
        """
        pass

    def select_features(self, dataset, feature_list):
        """Select target features of ``dataset``.

        Args:
            dataset: The dataset to select features from.
            feature_list: An array of the features to select.
        """
        pass

    def split_data(self, dataset, column, test_size, val_size):
        """Split data into training, validation, and testing sets.

        Args:
            dataset: The dataset to split.
            column: Presumably the target column to split on (TODO confirm).
            test_size: Size of the testing split.
            val_size: Size of the validation split.
        """
        pass

    def handle_outliers(self, dataset, threshold):
        """Delete rows of ``dataset`` containing outliers above a certain threshold.

        Args:
            dataset: The dataset to clean.
            threshold: The threshold above which a value counts as an outlier.
        """
        pass