# Importing Packages

In [1]:
# For data manipulation
import pandas as pd
import numpy as np

# To save files
import pickle

# For historical stock price data
import yfinance as yf

# Getting Historical Stock Prices

In [2]:
def get_stock_price_data(ticker: str, start_date: str="2015-01-01", end_date: str="2021-11-30") -> pd.DataFrame:
    """
    Downloading stock prices from start_date to end_date for a given ticker, 
    and performing general preprocessing on the resulting dataframe. 
    
    INPUTS:
        :ticker (str): The ticker of the company/market index for which data needs to be downloaded.
        :start_date (str): Start date format: 'YYYY-MM-DD'. Passed to yf.download as `start`.
        :end_date (str): End date format: 'YYYY-MM-DD'. Passed to yf.download as `end`.
    
    OUTPUTS:
        :df (pd.DataFrame): Single-column dataframe ("Adj_Close") at business-day frequency,
                            holding the adjusted closing prices of `ticker`.
    """
    # Downloading data for the requested ticker.
    # (This call used to be hard-coded to "FB", so every ticker returned the same prices.)
    # NOTE(review): the steps below expect a flat "Adj Close" column in the download result;
    # whether yfinance returns it may depend on the installed version - confirm.
    df = yf.download(ticker, start=start_date, end=end_date)
    # Changing frequency of observations to Business days.
    # Business days missing from the download become all-NaN rows.
    df = df.asfreq("B")
    # Only keeping the adjusted closing prices for each day
    df = df[["Adj Close"]]
    # Renaming the column to remove the space
    df = df.rename({"Adj Close": "Adj_Close"}, axis=1)
    # Filling NA values with a rolling mean of window 2.
    # With min_periods=1 the window at a missing row only contains the previous row,
    # so a gap receives the preceding price; a gap directly after another gap stays NaN.
    df = df.fillna(df.rolling(2, min_periods=1).mean())
    # Returning the results
    return df

In [3]:
# Shared date window for every download
start_date = "2015-01-01"
end_date = "2021-11-30"

# One price frame per ticker, downloaded in the order the tickers are listed.
# NOTE(review): Facebook's ticker was later renamed (FB -> META); confirm "FB" still resolves.
fb_df, aapl_df, amzn_df, nflx_df, goog_df = (
    get_stock_price_data(ticker=symbol, start_date=start_date, end_date=end_date)
    for symbol in ("FB", "AAPL", "AMZN", "NFLX", "GOOG")
)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


# Loading BOW and TFIDF dictionaries

In [4]:
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE: pickle can execute arbitrary code while loading - only use files you trust.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# BOW sentiment dictionaries (10K, then 10Q)
bow_ten_ks = _load_pickle('bow_ten_ks.pickle')
bow_ten_qs = _load_pickle('bow_ten_qs.pickle')

# TFIDF sentiment dictionaries (10K, then 10Q)
tfidf_ten_ks = _load_pickle('tfidf_ten_ks.pickle')
tfidf_ten_qs = _load_pickle('tfidf_ten_qs.pickle')

# Combining BOW and TFIDF with Historical Data

In [5]:
def combining_sentiments(ticker_df: pd.DataFrame, ticker_sentiment: dict, sentiment_type: str) -> pd.DataFrame:
    """
    Adding sentiment scores from BOW or TFIDF dictionaries generated using the 10Q and 10K forms.
    
    The scores are written into ticker_df itself (it is modified in place) and the same
    object is returned, so pass a copy if the original prices must stay untouched.
    
    INPUTS:
        :ticker_df (pd.DataFrame): The historical stock prices for the ticker in question,
                                   indexed by date.
        :ticker_sentiment (dict): The sentiment dictionary for the ticker in question.
                                  It is converted with pd.DataFrame(...): outer keys become
                                  columns, inner keys are parsed as 'YYYYMMDD' dates.
        :sentiment_type (str): Either 'bow', or 'tfidf'. 
                               Indicates from which dictionary the scores are being added.
                               Used as the '<sentiment_type>:' prefix of the new columns.
        
    OUTPUTS:
        :(pd.DataFrame): ticker_df with all the sentiment scores added. Dates that were not
                         part of its index are inserted as new rows, and the index is then
                         re-sorted so those rows sit in chronological order.
    """
    # Creating a dataframe from the ticker_sentiment dictionary.
    sentiment_df = pd.DataFrame(ticker_sentiment)
    sentiment_df.index = pd.to_datetime(sentiment_df.index, format="%Y%m%d")
    # Naming the columns the way they will appear in ticker_df
    sentiment_df.columns = [sentiment_type + ":" + col for col in sentiment_df.columns]
    
    # Creating sentiment columns if ticker_df does not have them already
    for col_name in sentiment_df.columns:
        if col_name not in ticker_df.columns:
            ticker_df[col_name] = np.nan
    
    # Splitting the sentiment dates into those ticker_df already has and those it lacks
    is_known_date = sentiment_df.index.isin(ticker_df.index)
    known_scores = sentiment_df[is_known_date]
    new_scores = sentiment_df[~is_known_date]
    
    # Adding the scores for existing dates in a single assignment.
    if len(known_scores):
        ticker_df.loc[known_scores.index, known_scores.columns] = known_scores.to_numpy()
    
    # Dates missing from ticker_df (e.g. a date that is not a business day) are added
    # cell by cell, because .loc only enlarges the frame for a single new label.
    for idx, scores in new_scores.iterrows():
        for col_name, score in scores.items():
            ticker_df.loc[idx, col_name] = score
    
    # Enlargement appends rows at the bottom; restoring chronological order so that
    # forward/backward filling done by callers propagates values in the right direction.
    if len(new_scores) and not ticker_df.index.is_monotonic_increasing:
        ticker_df.sort_index(inplace=True)
    
    # Returning the modified ticker_df dataframe.
    return ticker_df

In [6]:
# Copying the historical facebook data so that fb_df itself is left untouched
# (combining_sentiments adds columns to the frame it is given)
fb_sentiment_df = fb_df.copy()

# Adding, in order: BOW 10K, BOW 10Q, TFIDF 10K and TFIDF 10Q sentiment scores.
# The TFIDF passes read from the tfidf_* dictionaries; they previously re-used the
# bow_* dictionaries, which made the 'tfidf:' columns duplicates of the 'bow:' ones.
# NOTE(review): assumes tfidf_ten_ks / tfidf_ten_qs are keyed by 'fb' like the bow_* dictionaries - confirm.
for sentiment_type, sentiment_source in (
    ('bow', bow_ten_ks),
    ('bow', bow_ten_qs),
    ('tfidf', tfidf_ten_ks),
    ('tfidf', tfidf_ten_qs),
):
    fb_sentiment_df = combining_sentiments(
        ticker_df=fb_sentiment_df,
        ticker_sentiment=sentiment_source['fb'],
        sentiment_type=sentiment_type,
    )

# Cleaning all Nulls: carrying the last known value forward, then back-filling
# whatever is still empty at the start (fillna(method=...) is deprecated in pandas).
fb_sentiment_df = fb_sentiment_df.ffill().bfill()

In [7]:
# Copying the historical apple data so that aapl_df itself is left untouched
# (combining_sentiments adds columns to the frame it is given)
aapl_sentiment_df = aapl_df.copy()

# Adding, in order: BOW 10K, BOW 10Q, TFIDF 10K and TFIDF 10Q sentiment scores.
# The TFIDF passes read from the tfidf_* dictionaries; they previously re-used the
# bow_* dictionaries, which made the 'tfidf:' columns duplicates of the 'bow:' ones.
# NOTE(review): assumes tfidf_ten_ks / tfidf_ten_qs are keyed by 'aapl' like the bow_* dictionaries - confirm.
for sentiment_type, sentiment_source in (
    ('bow', bow_ten_ks),
    ('bow', bow_ten_qs),
    ('tfidf', tfidf_ten_ks),
    ('tfidf', tfidf_ten_qs),
):
    aapl_sentiment_df = combining_sentiments(
        ticker_df=aapl_sentiment_df,
        ticker_sentiment=sentiment_source['aapl'],
        sentiment_type=sentiment_type,
    )

# Cleaning all Nulls: carrying the last known value forward, then back-filling
# whatever is still empty at the start (fillna(method=...) is deprecated in pandas).
aapl_sentiment_df = aapl_sentiment_df.ffill().bfill()

In [8]:
# Copying the historical amazon data so that amzn_df itself is left untouched
# (combining_sentiments adds columns to the frame it is given)
amzn_sentiment_df = amzn_df.copy()

# Adding, in order: BOW 10K, BOW 10Q, TFIDF 10K and TFIDF 10Q sentiment scores.
# The TFIDF passes read from the tfidf_* dictionaries; they previously re-used the
# bow_* dictionaries, which made the 'tfidf:' columns duplicates of the 'bow:' ones.
# NOTE(review): assumes tfidf_ten_ks / tfidf_ten_qs are keyed by 'amzn' like the bow_* dictionaries - confirm.
for sentiment_type, sentiment_source in (
    ('bow', bow_ten_ks),
    ('bow', bow_ten_qs),
    ('tfidf', tfidf_ten_ks),
    ('tfidf', tfidf_ten_qs),
):
    amzn_sentiment_df = combining_sentiments(
        ticker_df=amzn_sentiment_df,
        ticker_sentiment=sentiment_source['amzn'],
        sentiment_type=sentiment_type,
    )

# Cleaning all Nulls: carrying the last known value forward, then back-filling
# whatever is still empty at the start (fillna(method=...) is deprecated in pandas).
amzn_sentiment_df = amzn_sentiment_df.ffill().bfill()

In [9]:
# Copying the historical netflix data so that nflx_df itself is left untouched
# (combining_sentiments adds columns to the frame it is given)
nflx_sentiment_df = nflx_df.copy()

# Adding, in order: BOW 10K, BOW 10Q, TFIDF 10K and TFIDF 10Q sentiment scores.
# The TFIDF passes read from the tfidf_* dictionaries; they previously re-used the
# bow_* dictionaries, which made the 'tfidf:' columns duplicates of the 'bow:' ones.
# NOTE(review): assumes tfidf_ten_ks / tfidf_ten_qs are keyed by 'nflx' like the bow_* dictionaries - confirm.
for sentiment_type, sentiment_source in (
    ('bow', bow_ten_ks),
    ('bow', bow_ten_qs),
    ('tfidf', tfidf_ten_ks),
    ('tfidf', tfidf_ten_qs),
):
    nflx_sentiment_df = combining_sentiments(
        ticker_df=nflx_sentiment_df,
        ticker_sentiment=sentiment_source['nflx'],
        sentiment_type=sentiment_type,
    )

# Cleaning all Nulls: carrying the last known value forward, then back-filling
# whatever is still empty at the start (fillna(method=...) is deprecated in pandas).
nflx_sentiment_df = nflx_sentiment_df.ffill().bfill()

In [10]:
# Copying the historical google data so that goog_df itself is left untouched
# (combining_sentiments adds columns to the frame it is given)
goog_sentiment_df = goog_df.copy()

# Adding, in order: BOW 10K, BOW 10Q, TFIDF 10K and TFIDF 10Q sentiment scores.
# The TFIDF passes read from the tfidf_* dictionaries; they previously re-used the
# bow_* dictionaries, which made the 'tfidf:' columns duplicates of the 'bow:' ones.
# NOTE(review): assumes tfidf_ten_ks / tfidf_ten_qs are keyed by 'goog' like the bow_* dictionaries - confirm.
for sentiment_type, sentiment_source in (
    ('bow', bow_ten_ks),
    ('bow', bow_ten_qs),
    ('tfidf', tfidf_ten_ks),
    ('tfidf', tfidf_ten_qs),
):
    goog_sentiment_df = combining_sentiments(
        ticker_df=goog_sentiment_df,
        ticker_sentiment=sentiment_source['goog'],
        sentiment_type=sentiment_type,
    )

# Cleaning all Nulls: carrying the last known value forward, then back-filling
# whatever is still empty at the start (fillna(method=...) is deprecated in pandas).
goog_sentiment_df = goog_sentiment_df.ffill().bfill()

# Saving all the modified dataframes

In [11]:
# Writing each per-ticker sentiment frame to its own CSV file, date index included
for sentiment_frame, csv_path in (
    (fb_sentiment_df, "fb_sentiment_df.csv"),
    (aapl_sentiment_df, "aapl_sentiment_df.csv"),
    (amzn_sentiment_df, "amzn_sentiment_df.csv"),
    (nflx_sentiment_df, "nflx_sentiment_df.csv"),
    (goog_sentiment_df, "goog_sentiment_df.csv"),
):
    sentiment_frame.to_csv(csv_path, index=True)