# Importing Packages

In [1]:
# Standard library: saving files and counting helpers
import pickle
from collections import defaultdict, Counter

# For numerical arrays and data frames
import numpy as np
import pandas as pd

# To perform basic text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# For BOW and TFIDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# For tracking progress in loops
from tqdm import tqdm

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/yashv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yashv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yashv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Helper Functions

In [2]:
def lemmatize_words(words: list) -> list:
    """
    Takes a list of words, and returns them after lemmatizing each word.

    Every word is lemmatized as a verb (part-of-speech tag 'v').

    INPUTS:
        :words (list): List (or any iterable) of input words.

    OUTPUTS:
        :(list): List of words after lemmatizing each input word,
                 in the same order as the input.
    """
    # Build the lemmatizer once and reuse it for every word instead of
    # constructing a new instance per word.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, 'v') for word in words]

In [3]:
def get_bag_of_words(sentiment_words: list, docs: list) -> np.ndarray:
    """
    Get Bag-of-words representation for the input documents.

    The vectorizer vocabulary is fixed to `sentiment_words`, so only those
    words are counted; all other tokens in the documents are ignored.

    INPUTS:
        :sentiment_words (list): List of words characterizing a sentiment.
        :docs (list): List of strings. Each string is a different 10K/Q form.

    OUTPUTS:
        :bag_of_words (np.ndarray): shape - (#docs, #sentiment_words).
            Integer count of each sentiment word in each document.
    """
    vec = CountVectorizer(vocabulary=sentiment_words)
    vectors = vec.fit_transform(docs)

    # The sparse result already has one row per document and one column per
    # vocabulary word, so it can be densified in a single call. This also
    # avoids `get_feature_names()`, which newer scikit-learn releases removed
    # in favour of `get_feature_names_out()`.
    return vectors.toarray().astype(int)

In [4]:
def get_tfidf(sentiment_words: list, docs: list) -> np.ndarray:
    """
    Compute the TF-IDF representation of the input documents.

    The vectorizer vocabulary is fixed to `sentiment_words`, so only those
    words contribute columns to the result.

    INPUTS:
        :sentiment_words (list): List of words characterizing a sentiment.
        :docs (list): List of strings. Each string is a different 10K/Q form.

    OUTPUTS:
        :tfidf (np.ndarray): shape - (#docs, #sentiment_words)
    """
    # Fit and transform in one chained expression, then densify.
    return TfidfVectorizer(vocabulary=sentiment_words).fit_transform(docs).toarray()

# Preparing Sentiment Data Frame

In [5]:
# Sentiment categories; each name is used as a column of the sentiment
# data frame and as a key of the per-ticker result dictionaries.
sentiments = [
    "negative",
    "positive",
    "uncertainty",
    "litigious",
    "strong_modal",
    "weak_modal",
    "constraining",
    "complexity",
]

In [6]:
# Load the master dictionary and lower-case every column name.
sentiment_df = pd.read_csv("LoughranMcDonald_MasterDictionary_2020.csv")
sentiment_df.columns = [name.lower() for name in sentiment_df.columns]

# Keep only rows that have a word.
# NOTE(review): presumably some dictionary entries (e.g. a literal "NULL")
# are parsed by pandas as missing values — verify which rows are dropped.
has_word = sentiment_df['word'].notna()
lemmatized_words = lemmatize_words(sentiment_df.loc[has_word, 'word'].str.lower())

# Replace each word by its lower-cased lemma, then keep the first row per lemma.
# NOTE(review): rows of later inflections that share a lemma are discarded
# together with their sentiment flags — confirm this is intended.
sentiment_df = (
    sentiment_df.loc[has_word]
    .assign(word=lemmatized_words)
    .drop_duplicates('word')
)
print(sentiment_df.shape)
sentiment_df.head()

(66289, 17)


Unnamed: 0,word,seq_num,word count,word proportion,average proportion,std dev,doc count,negative,positive,uncertainty,litigious,strong_modal,weak_modal,constraining,complexity,syllables,source
0,aardvark,1,312,1.42205e-08,1.335201e-08,3.700747e-06,96,0,0,0,0,0,0,0,0,2,12of12inf
1,aardvarks,2,3,1.367356e-10,8.882163e-12,9.362849e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,abaci,3,9,4.102067e-10,1.200533e-10,5.359747e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,aback,4,15,6.836779e-10,4.080549e-10,1.406914e-07,14,0,0,0,0,0,0,0,0,2,12of12inf
4,abacus,5,8009,3.650384e-07,3.798698e-07,3.523914e-05,1058,0,0,0,0,0,0,0,0,3,12of12inf


# Loading 10K and 10Q dictionaries 

In [7]:
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# 10-Q forms and their dates
ten_q_forms = _load_pickle('ten_q_forms.pickle')
ten_q_forms_dates = _load_pickle('ten_q_forms_dates.pickle')

# 10-K forms and their dates
ten_k_forms = _load_pickle('ten_k_forms.pickle')
ten_k_forms_dates = _load_pickle('ten_k_forms_dates.pickle')

# Bag-of-Words

## 10Q

In [8]:
# Word list of every sentiment, selected once because it does not depend on
# the ticker being processed.
# NOTE(review): `!= 0` also keeps negative flag values; if the dictionary marks
# words removed from a list with negative numbers, `> 0` may be intended — confirm.
sentiment_vocabularies = {
    sentiment: sentiment_df.loc[sentiment_df[sentiment] != 0, 'word']
    for sentiment in sentiments
}

# ticker -> sentiment -> bag-of-words matrix over that ticker's 10-Q documents
sentiment_bow_ten_qs = {}

for ticker, ten_qs in tqdm(ten_q_forms.items(), total=len(ten_q_forms)):
    # assumes each 10-Q is a sequence of string tokens — TODO confirm
    lemma_docs = [' '.join(ten_q) for ten_q in ten_qs]

    sentiment_bow_ten_qs[ticker] = {
        sentiment: get_bag_of_words(vocabulary, lemma_docs)
        for sentiment, vocabulary in sentiment_vocabularies.items()
    }

100%|█████████████████████████████████████████████| 5/5 [02:49<00:00, 33.87s/it]


In [9]:
def mag_bow(bow):
    """Return the L2 norm of every row of `bow`, divided by its number of columns."""
    return np.linalg.norm(bow, ord=2, axis=1) / bow.shape[1]


# ticker -> sentiment -> date -> scaled bag-of-words magnitude
bow_ten_qs = {}

# The inner dictionary is bound to `bow_by_sentiment` so that the loop does not
# overwrite the global `sentiments` list that other cells iterate over.
for ticker, bow_by_sentiment in tqdm(sentiment_bow_ten_qs.items(),
                                     total=len(sentiment_bow_ten_qs)):
    # assumes the dates are ordered like the rows of each matrix — TODO confirm
    dates = ten_q_forms_dates[ticker]
    bow_ten_qs[ticker] = {}

    for sentiment, bow in bow_by_sentiment.items():
        # One magnitude per document, computed once per matrix rather than per date.
        magnitudes = mag_bow(bow)
        bow_ten_qs[ticker][sentiment] = {
            date: magnitudes[date_idx]
            for date_idx, date in enumerate(dates)
        }

100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 400.49it/s]


## 10K

In [10]:
# Word list of every sentiment, selected once because it does not depend on
# the ticker being processed.
# NOTE(review): `!= 0` also keeps negative flag values; if the dictionary marks
# words removed from a list with negative numbers, `> 0` may be intended — confirm.
sentiment_vocabularies = {
    sentiment: sentiment_df.loc[sentiment_df[sentiment] != 0, 'word']
    for sentiment in sentiments
}

# ticker -> sentiment -> bag-of-words matrix over that ticker's 10-K documents
sentiment_bow_ten_ks = {}

for ticker, ten_ks in tqdm(ten_k_forms.items(), total=len(ten_k_forms)):
    # assumes each 10-K is a sequence of string tokens — TODO confirm
    lemma_docs = [' '.join(ten_k) for ten_k in ten_ks]

    sentiment_bow_ten_ks[ticker] = {
        sentiment: get_bag_of_words(vocabulary, lemma_docs)
        for sentiment, vocabulary in sentiment_vocabularies.items()
    }

100%|█████████████████████████████████████████████| 5/5 [01:45<00:00, 21.09s/it]


In [11]:
def mag_bow(bow):
    """Return the L2 norm of every row of `bow`, divided by its number of columns."""
    return np.linalg.norm(bow, ord=2, axis=1) / bow.shape[1]


# ticker -> sentiment -> date -> scaled bag-of-words magnitude
bow_ten_ks = {}

# The inner dictionary is bound to `bow_by_sentiment` so that the loop does not
# overwrite the global `sentiments` list that other cells iterate over.
for ticker, bow_by_sentiment in tqdm(sentiment_bow_ten_ks.items(),
                                     total=len(sentiment_bow_ten_ks)):
    # assumes the dates are ordered like the rows of each matrix — TODO confirm
    dates = ten_k_forms_dates[ticker]
    bow_ten_ks[ticker] = {}

    for sentiment, bow in bow_by_sentiment.items():
        # One magnitude per document, computed once per matrix rather than per date.
        magnitudes = mag_bow(bow)
        bow_ten_ks[ticker][sentiment] = {
            date: magnitudes[date_idx]
            for date_idx, date in enumerate(dates)
        }

100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 1407.67it/s]


# TF-IDF

## 10Q

In [12]:
# Word list of every sentiment, selected once because it does not depend on
# the ticker being processed.
# NOTE(review): `!= 0` also keeps negative flag values; if the dictionary marks
# words removed from a list with negative numbers, `> 0` may be intended — confirm.
sentiment_vocabularies = {
    sentiment: sentiment_df.loc[sentiment_df[sentiment] != 0, 'word']
    for sentiment in sentiments
}

# ticker -> sentiment -> TF-IDF matrix over that ticker's 10-Q documents
sentiment_tfidf_ten_qs = {}

for ticker, ten_qs in tqdm(ten_q_forms.items(), total=len(ten_q_forms)):
    # assumes each 10-Q is a sequence of string tokens — TODO confirm
    lemma_docs = [' '.join(ten_q) for ten_q in ten_qs]

    sentiment_tfidf_ten_qs[ticker] = {
        sentiment: get_tfidf(vocabulary, lemma_docs)
        for sentiment, vocabulary in sentiment_vocabularies.items()
    }

100%|█████████████████████████████████████████████| 5/5 [02:50<00:00, 34.12s/it]


In [13]:
def mag_tfidf(tfidf):
    """Return the L2 norm of every row of `tfidf`, divided by its number of columns."""
    return np.linalg.norm(tfidf, ord=2, axis=1) / tfidf.shape[1]


# ticker -> sentiment -> date -> scaled TF-IDF magnitude
tfidf_ten_qs = {}

# The inner dictionary is bound to `tfidf_by_sentiment` so that the loop does
# not overwrite the global `sentiments` list that other cells iterate over.
for ticker, tfidf_by_sentiment in tqdm(sentiment_tfidf_ten_qs.items(),
                                       total=len(sentiment_tfidf_ten_qs)):
    # assumes the dates are ordered like the rows of each matrix — TODO confirm
    dates = ten_q_forms_dates[ticker]
    tfidf_ten_qs[ticker] = {}

    for sentiment, tfidf in tfidf_by_sentiment.items():
        # One magnitude per document, computed once per matrix rather than per date.
        magnitudes = mag_tfidf(tfidf)
        tfidf_ten_qs[ticker][sentiment] = {
            date: magnitudes[date_idx]
            for date_idx, date in enumerate(dates)
        }

100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 566.77it/s]


## 10K

In [14]:
# Word list of every sentiment, selected once because it does not depend on
# the ticker being processed.
# NOTE(review): `!= 0` also keeps negative flag values; if the dictionary marks
# words removed from a list with negative numbers, `> 0` may be intended — confirm.
sentiment_vocabularies = {
    sentiment: sentiment_df.loc[sentiment_df[sentiment] != 0, 'word']
    for sentiment in sentiments
}

# ticker -> sentiment -> TF-IDF matrix over that ticker's 10-K documents
sentiment_tfidf_ten_ks = {}

for ticker, ten_ks in tqdm(ten_k_forms.items(), total=len(ten_k_forms)):
    # assumes each 10-K is a sequence of string tokens — TODO confirm
    lemma_docs = [' '.join(ten_k) for ten_k in ten_ks]

    sentiment_tfidf_ten_ks[ticker] = {
        sentiment: get_tfidf(vocabulary, lemma_docs)
        for sentiment, vocabulary in sentiment_vocabularies.items()
    }

100%|█████████████████████████████████████████████| 5/5 [01:46<00:00, 21.29s/it]


In [15]:
def mag_tfidf(tfidf):
    """Return the L2 norm of every row of `tfidf`, divided by its number of columns."""
    return np.linalg.norm(tfidf, ord=2, axis=1) / tfidf.shape[1]


# ticker -> sentiment -> date -> scaled TF-IDF magnitude
tfidf_ten_ks = {}

# The inner dictionary is bound to `tfidf_by_sentiment` so that the loop does
# not overwrite the global `sentiments` list that other cells iterate over.
for ticker, tfidf_by_sentiment in tqdm(sentiment_tfidf_ten_ks.items(),
                                       total=len(sentiment_tfidf_ten_ks)):
    # assumes the dates are ordered like the rows of each matrix — TODO confirm
    dates = ten_k_forms_dates[ticker]
    tfidf_ten_ks[ticker] = {}

    for sentiment, tfidf in tfidf_by_sentiment.items():
        # One magnitude per document, computed once per matrix rather than per date.
        magnitudes = mag_tfidf(tfidf)
        tfidf_ten_ks[ticker][sentiment] = {
            date: magnitudes[date_idx]
            for date_idx, date in enumerate(dates)
        }

100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 1664.54it/s]


# Saving all dictionaries

In [16]:
# Output file -> object to pickle, listed in the order the files are written.
pickle_targets = [
    # Bag-of-words: 10-Q
    ('sentiment_bow_ten_qs.pickle', sentiment_bow_ten_qs),
    ('bow_ten_qs.pickle', bow_ten_qs),
    # Bag-of-words: 10-K
    ('sentiment_bow_ten_ks.pickle', sentiment_bow_ten_ks),
    ('bow_ten_ks.pickle', bow_ten_ks),
    # TF-IDF: 10-Q
    ('sentiment_tfidf_ten_qs.pickle', sentiment_tfidf_ten_qs),
    ('tfidf_ten_qs.pickle', tfidf_ten_qs),
    # TF-IDF: 10-K
    ('sentiment_tfidf_ten_ks.pickle', sentiment_tfidf_ten_ks),
    ('tfidf_ten_ks.pickle', tfidf_ten_ks),
]

# Write every object to its own file using the highest pickle protocol.
for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)