# Project 1 Step 5 Negative Word Proportions
In this notebook, we will do the following:
- Read in Harvard IV Negative Word list Dictionary and the Fin-Neg Dictionary produced by the author of the paper, clean and prepare them
- Read in all the JSON files that contain the word-list dictionaries of the 10-K and 10-Q filings for different tickers from 2011 to 2021
- Convert word lists to negative word proportions using TF-IDF method
- Map stock 4-day buy-and-hold excess return data calculated from the previous step/notebook to these negative word proportions and filing dates
- Plot Figure 1 in the paper

In [1]:
# Mount Google Drive at /content/drive (Colab only); every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings("ignore")

from datetime import date
from tqdm import tqdm
from joblib import Parallel, delayed
import multiprocessing

from bs4 import BeautifulSoup
import re
from pathlib import Path
import json

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer  # for bag-of-words method
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF method

In [3]:
# Every input and output lives under one project folder on the mounted Google Drive.
# All paths are derived from it, so the location only has to be changed in one place.
project_dir = "/content/drive/MyDrive/Mini 5/Natural Language Processing/Project 1/"

# Data folders (these three keep their trailing slash)
data_path = project_dir + "data/"
data_path_10q = data_path + "10Q/"
data_path_10k = data_path + "10K/"

# CSV files stored directly in the project folder
cik_lookup_filename = project_dir + "CIK_lookup_results_cleaned.csv"
sp500_constituents_path = project_dir + "sp500_constituents.csv"
sp500_id_path = project_dir + "sp500_w_addl_id.csv"

index_return_path = project_dir + "Index_Returns.csv"
stock_return_path = project_dir + "Stock_Prices.csv"

# Word lists (input), the two negative word dictionaries, and the output folder
word_list_path = data_path + "WordLists"
H4N_path = project_dir + "Dictionary/Harvard IV_Negative Word List_Inf.txt"
FinNeg_path = project_dir + "Dictionary/Loughran-McDonald_MasterDictionary_1993-2021.csv"
neg_prop_path = data_path + "NegWordProportions"

#### Read in Harvard IV Negative Word List dictionary and Loughran-McDonald Master Dictionary

Read in Harvard IV Negative Word List and turn it into a list of words.

In [4]:
# Read the whole Harvard IV negative word list (.txt) into a single string
harvard_dict = Path(H4N_path).read_text()

In [5]:
# Display the raw text to inspect how the word list is laid out
harvard_dict

'ABANDON\nABANDONED\nABANDONING\nABANDONMENT\nABANDONMENTS\nABANDONS\nABATE\nABATED\nABATEMENT\nABATEMENTS\nABATES\nABATING\nABDICATE\nABDICATED\nABDICATES\nABDICATING\nABDICATION\nABDICATIONS\nABHOR\nABHORRED\nABHORRENCE\nABHORRENCES\nABHORRENT\nABHORRENTLY\nABHORS\nABJECT\nABJECTION\nABJECTIONS\nABJECTLY\nABJECTNESS\nABNORMAL\nABNORMALITIES\nABNORMALITY\nABNORMALLY\nABOLISH\nABOLISHED\nABOLISHES\nABOLISHING\nABOMINABLE\nABRASIVE\nABRASIVENESS\nABRASIVES\nABRUPT\nABRUPTLY\nABRUPTNESS\nABSCOND\nABSCONDED\nABSCONDING\nABSENCE\nABSENCES\nABSENT\nABSENTED\nABSENTEE\nABSENTEEISM\nABSENTEES\nABSENTMINDED\nABSURD\nABSURDITY\nABUSE\nABUSED\nABUSER\nABUSERS\nABUSES\nABUSING\nABUSIVE\nABUSIVELY\nABUSIVENESS\nABYSS\nABYSSAL\nACCIDENT\nACCIDENTAL\nACCIDENTALLY\nACCIDENTS\nACCOST\nACCOSTED\nACCOSTING\nACCOSTS\nACCURSED\nACCUSATION\nACCUSATIONS\nACCUSE\nACCUSED\nACCUSER\nACCUSERS\nACCUSES\nACCUSING\nACHE\nACHED\nACRIMONIOUS\nACRIMONY\nADDICT\nADDICTED\nADDICTING\nADDICTION\nADDICTIONS\nADDICTIVE\nA

Notice that the Harvard dictionary is loaded as a single string, with each word on a separate line.

#### Convert to a list

In [6]:
# Split the text on newlines to get one list element per line of the file.
# NOTE(review): if the file ends with a newline, the last element is an empty string — confirm.
harvard_dict = harvard_dict.split("\n")

In [7]:
# Print the list to confirm the split produced one word per element
print(harvard_dict)

['ABANDON', 'ABANDONED', 'ABANDONING', 'ABANDONMENT', 'ABANDONMENTS', 'ABANDONS', 'ABATE', 'ABATED', 'ABATEMENT', 'ABATEMENTS', 'ABATES', 'ABATING', 'ABDICATE', 'ABDICATED', 'ABDICATES', 'ABDICATING', 'ABDICATION', 'ABDICATIONS', 'ABHOR', 'ABHORRED', 'ABHORRENCE', 'ABHORRENCES', 'ABHORRENT', 'ABHORRENTLY', 'ABHORS', 'ABJECT', 'ABJECTION', 'ABJECTIONS', 'ABJECTLY', 'ABJECTNESS', 'ABNORMAL', 'ABNORMALITIES', 'ABNORMALITY', 'ABNORMALLY', 'ABOLISH', 'ABOLISHED', 'ABOLISHES', 'ABOLISHING', 'ABOMINABLE', 'ABRASIVE', 'ABRASIVENESS', 'ABRASIVES', 'ABRUPT', 'ABRUPTLY', 'ABRUPTNESS', 'ABSCOND', 'ABSCONDED', 'ABSCONDING', 'ABSENCE', 'ABSENCES', 'ABSENT', 'ABSENTED', 'ABSENTEE', 'ABSENTEEISM', 'ABSENTEES', 'ABSENTMINDED', 'ABSURD', 'ABSURDITY', 'ABUSE', 'ABUSED', 'ABUSER', 'ABUSERS', 'ABUSES', 'ABUSING', 'ABUSIVE', 'ABUSIVELY', 'ABUSIVENESS', 'ABYSS', 'ABYSSAL', 'ACCIDENT', 'ACCIDENTAL', 'ACCIDENTALLY', 'ACCIDENTS', 'ACCOST', 'ACCOSTED', 'ACCOSTING', 'ACCOSTS', 'ACCURSED', 'ACCUSATION', 'ACCUSATIO

#### Convert all words to lower case 

In [8]:
# Lower-case every entry of the word list (each element is a str produced by the split above)
harvard_dict = list(map(str.lower, harvard_dict))
print(harvard_dict)

['abandon', 'abandoned', 'abandoning', 'abandonment', 'abandonments', 'abandons', 'abate', 'abated', 'abatement', 'abatements', 'abates', 'abating', 'abdicate', 'abdicated', 'abdicates', 'abdicating', 'abdication', 'abdications', 'abhor', 'abhorred', 'abhorrence', 'abhorrences', 'abhorrent', 'abhorrently', 'abhors', 'abject', 'abjection', 'abjections', 'abjectly', 'abjectness', 'abnormal', 'abnormalities', 'abnormality', 'abnormally', 'abolish', 'abolished', 'abolishes', 'abolishing', 'abominable', 'abrasive', 'abrasiveness', 'abrasives', 'abrupt', 'abruptly', 'abruptness', 'abscond', 'absconded', 'absconding', 'absence', 'absences', 'absent', 'absented', 'absentee', 'absenteeism', 'absentees', 'absentminded', 'absurd', 'absurdity', 'abuse', 'abused', 'abuser', 'abusers', 'abuses', 'abusing', 'abusive', 'abusively', 'abusiveness', 'abyss', 'abyssal', 'accident', 'accidental', 'accidentally', 'accidents', 'accost', 'accosted', 'accosting', 'accosts', 'accursed', 'accusation', 'accusatio

#### Check the length of harvard dictionary

In [9]:
# Number of entries in the Harvard IV negative word list
print(len(harvard_dict))

4188


#### Next we will read in the Loughran-McDonald Financial Master Dictionary. We will remove all unnecessary columns, and turn the relevant negative words into a list.

In [10]:
# Read in the Loughran-McDonald master dictionary (CSV) as a DataFrame;
# the Fin-Neg word list is extracted from its "Negative" column in the cells below
finneg_dict = pd.read_csv(FinNeg_path)
finneg_dict

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Syllables,Source
0,AARDVARK,1,354,1.550080e-08,1.422600e-08,3.815486e-06,99,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.313627e-10,8.653817e-12,9.241714e-09,1,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,9,3.940882e-10,1.169679e-10,5.290465e-08,7,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,29,1.269840e-09,6.654735e-10,1.595100e-07,28,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,8570,3.752595e-07,3.809464e-07,3.529356e-05,1108,0,0,0,0,0,0,0,3,12of12inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86526,ZYGOTE,86529,50,2.189379e-09,8.729336e-10,1.886011e-07,35,0,0,0,0,0,0,0,2,12of12inf
86527,ZYGOTES,86530,1,4.378757e-11,1.809516e-11,1.932446e-08,1,0,0,0,0,0,0,0,2,12of12inf
86528,ZYGOTIC,86531,0,0.000000e+00,0.000000e+00,0.000000e+00,0,0,0,0,0,0,0,0,3,12of12inf
86529,ZYMURGIES,86532,0,0.000000e+00,0.000000e+00,0.000000e+00,0,0,0,0,0,0,0,0,3,12of12inf


In [11]:
# Keep only the word itself and its "Negative" flag; the other columns are not used
finneg_dict = finneg_dict.loc[:, ['Word', 'Negative']]
finneg_dict.head()

Unnamed: 0,Word,Negative
0,AARDVARK,0
1,AARDVARKS,0
2,ABACI,0
3,ABACK,0
4,ABACUS,0


According to the dictionary documentation on the author's website: "The sentiment categories are: negative, positive, uncertainty, litigious, strong modal, weak modal, and constraining. The sentiment words are flagged with a number indicating the year in which they were added to the list. Note: A year preceded by a negative sign indicates the year/version when the word was removed from the sentiment category."

This means we first select all words with a non-zero entry in the "Negative" column. We then check whether any of these words carry a negative year value, i.e. were later removed from the negative category, and decide how to treat them.

In [12]:
# Keep the rows whose "Negative" flag is non-zero and renumber them from 0
finneg_dict = finneg_dict[finneg_dict['Negative'].ne(0)].reset_index(drop=True)
finneg_dict

Unnamed: 0,Word,Negative
0,ABANDON,2009
1,ABANDONED,2009
2,ABANDONING,2009
3,ABANDONMENT,2009
4,ABANDONMENTS,2009
...,...,...
2350,WRONGDOING,2009
2351,WRONGDOINGS,2009
2352,WRONGFUL,2009
2353,WRONGFULLY,2009


#### Check whether there are negative year values in the "Negative" column

In [13]:
# Words flagged with a negative year, i.e. later removed from the negative category
finneg_dict[finneg_dict['Negative'].lt(0)]

Unnamed: 0,Word,Negative
168,BREAKING,-2020
176,BRIDGE,-2020
177,BROKEN,-2020
229,CLOSED,-2020
232,CLOSING,-2020
392,CRITICAL,-2020
1032,FORCE,-2020
1070,FUGITIVE,-2020
1362,LATE,-2020
1937,SECRECY,-2020


In [14]:
# Words whose flag year falls inside the 2011-2021 sample window (both ends included)
finneg_dict[finneg_dict['Negative'].between(2011, 2021)]

Unnamed: 0,Word,Negative
136,AVERSELY,2011
415,CYBERATTACK,2014
416,CYBERATTACKS,2014
417,CYBERBULLYING,2014
418,CYBERCRIME,2014
419,CYBERCRIMES,2014
420,CYBERCRIMINAL,2014
421,CYBERCRIMINALS,2014
518,DELISTS,2011
1432,MISCHARACTERIZATION,2014


After checking, we found that a few words were removed from the dictionary in 2020, and that several words were added in 2011 and 2014. Maintaining a separate version of the dictionary for each year would be too tedious, so for simplicity we include the words added in 2011 and 2014. We also keep the 10 words that were removed from the dictionary in 2020, for two reasons. First, 10 words will not make a large difference because the dictionary contains roughly 2,350 words. Second, words that were removed in 2020 were still considered negative until 2020, and our 2021 sample is only about 1/11 of the full sample, so keeping those 10 words should not bias our results significantly.

#### Make a list of all words

In [15]:
# Replace the DataFrame by a plain Python list holding only the words
finneg_dict = finneg_dict['Word'].tolist()
print(finneg_dict)



#### Convert all words to lower case

In [16]:
# Lower-case every word of the Fin-Neg list
finneg_dict = [word.lower() for word in finneg_dict]
print(finneg_dict)



#### Calculate Negative Words Proportion Using TF-IDF Method 

We calculate the negative word proportion with the TF-IDF method in the following steps:

    (1) We download and read in the Harvard IV Negative Word List Dictionary and the Fin-Neg Dictionary produced by the author of the paper, clean and prepare them.
    
    (2) Read in all the JSON files which contain the dictionaries of the word list of 10-K and 10Q for different tickers from 2011-2021. 
    
    (3) For both the Harvard IV Negative Word List Dictionary and the Fin-Neg Dictionary, we count the total occurrences of each negative word in each 10-K or 10-Q file. This can be done with the sklearn.feature_extraction.text.CountVectorizer class in Python. The final output is two matrices, one per dictionary: the number of rows of each matrix is the total number of 10-K and 10-Q files, and the number of columns is the number of negative words in the corresponding dictionary.
    
    (4) For both Harvard IV Negative Word List Dictionary and Fin-Neg Dictionary, we compute the weight of each word in each 10-K or 10-Q file. This procedure can be completed by the sklearn.feature_extraction.text.TfidfVectorizer module in Python. We would get two matrices as the final output. The number of rows of each matrix is the total number of 10-K and 10-Q files and the number of columns is the total number of negative words in the two negative word dictionaries respectively. These matrices provide us the weight values corresponding to the negative word occurrences obtained from (3).
    
    (5) We multiply the weight values from (4) by the corresponding negative word occurrences from (3) to get two matrices of weighted negative word occurrences, one for the Harvard IV Negative Word List Dictionary and one for the Fin-Neg Dictionary.
    
    (6) For each of the two matrices of weighted negative word occurrences, we sum the elements in each row to obtain the total weighted negative word count of the corresponding file. This total is then divided by the total number of words in the file to get the negative word proportion.

We define the negative word proportion under the TF-IDF method following the paper. Let $word_{i,j}$ denote the number of times the $i$-th distinct word occurs in $document_j$, let $M$ be the number of distinct words in $document_j$, and let $w_{i,j}$ be the TF-IDF weight of that word, calculated as described above (only words in the negative word dictionary receive a non-zero weight). Then

$$
negprop_j = \frac{\sum_{i=1}^{M} w_{i,j}\,word_{i,j}}{\sum_{i=1}^{M} word_{i,j}}
$$

    
   


In [17]:
def get_all_tenk_docs(wordlists_path):
    """
    Load every 10-K word list and flatten it into four parallel lists.

    Reads the five files wordlist10k_0.json ... wordlist10k_4.json found in
    `wordlists_path`. Each file is expected to hold a nested mapping
    {ticker: {filing_date: [word, word, ...]}}.

    Parameters
    ----------
    wordlists_path : str
        Directory that contains the 10-K word list JSON files.

    Returns
    -------
    ticker_list : list
        Ticker of each document.
    filing_date_list : list
        Filing date of each document.
    all_tenk_docs : list of str
        Each document's words joined by a single space.
    doc_length_list : list of int
        Number of words in each document.

    All four lists have one entry per document, in the same order.
    """
    all_tenk_docs = []
    ticker_list = []
    filing_date_list = []
    doc_length_list = []
    for i in range(5):
        # use the directory passed in by the caller rather than a notebook-level global
        json_path = os.path.join(wordlists_path, "wordlist10k_{}.json".format(i))
        # the context manager closes the file even if the JSON cannot be parsed
        with open(json_path, "r") as json_read10k:
            wordlists = json.load(json_read10k)
        for ticker, ticker_dict in tqdm(wordlists.items()):
            for filing_date, doc in ticker_dict.items():
                doc_length_list.append(len(doc))
                ticker_list.append(ticker)
                filing_date_list.append(filing_date)
                all_tenk_docs.append(' '.join(doc))
    return ticker_list, filing_date_list, all_tenk_docs, doc_length_list

In [18]:
def get_all_tenq_docs(wordlists_path):
    """
    Load every 10-Q word list and flatten it into four parallel lists.

    Reads the six files wordlist10q_part1.json ... wordlist10q_part6.json found
    in `wordlists_path`. Each file is expected to hold a nested mapping
    {ticker: {filing_date: [word, word, ...]}}.

    Parameters
    ----------
    wordlists_path : str
        Directory that contains the 10-Q word list JSON files.

    Returns
    -------
    ticker_list : list
        Ticker of each document.
    filing_date_list : list
        Filing date of each document.
    all_tenq_docs : list of str
        Each document's words joined by a single space.
    doc_length_list : list of int
        Number of words in each document.

    All four lists have one entry per document, in the same order.
    """
    all_tenq_docs = []
    ticker_list = []
    filing_date_list = []
    doc_length_list = []
    for i in range(1, 7):
        # use the directory passed in by the caller rather than a notebook-level global
        json_path = os.path.join(wordlists_path, "wordlist10q_part{}.json".format(i))
        # the context manager closes the file even if the JSON cannot be parsed
        with open(json_path, "r") as json_read10q:
            wordlists = json.load(json_read10q)
        for ticker, ticker_dict in tqdm(wordlists.items()):
            for filing_date, doc in ticker_dict.items():
                doc_length_list.append(len(doc))
                ticker_list.append(ticker)
                filing_date_list.append(filing_date)
                all_tenq_docs.append(' '.join(doc))
    return ticker_list, filing_date_list, all_tenq_docs, doc_length_list

#### Call two auxiliary functions from above to get all documents' word lists
#### Saving all documents of 10-K and 10-Q

In [19]:
# Load the 10-K and the 10-Q word lists from the same folder
(tenk_ticker_list, tenk_filing_date_list,
 all_tenk_docs, doc_length_tenk_list) = get_all_tenk_docs(word_list_path)
(tenq_ticker_list, tenq_filing_date_list,
 all_tenq_docs, doc_length_tenq_list) = get_all_tenq_docs(word_list_path)

# Stack 10-K entries first and 10-Q entries second in all four lists,
# so that position k refers to the same filing in each of them
all_documents = [*all_tenk_docs, *all_tenq_docs]
all_ticker_list = [*tenk_ticker_list, *tenq_ticker_list]
all_filing_date_list = [*tenk_filing_date_list, *tenq_filing_date_list]
doc_length_list = [*doc_length_tenk_list, *doc_length_tenq_list]

100%|██████████| 108/108 [00:02<00:00, 40.87it/s]
100%|██████████| 108/108 [00:02<00:00, 39.02it/s]
100%|██████████| 108/108 [00:02<00:00, 53.31it/s]
100%|██████████| 108/108 [00:01<00:00, 106.41it/s]
100%|██████████| 108/108 [00:00<00:00, 420.28it/s]
100%|██████████| 111/111 [00:02<00:00, 45.47it/s]
100%|██████████| 115/115 [00:03<00:00, 35.42it/s]
100%|██████████| 83/83 [00:02<00:00, 28.10it/s]
100%|██████████| 82/82 [00:02<00:00, 34.56it/s]
100%|██████████| 82/82 [00:02<00:00, 30.66it/s]
100%|██████████| 78/78 [00:01<00:00, 40.81it/s]


In [20]:
def get_tfidf_negprop(all_documents, all_ticker_list, all_filing_date_list, doc_length_list, which_dict=True):
    """
    Compute the TF-IDF weighted negative word proportion of every document and save it as a CSV file.

    Parameters
    ----------
    all_documents : list of str
        One string per filing, with the filing's words separated by spaces.
    all_ticker_list : list
        Ticker of each document, aligned with `all_documents`.
    all_filing_date_list : list
        Filing date of each document, aligned with `all_documents`.
    doc_length_list : list of int
        Total number of words of each document; used as the denominator of the
        proportion. A length of 0 produces nan/inf for that document.
    which_dict : bool, default True
        True uses the Harvard IV list (`harvard_dict`), False uses the
        Loughran-McDonald Fin-Neg list (`finneg_dict`).

    Returns
    -------
    None
        The result (columns ticker, filing_date, NegProp_TFIDF) is written to
        Neg_Prop_Result_harvard.csv or Neg_Prop_Result_finneg.csv inside `neg_prop_path`.
    """
    # choose the negative word vocabulary and the matching output file suffix
    if which_dict:
        vocab = harvard_dict
        json_suffix = "harvard"
    else:
        vocab = finneg_dict
        json_suffix = "finneg"

    # TF-IDF weight of each vocabulary word in each document (rows: documents, columns: words)
    tfidf_vec = TfidfVectorizer(vocabulary=vocab)
    tfidf_weights = tfidf_vec.fit_transform(all_documents).toarray()

    # raw number of occurrences of each vocabulary word in each document (same layout)
    count_vec = CountVectorizer(vocabulary=vocab)
    count_matrix = count_vec.fit_transform(all_documents).toarray()

    # NOTE(review): with its default settings TfidfVectorizer returns tf * idf values that are
    # L2-normalised per document, so multiplying by the raw count again weights each word by
    # roughly count^2 — confirm this is the weighting intended by the paper.
    weighted_count = tfidf_weights * count_matrix

    # weighted negative word total of each document divided by the document's word count
    neg_prop = np.sum(weighted_count, axis=1) / np.array(doc_length_list)

    # save the result as a CSV file, creating the output folder if it does not exist yet
    neg_prop_df = pd.DataFrame({"ticker": all_ticker_list, "filing_date": all_filing_date_list, "NegProp_TFIDF": neg_prop})
    os.makedirs(neg_prop_path, exist_ok=True)
    neg_prop_df.to_csv(os.path.join(neg_prop_path, "Neg_Prop_Result_{}.csv".format(json_suffix)), index=False)

#### Generate and save the dataframe with negative proportion calculated via tf-idf method

In [22]:
# Both runs share the same inputs; only the dictionary selector differs
negprop_inputs = (all_documents, all_ticker_list, all_filing_date_list, doc_length_list)

get_tfidf_negprop(*negprop_inputs, which_dict=True)   # Harvard IV dictionary
print("############ Harvard Dictionary Version Done ############")
get_tfidf_negprop(*negprop_inputs, which_dict=False)  # Loughran-McDonald Fin-Neg dictionary

############ Harvard Dictionary Version Done ############
