In [27]:
# import the libraries used throughout the notebook
import numpy as np
import pandas as pd
import requests
import bs4 as bs
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from selenium import webdriver
import re
import time
import csv

In [64]:
# Every SECFNAME value in the CIK list is a path relative to this archive root.
baselink = 'https://www.sec.gov/Archives/'

# Filing index; missing cells are replaced by "" so they can be filtered by equality.
cik_list = pd.read_csv('cik_list.csv').fillna("")

# Relative paths of the filings to fetch, skipping rows that have no SECFNAME.
links = [secfname for secfname in cik_list['SECFNAME'].tolist() if secfname != ""]

In [65]:
# Master dictionary: one row per word, with one flag column per sentiment category.
master_dict = pd.read_csv('LoughranMcDonald_MasterDictionary_2020.csv')

# Per the Loughran-McDonald documentation, a sentiment column holds the year the
# word was added to that category (0 = not in the category, negative year = later
# removed). Testing `> 0` keeps every current member; testing `== 2009` would
# silently drop words added in later dictionary revisions.
positive_words = master_dict[master_dict['Positive'] > 0]['Word']
negative_words = master_dict[master_dict['Negative'] > 0]['Word']

In [66]:
# Lower-case both sentiment word collections and materialise them as plain lists.
positive_words = [word.lower() for word in positive_words]
negative_words = [word.lower() for word in negative_words]

In [67]:
# Lower-cased list of the words in the uncertainty dictionary.
uncertainity_dictionary = [
    word.lower()
    for word in pd.read_csv('uncertainty_dictionary.csv')['Word']
]

# Lower-cased list of the words in the constraining dictionary.
constraining_dictionary = [
    word.lower()
    for word in pd.read_csv('constraining_dictionary.csv')['Word']
]

In [68]:
stop_words_file_names = [
    'StopWords_GenericLong.txt',
    'StopWords_Generic.txt',
    'StopWords_Auditor.txt',
    'StopWords_Names.txt',
]

# Concatenate the first column of every stop-word file, lower-casing string
# entries and passing any non-string entry through unchanged.
stop_words = [
    entry.lower() if isinstance(entry, str) else entry
    for file_name in stop_words_file_names
    for entry in pd.read_csv(file_name, header=None).iloc[:, 0]
]

In [69]:
# Template file; only its header row is used, as the column names of the final output.
dataset = pd.read_csv('Output Data Structure.csv')
col = dataset.columns

In [70]:
def syllable_count(word):
    """Estimate the number of syllables in ``word``.

    Each run of consecutive vowels (a, e, i, o, u, y; case-insensitive) counts
    as one syllable, and one is subtracted when the word ends in "e".

    Parameters
    ----------
    word : str
        The word to analyse.

    Returns
    -------
    int
        0 for an empty string, otherwise the estimate, never less than 1.
    """
    # Guard: an empty token has no first character to inspect.
    if not word:
        return 0

    word = word.lower()
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in "aeiouy"
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # A trailing "e" is treated as silent.
    if word.endswith("e"):
        count -= 1

    # Every non-empty word has at least one syllable.
    return max(count, 1)


In [35]:
# Accumulator for the per-document result rows (one list of values per document).
data = list()

In [71]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def value_for_each_text(text, no_of_sentences, cil, coname, fyrmo, fdate, form, secfname):
    """Compute the metrics for one document and append them to the global ``data``.

    Parameters
    ----------
    text : list of str
        Cleaned sentences of the document; each one is split on whitespace
        into words.
    no_of_sentences : int
        Sentence count used as the denominator of the average sentence length.
    cil, coname, fyrmo, fdate, form, secfname
        Identifying fields of the filing, copied unchanged into the output row.

    Returns
    -------
    None
        A 21-value row is appended to the module-level ``data`` list.

    Notes
    -----
    * A word found in ``negative_words`` is counted as negative only, even if
      it also appears in ``positive_words``.
    * The complex-word percentage and the four word proportions are fractions
      of the total word count.
    * Every ratio evaluates to 0.0 when its denominator is zero, so an empty
      document yields a row of zeros instead of raising ``ZeroDivisionError``.
    """
    # Build sets once per call: membership tests on the module-level lists
    # would be linear in the dictionary size for every single word.
    positive_lookup = set(positive_words)
    negative_lookup = set(negative_words)
    uncertainty_lookup = set(uncertainity_dictionary)
    constraining_lookup = set(constraining_dictionary)

    positive_score = 0
    negative_score = 0
    word_count = 0
    no_complex_words = 0
    uncertainity = 0
    constraining = 0

    # Single pass over every word of every sentence.
    for sentence in text:
        for word in sentence.split():
            word_count += 1
            if word in negative_lookup:
                negative_score += 1
            elif word in positive_lookup:
                positive_score += 1
            # Words with more than two syllables are counted as complex.
            if syllable_count(word) > 2:
                no_complex_words += 1
            if word in uncertainty_lookup:
                uncertainity += 1
            if word in constraining_lookup:
                constraining += 1

    # The small constant keeps the polarity defined when no sentiment word occurs.
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    average_sentence_length = _safe_ratio(word_count, no_of_sentences)
    percentage_complex_words = _safe_ratio(no_complex_words, word_count)
    fog_index = 0.4 * (percentage_complex_words + average_sentence_length)

    positive_word_proportion = _safe_ratio(positive_score, word_count)
    negative_word_proportion = _safe_ratio(negative_score, word_count)
    constraining_word_proportion = _safe_ratio(constraining, word_count)
    uncertainity_word_proportion = _safe_ratio(uncertainity, word_count)

    # The order of the values must match the header row written with the output.
    data.append([
        cil, coname, fyrmo, fdate, form, secfname,
        positive_score, negative_score, polarity_score,
        average_sentence_length, percentage_complex_words, fog_index,
        no_complex_words, word_count, uncertainity, constraining,
        positive_word_proportion, negative_word_proportion,
        uncertainity_word_proportion, constraining_word_proportion,
        constraining,
    ])
    
    
    

In [37]:
"""the takes a list of sentences for each each document
return the cleaned list of sentences where each sentence is manipulated as follows
converted to smaller case, then removed the html tags, removed consecutive white spaces, removed special characters
then tokenized the sentence, removed the stop words, stemmed the words,  return the clean sentences  
"""
def preprocess(sentences):
    """Clean raw text lines into space-joined, stemmed tokens.

    Each input string is lower-cased, stripped of ``<...>`` markup, passed
    through a fixed list of textual substitutions (number suffixes,
    contractions, currency and percent symbols), tokenised on runs of word
    characters, filtered against the module-level ``stop_words`` and stemmed
    with the Porter stemmer.

    Parameters
    ----------
    sentences : list of str
        Raw lines of one document.

    Returns
    -------
    list of str
        One cleaned string per input string, in the same order. A string whose
        tokens are all stop words becomes "".

    Notes
    -----
    NOTE(review): the returned tokens are stemmed, while the sentiment,
    uncertainty and constraining word lists built elsewhere in this notebook
    are only lower-cased; exact-match lookups against them may therefore miss
    words whose stem differs from the dictionary form - confirm this is intended.
    """
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    # Set membership avoids a linear scan of the stop-word list for every token.
    stop_lookup = set(stop_words)

    # Applied in order; the order matters (e.g. "can't" must be handled before
    # the generic "n't", and "it's"/"he's"/"she's" before the generic "'s").
    substitutions = (
        (",000,000", "m"), (",000", "k"), ("′", "'"), ("’", "'"),
        ("won't", "will not"), ("cannot", "can not"), ("can't", "can not"),
        ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
        ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
        ("he's", "he is"), ("she's", "she is"), ("'s", " own"),
        ("%", " percent "), ("₹", " rupee "), ("$", " dollar "),
        ("€", " euro "), ("'ll", " will"),
    )

    print("the number of sentences to be preprocessed ", len(sentences))

    cleaned_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        # re.sub and str.replace return new strings, so every result is
        # assigned back; calling them for their side effect does nothing.
        sentence = re.sub(r'<[^>]*>', ' ', sentence)
        for old, new in substitutions:
            sentence = sentence.replace(old, new)
        sentence = re.sub(r"([0-9]+)000000", r"\1m", sentence)
        sentence = re.sub(r"([0-9]+)000", r"\1k", sentence)
        # No separate whitespace / special-character pass is needed: the
        # tokenizer keeps only runs of word characters, so everything else
        # acts as a separator.
        tokens = tokenizer.tokenize(sentence)
        tokens = [stemmer.stem(token) for token in tokens if token not in stop_lookup]
        cleaned_sentences.append(" ".join(tokens))

    return cleaned_sentences
        
        

In [38]:
"""this function takes the full URL of the file and then extracts the data and returns list of sentences """
all_text = []  # cleaned sentences of every fetched document, in fetch order


def _non_blank_lines(tags):
    """Return the stripped, non-empty text lines contained in ``tags``."""
    lines = []
    for tag in tags:
        for line in tag.text.split('\n'):
            line = line.strip()
            # Filter after stripping so whitespace-only lines are dropped too.
            if line:
                lines.append(line)
    return lines


def get_text(full_link, delay=15):
    """Fetch one document with the global Selenium ``driver`` and clean it.

    Parameters
    ----------
    full_link : str
        Full URL of the document to open.
    delay : int or float, optional
        Seconds to wait before issuing the request (default 15).

    Returns
    -------
    list of str
        The cleaned sentences returned by ``preprocess``. The same list is
        also appended to the module-level ``all_text``.

    Notes
    -----
    The visible text of the page body is parsed with BeautifulSoup. Lines are
    taken from the 'page' elements; when those yield nothing, from the 'text'
    elements. A page with no body element is treated as an empty document.
    """
    time.sleep(delay)
    driver.get(full_link)

    # "tag name" is the locator strategy string behind By.TAG_NAME; the
    # find_elements_by_* helpers were removed from Selenium in 4.3.
    bodies = driver.find_elements("tag name", "body")
    raw_text = bodies[0].text if bodies else ""

    soup = bs.BeautifulSoup(raw_text, 'lxml')

    sentences = _non_blank_lines(soup.find_all('page'))
    if not sentences:
        sentences = _non_blank_lines(soup.find_all('text'))

    clean_sentences = preprocess(sentences)
    all_text.append(clean_sentences)
    return clean_sentences
    


In [41]:
# Machine-specific location of the chromedriver binary; adjust for the local install.
CHROMEDRIVER_PATH = 'C:/Users/91885/Desktop/chromedriver'

# Selenium drives a Chrome window that opens each filing page in turn.
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
driver.maximize_window()
try:
    # Iterate over the rows themselves rather than over positions in `links`:
    # `links` has the empty SECFNAME rows removed, so its positions stop
    # matching the rows of `cik_list` as soon as one such row exists.
    filings = cik_list[cik_list['SECFNAME'] != ""]
    for row_index, filing in filings.iterrows():
        # forming the working link
        full_link = baselink + filing['SECFNAME']
        print(full_link)
        document = get_text(full_link)
        value_for_each_text(
            document, len(document),
            filing['CIK'], filing['CONAME'], filing['FYRMO'],
            filing['FDATE'], filing['FORM'], filing['SECFNAME'],
        )
finally:
    # Always close the browser, even when a download or parse step fails.
    driver.quit()
    

https://www.sec.gov/Archives/edgar/data/6201/0000950134-01-500665.txt
the number of sentences to be preprocessed  8935
https://www.sec.gov/Archives/edgar/data/6201/0000006201-01-500032.txt
the number of sentences to be preprocessed  14918
https://www.sec.gov/Archives/edgar/data/6201/0000006201-01-500047.txt
the number of sentences to be preprocessed  23499
https://www.sec.gov/Archives/edgar/data/6201/0000950134-02-001661.txt
the number of sentences to be preprocessed  159769
https://www.sec.gov/Archives/edgar/data/6201/0000006201-02-000015.txt
the number of sentences to be preprocessed  7626
https://www.sec.gov/Archives/edgar/data/6201/0000006201-02-000035.txt
the number of sentences to be preprocessed  25817
https://www.sec.gov/Archives/edgar/data/6201/0000006201-02-000052.txt
the number of sentences to be preprocessed  25289
https://www.sec.gov/Archives/edgar/data/6201/0000950134-02-012680.txt
the number of sentences to be preprocessed  10159
https://www.sec.gov/Archives/edgar/data/6

In [63]:
"""Here I am making a CSV file which stores the result of each file link"""

# csv.writer produces plain comma-separated text, so the file carries a .csv
# extension; the same bytes under an .xlsx name are not a valid Excel workbook.
with open('Output Data Strucuture to be submitted.csv', 'w', encoding='UTF8', newline="") as f:
    writer = csv.writer(f)

    # header row: the column names taken from the output template
    writer.writerow(col)

    # one row per processed document
    writer.writerows(data)