# SEC filings data pre-processing

In [None]:
from urllib.request import Request, urlopen
import pandas as pd
import numpy as np
import os
from os import listdir
%pip install w3lib
from w3lib.html import remove_tags

# Parsing and cleaning libraries
import re
from bs4 import BeautifulSoup
import string
import cleantext
import nltk
import inspect
import toolz
import tqdm
# nltk.download('stopwords')

### Data extraction

In [None]:
# url = "https://www.sec.gov/Archives/edgar/data/748015/0001047469-11-000234.txt"
# real file: "https://www.sec.gov/Archives/edgar/data/0000748015/000104746911000234/a2201619z10-k.htm"

# req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# webpage = urlopen(req, timeout = 10).read()

In [1]:
class files_cleaning():
    '''
    Clean the raw 10-K filings of a single company.

    Takes as input a company code (stock format), reads the raw filings
    found in ./data/<comp_code>/raw/ and writes one cleaned text file per
    filing into ./data/<comp_code>/clean/.
    '''
    def __init__(self, comp_code):
        # The company code is used to build every path below
        self.comp_code = comp_code

    def _clean_file_path(self, year):
        '''
        Return the path of the clean output file for the given year key
        '''
        return './data/{comp}/clean/{comp}_{year}_clean.txt'.format(comp=self.comp_code, year=year)

    def create_comp_dic(self):
        '''
        Function that creates a dictionary with all the company's raw files

        Returns a dictionary mapping a year key to the full content of the
        file. The key is whatever follows the last underscore of the file
        path, with '.txt' removed. Hidden files (name starting with '.')
        and entries that are not regular files are skipped.

        Raises an Exception when the company folder or its raw sub-folder
        does not exist.
        '''
        comp_path = './data/{}/'.format(self.comp_code)
        comp_raw_path = './data/{}/raw/'.format(self.comp_code)

        if not os.path.isdir(comp_path):
            raise Exception('The company has no valid folder!')

        if not os.path.isdir(comp_raw_path):
            raise Exception('The raw files folder is missing!')

        comp_dic = {}
        for file_name in listdir(comp_raw_path):
            # Skip hidden files (e.g. editor / OS artefacts)
            if file_name.startswith('.'):
                continue

            path = comp_raw_path + file_name
            # A sub-directory cannot be opened as a filing, skip it
            if not os.path.isfile(path):
                continue

            year = path.split('_')[-1].replace('.txt', '')
            with open(path, 'r') as f:
                comp_dic[year] = f.read()
        return comp_dic

    def textual_content(self, file):
        '''
        Takes the content of a 10-K file (string) as input
        Returns every <DOCUMENT>...</DOCUMENT> section, joined by one space

        The i-th opening marker is paired with the i-th closing marker;
        markers left without a counterpart are ignored. An input without
        any complete pair gives an empty string.
        '''
        # Index of the first character of each opening marker
        doc_start_list = [m.start() for m in re.finditer(r'<DOCUMENT>', file)]
        # Index just past the last character of each closing marker
        doc_end_list = [m.end() for m in re.finditer(r'</DOCUMENT>', file)]

        return ' '.join(
            file[start_index:end_index]
            for start_index, end_index in zip(doc_start_list, doc_end_list)
        )

    def text_selection(self, html):
        '''
        Function that takes as input the content of an html file
        Returns the text extracted from that content
        '''
        # Strip the markup first, then let BeautifulSoup produce the text
        notags = remove_tags(html)
        soup = BeautifulSoup(notags, 'html.parser')
        return soup.get_text()

    def clean_files(self):
        '''
        Function that takes as input the raw files
        Returns the clean texts in a dictionary (year key -> clean string)

        Years whose clean file already exists on disk are left out of the
        result and are neither extracted nor cleaned again.
        '''
        comp_dic = self.create_comp_dic()

        # Drop the years that were already processed before doing any of
        # the expensive work (parsing and cleaning)
        pending = {
            year: html
            for year, html in comp_dic.items()
            if not os.path.exists(self._clean_file_path(year))
        }

        # NOTE: textual_content() is not applied here, the whole raw file
        # is handed to text_selection()
        print('Extracting textual content...')
        comp_dic_selec = {
            year: self.text_selection(html) for year, html in pending.items()
        }

        # Clean the text (long to run...)
        print('Cleaning the text (might take long)...')
        comp_dic_clean = {
            year: cleantext.clean(text, clean_all=True)
            for year, text in comp_dic_selec.items()
        }

        return comp_dic_clean

    # Save the cleaned strings as txt files
    def write_clean_files(self):
        '''
        Function that saves the clean strings from the original html into files

        One file per year is written into ./data/<comp_code>/clean/ (the
        folder is created when missing). Only the characters listed in
        string.printable are kept in the output.
        '''
        comp_dic_clean = self.clean_files()
        comp_clean_path = './data/{}/clean/'.format(self.comp_code)

        os.makedirs(comp_clean_path, exist_ok=True)

        print('Writing the clean files...')
        # Set lookup instead of scanning the string.printable str per character
        printable = set(string.printable)
        for year, clean_string in comp_dic_clean.items():
            # Keep only printable ASCII characters
            clean_string = ''.join(x for x in clean_string if x in printable)

            # The context manager closes the file even if the write fails
            with open(self._clean_file_path(year), 'wt') as clean_file:
                clean_file.write(clean_string)

In [None]:
# Create the Apple object
# apple_files = files_cleaning('AAPL')

# Create the clean files
# apple_files.write_clean_files()

In [None]:
# Create the Tesla object
# tsla_files = files_cleaning('TSLA')

# Create the clean files
# tsla_files.write_clean_files()