# Processing
This notebook contains all the functions needed to download all the texts and to process them.


In [1]:
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from nltk.probability import FreqDist
import spacy
import re
from collections import Counter

# Fetch the NLTK resources used by the functions below: the stop-word list,
# the tokenizer models, the WordNet data (plus its multilingual companion)
# and the part-of-speech tagger.
# Packages that are already present are reported as up to date, so this cell
# can be re-run safely.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tatianacogne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tatianacogne/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tatianacogne/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tatianacogne/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tatianacogne/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Import the texts

In [2]:
def import_texts(dir_path="data/text/", encoding="utf-8"):
    """Load every downloaded earning call found under ``dir_path`` into a DataFrame.

    The expected layout is ``<dir_path>/<ticker>/<article>``, where each article
    file name is an integer identifier.

    Parameters
    ----------
    dir_path : str
        Root folder that contains one sub-folder per ticker.
    encoding : str
        Encoding used to read the article files.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``ticker``, ``article`` (the file
        name converted to ``int``) and ``text`` (the file content with every
        occurrence of 'Question-and-Answer Session' removed).

    Raises
    ------
    ValueError
        If an article file name is not an integer.
    """
    # Housekeeping entries created by macOS / Jupyter, never real data.
    ignored = {'.DS_Store', '.ipynb_checkpoints'}

    compagny = []
    articles = []
    texts = []
    # Sorted so that the row order does not depend on the file system.
    for ticker in sorted(os.listdir(dir_path)):
        earning_call_path = os.path.join(dir_path, ticker)
        # Skip stray files and housekeeping folders instead of assuming that
        # exactly one '.DS_Store' exists at this level.
        if ticker in ignored or not os.path.isdir(earning_call_path):
            continue
        for file_name in sorted(os.listdir(earning_call_path)):
            if file_name in ignored:
                continue
            # The file name itself is the article identifier; no regex needed.
            article_id = int(file_name)
            with open(os.path.join(earning_call_path, file_name), encoding=encoding) as f:
                content = f.read()
            compagny.append(ticker)
            articles.append(article_id)
            texts.append(content)

    print('Number of articles', len(texts))

    df = pd.DataFrame({'ticker': compagny, 'article': articles, 'text': texts})
    # Plain substring removal: the marker contains no regex syntax.
    df['text'] = df['text'].apply(lambda x: x.replace('Question-and-Answer Session', ''))
    return df

## Processing

In [3]:
def list_to_text(list_input, stops = []):
    """Join a tokenized text back into one space-separated string.

    Tokens that are found in ``stops`` are left out of the result.
    """
    kept = []
    for token in list_input:
        if token in stops:
            continue
        kept.append(token)
    return ' '.join(kept)

In [4]:
def text_to_list(text_input):
    """Tokenize ``text_input`` with NLTK's ``word_tokenize`` and return the tokens."""
    return word_tokenize(text_input)

In [5]:
def ngrams_list(n, text=None):
    """Return the list of n-grams (tuples of ``n`` tokens) of a text.

    Parameters
    ----------
    n : int
        Size of the token blocks to build.
    text : str, optional
        Text to tokenize. When omitted, the function falls back to a global
        variable named ``text``, which is what earlier versions of this
        function relied on implicitly.

    Returns
    -------
    list of tuple
        The n-grams in order of appearance.

    Raises
    ------
    NameError
        If ``text`` is not passed and no global ``text`` exists.
    """
    if text is None:
        # Backward compatibility with callers that set a global `text` first.
        try:
            text = globals()['text']
        except KeyError:
            raise NameError(
                "name 'text' is not defined; pass the text explicitly as ngrams_list(n, text)"
            ) from None
    return list(ngrams(sequence = nltk.word_tokenize(text), n = n))

In [6]:
def remove_speakers(text):
    """Drop the short blocks (presumably speaker labels) of a text, first FiGAS option.

    The text is cut on the '\\n \\n' separator; only blocks with more than five
    space-separated pieces are kept, and they are glued together with a space.
    """
    kept_blocks = [
        block
        for block in text.split('\n \n')
        if len(block.split(' ')) > 5
    ]
    return ' '.join(kept_blocks)

In [7]:
def remove_speakers2(text):
    """Drop the short blocks (presumably speaker labels) of a text, second option.

    Same filtering as the first option (blocks separated by '\\n \\n', kept only
    when they hold more than five space-separated pieces), but the surviving
    blocks are joined with '||' so their boundaries remain visible.
    """
    def is_long_enough(block):
        return len(block.split(' ')) > 5

    return '||'.join(filter(is_long_enough, text.split('\n \n')))

In [8]:
# Regular-expression fragments used by the sentence-splitting step.
# Raw strings are used so that `\s` reaches the regex engine as written
# instead of being an invalid Python string escape.
alphabets = r"([A-Za-z])"                    # a single letter
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"           # titles followed by a period
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"             # name suffixes that may precede a period
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"  # words treated as the start of a new sentence
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"  # two- or three-letter dotted acronyms
websites = r"[.](com|net|org|io|gov)"        # common domain endings

In [9]:
def sentences_process(text):
    """Normalise sentence boundaries of a text and return it as one string.

    Periods that do not end a sentence (titles, websites, "Ph.D.", initials,
    acronyms, company suffixes, decimal numbers) are temporarily replaced by a
    ``<prd>`` marker. The remaining '.', '?' and '!' are then followed by a
    ``<stop>`` marker, the text is split on it, each sentence is stripped and
    the sentences are joined again with a single space.

    Parameters
    ----------
    text : str
        Raw text; line breaks are turned into spaces.

    Returns
    -------
    str
        The sentences separated by exactly one space. A trailing fragment
        without final punctuation is kept.
    """
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    # Protect decimal points ("3.5") so numbers are not split into "3. 5".
    text = re.sub(r"(?<=[0-9])[.](?=[0-9])","<prd>",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    # Move closing quotes in front of the punctuation so the split happens
    # after the quote.
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = [s.strip() for s in text.split("<stop>")]
    # Only the last fragment can be empty (padding after the final
    # punctuation). Drop it only then, so text that does not end with
    # punctuation is not lost.
    if sentences and not sentences[-1]:
        sentences.pop()
    return ' '.join(sentences)

In [10]:
def remove_most_least_freq(list_words, lower, upper):
    """Remove the least and most frequent words of a tokenized text.

    Parameters
    ----------
    list_words : list of str
        Tokens of the text.
    lower : float
        Fraction of the distinct words, taken from the least frequent end,
        to remove.
    upper : float
        Fraction of the distinct words, taken from the most frequent end,
        to remove.

    Returns
    -------
    str
        The remaining tokens joined by spaces (see ``list_to_text``).
    """
    vocabulary = Counter(list_words)
    num_lower = int(lower * len(vocabulary))
    num_upper = int(upper * len(vocabulary))

    # Ascending by count; ties keep first-appearance order (stable sort).
    sorted_vocabulary = sorted(vocabulary.items(), key=lambda x: x[1])
    least_common = sorted_vocabulary[:num_lower]
    # `[-0:]` would select the whole vocabulary, so a zero count must be
    # handled explicitly; otherwise short texts lose every word.
    most_common = sorted_vocabulary[-num_upper:] if num_upper > 0 else []

    # A set keeps the membership test in list_to_text cheap.
    words_to_remove = {word for word, _ in most_common + least_common}

    return list_to_text(list_words, words_to_remove)

### Processing FiGAS

In [11]:
def process_figas(text):
    """Run the full preprocessing chain for the first option on FiGAS.

    Steps, in order: speaker removal (``remove_speakers``), sentence
    normalisation, English stop-word removal, removal of tokens tagged 'NNP',
    removal of the 6% least and 6% most frequent words, and lemmatization.
    Returns the lemmatized tokens joined by spaces.
    """
    cleaned_text = sentences_process(remove_speakers(text))

    english_stops = set(stopwords.words('english'))
    content_tokens = []
    for token in word_tokenize(cleaned_text):
        if token.lower() not in english_stops:
            content_tokens.append(token)

    # Keep every token whose part-of-speech tag is not 'NNP'.
    without_nnp = [word for word, tag in nltk.pos_tag(content_tokens) if tag != 'NNP']

    trimmed_text = remove_most_least_freq(without_nnp, 0.06, 0.06)

    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(token) for token in word_tokenize(trimmed_text)]
    return ' '.join(lemmas)

In [12]:
def process_figas2(text):
    """Run the full preprocessing chain for the second option on FiGAS.

    Same chain as the first option, except that speakers are removed with
    ``remove_speakers2`` (blocks joined by '||'): sentence normalisation,
    English stop-word removal, removal of tokens tagged 'NNP', removal of the
    6% least and 6% most frequent words, and lemmatization. Returns the
    lemmatized tokens joined by spaces.
    """
    cleaned_text = sentences_process(remove_speakers2(text))

    english_stops = set(stopwords.words('english'))
    content_tokens = []
    for token in word_tokenize(cleaned_text):
        if token.lower() not in english_stops:
            content_tokens.append(token)

    # Keep every token whose part-of-speech tag is not 'NNP'.
    without_nnp = [word for word, tag in nltk.pos_tag(content_tokens) if tag != 'NNP']

    trimmed_text = remove_most_least_freq(without_nnp, 0.06, 0.06)

    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(token) for token in word_tokenize(trimmed_text)]
    return ' '.join(lemmas)