# Processing
This notebook contains all the functions needed to download all the texts and to process them.


In [None]:
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from nltk.probability import FreqDist
import spacy
import re

# Fetch the NLTK data packages requested by this notebook before any
# processing cell runs.
nltk.download('stopwords')  # stop-word lists, read below via stopwords.words('english')
nltk.download('punkt')  # presumably the tokenizer models behind word_tokenize — TODO confirm
nltk.download('wordnet')  # presumably the WordNet data behind WordNetLemmatizer — TODO confirm
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')  # NOTE(review): no POS-tagging call is visible in this notebook — confirm it is still needed

## Import the texts

In [None]:
def import_texts(dir_path="data/text/"):
    """Load every downloaded earnings call into one DataFrame.

    The expected layout is ``<dir_path>/<ticker>/<article id>``: one folder per
    ticker, holding one file per article whose name is the integer article id.

    Parameters
    ----------
    dir_path : str
        Root folder holding the per-ticker folders (defaults to the original
        hard-coded location).

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``ticker``, ``article`` (int id)
        and ``text`` (file content with every 'Question-and-Answer Session'
        marker removed).

    Raises
    ------
    ValueError
        If an article file name is not an integer.
    """
    # Housekeeping entries created by macOS / Jupyter; they are not data.
    ignored = {'.DS_Store', '.ipynb_checkpoints'}

    tickers, articles, texts = [], [], []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for ticker in sorted(os.listdir(dir_path)):
        ticker_dir = os.path.join(dir_path, ticker)
        # A stray file (e.g. a top-level .DS_Store) is not a ticker folder and
        # would make os.listdir fail below.
        if ticker in ignored or not os.path.isdir(ticker_dir):
            continue
        for file_name in sorted(os.listdir(ticker_dir)):
            if file_name in ignored:
                continue
            # The file name itself is the article id; no regex on the path needed.
            article_id = int(file_name)
            with open(os.path.join(ticker_dir, file_name)) as f:
                content = f.read()
            tickers.append(ticker)
            articles.append(article_id)
            texts.append(content.replace('Question-and-Answer Session', ''))

    print('Number of articles', len(texts))

    return pd.DataFrame({'ticker': tickers, 'article': articles, 'text': texts})

## Processing

In [None]:
def list_to_text(list_input, stops = []):
    """Join a tokenized text back into one space-separated string.

    Words found in `stops` are left out; every other word is kept in order.
    """
    kept_words = filter(lambda word: word not in stops, list_input)
    return ' '.join(kept_words)

In [None]:
def text_to_list(text_input):
    """Tokenize a text: return the result of NLTK's word_tokenize on it."""
    return word_tokenize(text_input)

In [None]:
def ngrams_list(n, text=None):
    """Return the list of word n-grams of a text.

    Parameters
    ----------
    n : int
        Size of the blocks (number of consecutive tokens per n-gram).
    text : str, optional
        Text to tokenize. When omitted, the function falls back to a
        module-level variable named ``text`` — the only way the previous
        version could obtain its input.

    Returns
    -------
    list of tuple
        The n-grams, in order of appearance.

    Raises
    ------
    NameError
        If no text is passed and no global ``text`` exists.
    """
    if text is None:
        # Legacy behaviour: the function used to read a global `text`.
        try:
            text = globals()['text']
        except KeyError:
            raise NameError(
                "name 'text' is not defined; pass the text to tokenize as the second argument"
            ) from None
    return list(ngrams(sequence=nltk.word_tokenize(text), n=n))

In [None]:
def remove(filtered_text, a, l, u):
    """Remove the most and least frequent words from a tokenized text.

    Parameters
    ----------
    filtered_text : list of str
        Tokens to filter and join back into a string.
    a : list of str
        Tokens whose frequencies decide which words are dropped.
    l : float
        Share of the distinct words of `a` to drop from the rare end.
    u : float
        Share of the distinct words of `a` to drop from the frequent end.

    Returns
    -------
    str
        `filtered_text` joined with spaces, without the dropped words.
    """
    freq = FreqDist(a)
    df_fdist = pd.DataFrame({'Word': list(freq.keys()),
                             'Number of apparitions': list(freq.values())})

    n_least = int(l * len(df_fdist))
    n_most = int(u * len(df_fdist))

    df_fdesc = df_fdist.sort_values(by='Number of apparitions', ascending=False)
    df_fasc = df_fdist.sort_values(by='Number of apparitions', ascending=True)

    # A set gives constant-time membership tests, and the name no longer shadows
    # the `stopwords` corpus imported from nltk.
    excluded_words = set(df_fdesc['Word'][:n_most]) | set(df_fasc['Word'][:n_least])

    return list_to_text(filtered_text, excluded_words)

In [None]:
def remove_speakers(text):
    """Remove the speaker lines from one text.

    The text is cut wherever a newline, a space and a newline follow each other.
    Only the blocks made of more than three space-separated pieces are kept, and
    they are joined back with single spaces.
    """
    # len(block.split(' ')) == block.count(' ') + 1, hence "> 3 pieces" <=> ">= 3 spaces".
    return ' '.join(block for block in text.split('\n \n') if block.count(' ') >= 3)

In [None]:
def processing(text, lower_bound, upper_bound, noun):
    """Run all the processing steps on one text.

    Steps: keep only letters and digits, lowercase, drop English stop words,
    lemmatize as nouns, then drop the most and least frequent words.

    Parameters
    ----------
    text : str
        Raw text.
    lower_bound, upper_bound : float
        Shares of the vocabulary removed from the rare and frequent ends
        (passed to `remove`).
    noun : bool
        When true, the result is additionally passed through `remove_speakers`.

    Returns
    -------
    str
        The processed text.
    """
    text = re.sub('[^A-Za-z0-9]+', ' ', text).lower()
    stop_words = set(stopwords.words('english'))
    # The text is already lowercase, so tokens can be compared directly.
    filtered_text = [w for w in word_tokenize(text) if w not in stop_words]

    # lemmatize is an instance method: calling it on the class binds the word to
    # `self` and does not lemmatize anything.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, 'n') for word in filtered_text]

    # NOTE(review): frequencies are counted on the lemmas while the words removed
    # come from the non-lemmatized tokens — confirm this is intended.
    filter1 = remove(filtered_text, lemmas, lower_bound, upper_bound)
    if noun:
        # NOTE(review): newlines were replaced by spaces above, so the paragraph
        # marker remove_speakers splits on cannot occur here — confirm intent.
        return remove_speakers(filter1)
    return filter1

### Processing FiGAS

In [None]:
# Regular-expression fragments used by sentences_process. They are raw strings
# so that `\s` reaches the regex engine without an invalid-escape warning.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

def sentences_process(text):
    """Split a text into sentences and join them back with single spaces.

    Periods that do not end a sentence (titles, initials, acronyms, company
    suffixes, web domains) are protected with a ``<prd>`` placeholder while the
    text is cut after '.', '?' and '!'. Each sentence is stripped, so the result
    has newlines and the whitespace around sentence boundaries normalized.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The sentences joined with one space. A final fragment without end
        punctuation is kept (it used to be dropped silently).
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect the periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move end punctuation outside a closing quote so the cut lands after the quote.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # The last piece is only padding when the text ends with punctuation; drop it
    # then, but keep it when it holds an unterminated final sentence.
    if sentences and not sentences[-1]:
        sentences.pop()
    return ' '.join(sentences)

In [None]:
from collections import Counter
def remove_most_least_freq(list_words, lower, upper):
    """Drop the rarest and the most frequent words of a tokenized text.

    Parameters
    ----------
    list_words : list of str
        Tokens of one text.
    lower : float
        Share of the distinct words to drop from the rare end.
    upper : float
        Share of the distinct words to drop from the frequent end.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined with spaces.
    """
    vocabulary = Counter(list_words)
    num_lower = int(lower * len(vocabulary))
    num_upper = int(upper * len(vocabulary))

    # Ascending by count; the sort is stable, so ties keep first-seen order.
    sorted_vocabulary = sorted(vocabulary.items(), key=lambda item: item[1])
    least_common = sorted_vocabulary[:num_lower]
    # Guard the zero case: a [-0:] slice is the WHOLE list, which used to wipe
    # out every word of a text with a small vocabulary.
    most_common = sorted_vocabulary[-num_upper:] if num_upper > 0 else []

    excluded_words = {word for word, _ in least_common + most_common}
    return ' '.join(word for word in list_words if word not in excluded_words)
    

In [None]:
def ner_filter(text, nlp=None):
    """Return the named entities of a text, without locations and persons.

    Parameters
    ----------
    text : str
        Text to analyse.
    nlp : callable, optional
        An already loaded spaCy pipeline. Loading a model is slow, so pass one
        in when filtering many texts; by default "en_core_web_sm" is loaded on
        each call, as before.

    Returns
    -------
    str
        The text of every entity whose label is not LOC, GPE or PERSON, joined
        with spaces.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    excluded_labels = {'LOC', 'GPE', 'PERSON'}

    doc = nlp(text)
    # NOTE(review): the previous version appended the literal 'JULIETTE' for each
    # kept entity, which looks like a debugging placeholder; the entity text is
    # used instead — confirm this is the intended output.
    return ' '.join(ent.text for ent in doc.ents if ent.label_ not in excluded_labels)
        

In [None]:
def process_figas(text, lower=0.06, upper=0.06):
    """Run the full FiGAS pre-processing on one transcript.

    Steps, in order: remove the speaker lines, normalize sentences, drop English
    stop words, drop the rarest / most frequent words, lemmatize.

    Parameters
    ----------
    text : str
        Raw transcript.
    lower, upper : float
        Shares of the vocabulary removed from the rare and frequent ends
        (passed to `remove_most_least_freq`); both default to the 0.06 that
        was previously hard-coded.

    Returns
    -------
    str
        The processed tokens joined with spaces.
    """
    filter_speaker = remove_speakers(text)
    filter_char = sentences_process(filter_speaker)

    stop_words = set(stopwords.words('english'))
    filter_stopwords = [w for w in word_tokenize(filter_char) if w.lower() not in stop_words]

    filter_freq = remove_most_least_freq(filter_stopwords, lower, upper)

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(filter_freq))
    

In [None]:
# Load every transcript found on disk into one DataFrame (columns: ticker, article, text).
df = import_texts()

In [None]:
# Display the loaded transcripts.
df

In [None]:
# Pre-process every transcript with the FiGAS pipeline (one call per row).
df['text_processed'] = df.text.apply(process_figas)

In [None]:
# Character length of each processed text; 0 marks a text that came out empty.
df['l'] = df.text_processed.apply(len)

In [None]:
# Article ids whose processed text is empty (length 0).
df[df.l==0].article.values

In [None]:
# NOTE(review): looks like a pasted snapshot of the article ids printed by the
# previous cell; it is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
list_process_0 = [3440166, 4358280, 2343215, 4387002,  320064, 4387940, 2600265,
       2375305,  729441,  572641, 3834286, 3965900, 3074656, 2144593,
       4012772,  321956,  234463, 4342792]


In [None]:
# Keep only the rows whose processed text is non-empty. The explicit copy makes
# the result an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df = df[df.l>0].copy()

In [None]:
def top_n_TFIDF_words(n, texts_processed):
    """Return, for each text, its n words with the highest TF-IDF score.

    Parameters
    ----------
    n : int
        Number of words to keep per text.
    texts_processed : iterable of str
        The texts; the TF-IDF vectorizer is fitted on all of them together.

    Returns
    -------
    list of list of str
        One list per text, ordered by decreasing score (fewer than n words when
        the text has fewer distinct terms).
    """
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(texts_processed).tocsr()

    # Reverse the vocabulary mapping: column index -> word.
    index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}

    top_n = []
    for i in range(X_tfidf.shape[0]):
        # Read the stored entries of row i straight from the CSR arrays instead
        # of doing one slow scalar lookup X[i, j] per word.
        start, end = X_tfidf.indptr[i], X_tfidf.indptr[i + 1]
        scored = sorted(
            zip(X_tfidf.indices[start:end], X_tfidf.data[start:end]),
            key=lambda pair: -pair[1],
        )
        top_n.append([index_to_word[index] for index, _ in scored[:n]])

    return top_n

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# For each row, the 10 highest-scoring TF-IDF words; the vectorizer is fitted on
# all the processed texts together.
df['tfidf_top_10_words'] = top_n_TFIDF_words(10, df.text_processed)

In [None]:
# Preview the first 10 rows, including the new TF-IDF column.
df.head(10)

In [None]:
# Save ticker, article id and top words (column renamed to 'top_n_words') as a pickle.
df_pickle = df[['ticker', 'article', 'tfidf_top_10_words']].rename(
    columns={'tfidf_top_10_words': 'top_n_words'}
)
df_pickle.to_pickle("data/sp500_top_10_words_tfidf_proc_figass_by_corpus.pkl")

### TF-IDF BY TEXT

In [None]:
def remove_speakers2(text):
    """Remove the speaker lines from one text, keeping the block boundaries.

    The text is cut wherever a newline, a space and a newline follow each other.
    Blocks with more than three space-separated pieces are kept and joined with
    the '||' marker.
    """
    blocks = text.split('\n \n')
    long_blocks = filter(lambda block: len(block.split(' ')) > 3, blocks)
    return '||'.join(long_blocks)

In [None]:
def process_figas2(text, lower=0.06, upper=0.06):
    """Run the FiGAS pre-processing on one transcript, marking the blocks with '||'.

    Same steps as the corpus-level pipeline, except that speaker removal goes
    through `remove_speakers2`, which joins the kept blocks with '||'.

    Parameters
    ----------
    text : str
        Raw transcript.
    lower, upper : float
        Shares of the vocabulary removed from the rare and frequent ends
        (passed to `remove_most_least_freq`); both default to the 0.06 that
        was previously hard-coded.

    Returns
    -------
    str
        The processed tokens joined with spaces.
    """
    filter_speaker = remove_speakers2(text)
    filter_char = sentences_process(filter_speaker)

    stop_words = set(stopwords.words('english'))
    # NOTE(review): the '||' markers go through tokenization and frequency
    # filtering like any other characters — verify they survive to the output.
    filter_stopwords = [w for w in word_tokenize(filter_char) if w.lower() not in stop_words]

    filter_freq = remove_most_least_freq(filter_stopwords, lower, upper)

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(filter_freq))

In [None]:
def tfidf_by_text(text):
    """Return the top TF-IDF words of one text, computed within that text.

    The text is split on '||', the pieces are grouped two by two (joined with
    '.'), and a TF-IDF model is fitted on these groups. The result is the union
    of the 3 best words of every group.

    Parameters
    ----------
    text : str
        Processed text whose blocks are separated by '||'.

    Returns
    -------
    set of str
        The selected words; empty when the text holds nothing but whitespace
        and separators.
    """
    list_sentences = text.split('||')
    # Nothing to vectorize: fitting on blank documents would raise in the vectorizer.
    if not any(piece.strip() for piece in list_sentences):
        return set()

    out = ['.'.join(list_sentences[k:k+2]) for k in range(0, len(list_sentences), 2)]
    top_3_words = top_n_TFIDF_words(3, out)
    # One union call instead of sum(lists, []), which copies the growing list
    # at every step.
    return set().union(*top_3_words)

In [None]:
# Work on an explicit copy so that adding columns below neither touches `df`
# nor triggers pandas' SettingWithCopyWarning.
df_tfidf_by_text = df[['ticker','article','text']].copy()
df_tfidf_by_text['text_processed'] = df_tfidf_by_text.text.apply(process_figas2)

In [None]:
# Preview the first 10 rows with their per-text processed version.
df_tfidf_by_text.head(10)

In [None]:
# Top TF-IDF words of each transcript, computed inside the transcript itself.
df_tfidf_by_text['top_10_words'] = df_tfidf_by_text.text_processed.apply(tfidf_by_text)

In [None]:
# Display the per-text TF-IDF results.
df_tfidf_by_text

In [None]:
# Save ticker, article id and top words (column renamed to 'top_n_words') as a pickle.
df_pickle2 = df_tfidf_by_text[['ticker', 'article', 'top_10_words']].rename(
    columns={'top_10_words': 'top_n_words'}
)
df_pickle2.to_pickle("data/sp500_top_10_words_tfidf_proc_figass_by_text.pkl")

### Named Entity Recognition

- `python -m spacy download en_core_web_lg`
- `python -m spacy download en_core_web_sm`


SpaCy:
- **PERSON**:      People, including fictional.
- NORP:        Nationalities or religious or political groups.
- FAC:         Buildings, airports, highways, bridges, etc.
- **ORG**:         Companies, agencies, institutions, etc.
- GPE:         Countries, cities, states.
- **LOC**:         Non-GPE locations, mountain ranges, bodies of water.
- PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
- EVENT:       Named hurricanes, battles, wars, sports events, etc.
- WORK_OF_ART: Titles of books, songs, etc.
- LAW:         Named documents made into laws.
- LANGUAGE:    Any named language.
- **DATE**:        Absolute or relative dates or periods.
- **TIME**:        Times smaller than a day.
- PERCENT:     Percentage, including “%”.
- MONEY:       Monetary values, including unit.
- QUANTITY:    Measurements, as of weight or distance.
- ORDINAL:     “first”, “second”, etc.
- CARDINAL:    Numerals that do not fall under another type.


In [None]:

from collections import Counter
# Count how often each token occurs.
# NOTE(review): `tokens_` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel — confirm where it should come from.
dictionary=Counter(tokens_)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build the word cloud from the token counts (not from raw text).
wordcloud = WordCloud(width=1600, height=800, colormap="hsv", background_color='white').generate_from_frequencies(dictionary)
# Draw the generated image without axes, save it to 'imagedist.png', then show it.

plt.figure( figsize=(20,10), facecolor='k')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig('imagedist.png')
plt.show()