# Textual Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# This one is used to load web pages
from urllib.request import urlopen
import re
# This one is for textual analysis
import nltk 
#nltk.download('wordnet')
# This two are for combine lists
from functools import reduce 
from operator import add

## Step 1 function
Packaging everything into a function. 
Because a lot of things might go wrong (i.e., a page might be missing), 
Making sure the code works, but being able to catch errors. 
For that, using a try block, and print out any error. 

https://www.sec.gov/news/press-release/2017-9

In [2]:
# loading data from dta file
df_pr = pd.read_stata('SEC.dta')

## Pre-Processing Ⅰ: Tokenisation
Loading tools to do word tokenisation.
Using different tools like WhitespaceTokenizer, TweetTokenizer, etc

In [3]:
word_list = []
# Loading the tokenization tool
# Here we use whitespacetokenizer, it splits sentence into tokens by white space.
tokenizer = nltk.WhitespaceTokenizer()

for content in df_pr['body']:
    # save tokens for each content
    word_list.append(tokenizer.tokenize(content))

# compute words and unique_words after tokenization    
terms_words = reduce(add, word_list)
all_terms = len(terms_words)
all_unique = len(set(terms_words))

## Pre-Processing Ⅱ: Stopword Removal
Already having split sentences into words,
Needing to remove some stopwords which do not help distinguishone document from another.
Examples from English are 'a', 'the', 'to', "for' and so on.
Using word dictionary to detect which words are stopwords and remove them.

In [4]:
# get stopwords from http://snowball.tartarus.org/algorithms/english/stop.txt
stopwords = urlopen("http://snowball.tartarus.org/algorithms/english/stop.txt").readlines()

# decode word into utf8 encoding
stopword0 = [x.decode('utf-8').strip() for x in stopwords]

# extract stopwords from file lines and clean the words
stopword1 = [x.split('|')[1].strip() if x.startswith('|') else x.split('|')[0].strip() for x in stopword0]
stopword2 = [x for x in stopword1 if len(x) > 0 and len(x.split(' ')) == 1 and not re.findall("[^a-zA-Z']", x)]
# define the punctuation we need to remove
ch='[·’!"\#$%&\'()＃！（）*+,./:;<=>?\@，：?￥★、…．＞【】［］《》？“”‘’\[\\]^_`{|}~]+'

word_list_1 = []
for words in word_list:
    word_list_0 = []
    for word in words:   
        # if word take with '.', but not endwith '.' like u.s., 
        # we dont need to remove the punctuation
        if '.' in word and not word.endswith('.'):
            word = re.sub(re.sub('\.', '', ch), "", word)
        # if word is the last one in a sentence and takes with '.' 
        # we need to remove the '.'
        elif word.endswith('.'):
            word = word[:-1]
        # remove punctuation in word
        else:
            word = re.sub(ch, "", word)            
        # remove the single letter and stopwords
        if len(word) > 1 and word.lower() not in stopword2:
            word_list_0.append(word)
    word_list_1.append(word_list_0)

## Pre-Processing Ⅲ: Linguistic Roots
For many applications, 
the relevant information in tokens is their linguistic root, not their grammatical form. 
Should treat prefer','prefers','preferences' as equivalent tokens.
Two options: Stemming and Lemmatizing

In [5]:
# loading package to stemming, here use Porter-Stemming
import nltk.stem.porter as pt
# loading package to Lemmatizing
from nltk.stem import WordNetLemmatizer  

# use WordNetLemmatizer to Lemmatizing
lemmatizer = WordNetLemmatizer()
# use Porter-Stemming to Stemming
pt_Stemmer = pt.PorterStemmer()

word_lemmatize = []
word_Stemming = []
# lemmatize or Stemming
for words in word_list_1:
    wl = []
    ws = []
    for word in words:
        # Stemming
        word_stem = pt_Stemmer.stem(word)
        ws.append(word_stem)

        # Lemmatizing
        word_lem = lemmatizer.lemmatize(word, 'n') # noun
        word_lem = lemmatizer.lemmatize(word_lem, 'v') # verb
        word_lem = lemmatizer.lemmatize(word_lem, 'a')  # adjective
        word_lem = lemmatizer.lemmatize(word_lem, 'r') # adverb
        word_lem = lemmatizer.lemmatize(word_lem, 's') # satellite adj
        wl.append(word_lem)

    word_lemmatize.append(wl)
    word_Stemming.append(ws)

## Pre-Processing IV: Multi-Word Phrases
Sometimes groups of individual tokens like "Bank Indonesia" or "text mining" have a specific meaning.
Detecting which phrase should be keep.
One ad-hoc strategy is to tabulate the frequency of all unique two-token(bigram) or three-token (trigram) phrases in the data, and convert themost common into a single token.


In [6]:
# combine two or three words as a phrase, tabulate their frequency
word_dict_lemmatize = {}
for words in word_lemmatize:
    for i in range(len(words)-1):
        for l in range(2, 4):
            phrase =  ' '.join(words[i:i+l]).lower()
            word_dict_lemmatize[phrase] = 1 + word_dict_lemmatize.get(phrase, 0)

# sort the phrase by their frequencies
twograms_tf = sorted(word_dict_lemmatize, key=lambda x: word_dict_lemmatize[x], reverse=True)

# convert the most common two-token or three-token into a single token
new_words = []
for words in word_lemmatize:
    n = len(words)
    i = 0
    nw = []
    while i < n:
        # l-tokens combination
        l = 3
        # check if the combination should be converted or not, 
        # check two-token and three-token
        while l > 1:
            # if the three-tokens is treated as phrase, dont treat as two-tokens 
            if ' '.join(words[i:i+l]).lower() in twograms_tf[:50]:
                # convert into a single token
                nw.append(' '.join(words[i:i+l]).lower())
                i += l
                break
            l -= 1
        # those dont convert
        if l == 1:
            nw.append(words[i].lower())
            i += 1
    new_words.append(nw)

# compute words and unique_words after processing
processed_terms_words = reduce(add, new_words)
all_process_terms = len(processed_terms_words)
all_process_unique = len(set(processed_terms_words))

In [7]:
words_stat = pd.DataFrame({'All_terms': [all_terms, all_unique], 'Processed': [all_process_terms, all_process_unique]})
words_stat.index = ['#total terms', '#unique terms']
words_stat

Unnamed: 0,All_terms,Processed
#total terms,16539,9276
#unique terms,4009,2489


In [8]:
# append tokens to dataframe
df_pr['words'] = new_words

In [9]:
df_pr.head()

Unnamed: 0,year,item,date,location,title,body,words
0,2017,1,2017-01-06,Washington D.C.,SEC Awards $5.5 Million to Whistleblower,The Securities and Exchange Commission today a...,"[securities exchange commission, today announc..."
1,2017,2,2017-01-09,Washington D.C.,SEC Charges Two Brokers With Defrauding Customers,The Securities and Exchange Commission today c...,"[securities exchange commission, today, charge..."
2,2017,3,2017-01-09,Washington D.C.,"Investment Adviser, Lawyer Settle Charges in S...",The Securities and Exchange Commission today a...,"[securities exchange commission, today announc..."
3,2017,4,2017-01-10,Washington D.C.,SEC: Port Authority Omitted Risks to Investors...,The Securities and Exchange Commission today a...,"[securities exchange commission, today announc..."
4,2017,5,2017-01-11,Washington D.C.,SEC Charges Government Contractor With Inadequ...,The Securities and Exchange Commission today a...,"[securities exchange commission, today announc..."


In [10]:
# import package to process date data
import datetime as dt
# extract month from date
df_pr['new_date'] = df_pr['date'].map(lambda x: dt.datetime.strftime(x, '%Y-%m'))

In [None]:
# groupby month and get docs together from each month
keywords_info = df_pr.groupby('new_date', as_index=False).apply(lambda x: [list(x['words'])])
keywords_info.columns = ['new_date', 'words']

In [12]:
for words in keywords_info['words']:
    # combine words into sentence
    doc_words = reduce(add, words)

## TF-IDF

In [13]:
# define a function to compute some weights of keyword
def get_weights(keyword, keywords_info):
      # record how many docs contain keyword
      Di = 0
      # record the frequency of keyword in the ith document
      TF = []
      Count = []
      g = []
      for words in keywords_info['words']:
            # combine sentences into sentence
            doc_words = reduce(add, words)
            if keyword in doc_words:
                  Di += 1
            words_frequency = doc_words.count(keyword)
            Count.append(words_frequency)
            Tf = words_frequency
            TF.append(1 + np.log(Tf) if Tf > 0 else 0)
            g.append(len(doc_words))
      D = keywords_info.shape[0]            
      IDF = np.log((D+1) / (1 + Di)) + 1
      TF_IDF = [tf*IDF for tf in TF]
      weights = pd.DataFrame()
      for i in range(D):
            weights['Count Doc'+str(i+1)] = [Count[i]]
            weights['Counts'+str(i+1)] = [g[i]]

            weights['TF Doc'+str(i+1)] = [TF[i]]
            weights['TF_IDF Doc'+str(i+1)] = [TF_IDF[i]]
      weights.index = [keyword]
      return weights

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# define a function to compute some weights of keyword with package sklearn
def get_weights_by_sklearn(keyword, keywords_info):

      corpus = []
      all_words = []
      for words in keywords_info['words']:
            # combine tokens into sentence for each doc
            doc = reduce(add, words)
            corpus.append(' '.join(doc))
            all_words += reduce(add, words)

      # initial TfidfTransformer and count vectorizer 
      count_vec = CountVectorizer(vocabulary=set(all_words))
      tfidf_vec = TfidfTransformer(use_idf=False)
      # get tfidf_matrix
      tf_matrix = count_vec.fit_transform(corpus).toarray()
      tfidf_matrix = tfidf_vec.fit_transform(count_vec.fit_transform(corpus)).toarray()

      weights = pd.DataFrame()
      for i in range(len(corpus)):
            # count_vec.vocabulary_[keyword]: get the index of keywords in vocabulary
            weights['Count Doc'+str(i+1)] = [tf_matrix[i,count_vec.vocabulary_[keyword]]]
            weights['Counts'+str(i+1)] = [sum(tf_matrix[i,:])]
            tf = tf_matrix[i,count_vec.vocabulary_[keyword]]
            weights['TF Doc'+str(i+1)] = [1 + np.log(tf) if tf > 0 else 0]
            weights['TF_IDF Doc'+str(i+1)] = [tfidf_matrix[i,count_vec.vocabulary_[keyword]]]
      weights.index = [keyword]
      return weights

In [15]:
# get the weights of disclosure
get_weights('disclosure', keywords_info)

Unnamed: 0,Count Doc1,Counts1,TF Doc1,TF_IDF Doc1,Count Doc2,Counts2,TF Doc2,TF_IDF Doc2,Count Doc3,Counts3,TF Doc3,TF_IDF Doc3,Count Doc4,Counts4,TF Doc4,TF_IDF Doc4,Count Doc5,Counts5,TF Doc5,TF_IDF Doc5
disclosure,2,2763,1.693147,1.693147,1,2448,1.0,1.0,2,974,1.693147,1.693147,4,1960,2.386294,2.386294,1,1131,1.0,1.0


In [16]:
get_weights_by_sklearn('disclosure', keywords_info)

Unnamed: 0,Count Doc1,Counts1,TF Doc1,TF_IDF Doc1,Count Doc2,Counts2,TF Doc2,TF_IDF Doc2,Count Doc3,Counts3,TF Doc3,TF_IDF Doc3,Count Doc4,Counts4,TF Doc4,TF_IDF Doc4,Count Doc5,Counts5,TF Doc5,TF_IDF Doc5
disclosure,2,2935,1.693147,0.010984,1,2485,1.0,0.007352,2,996,1.693147,0.030261,4,2028,2.386294,0.035823,1,1192,1.0,0.012912


## Word embedding

In [17]:
# import package to transfer word to vector
from gensim.models import word2vec
# define w2v model
model = word2vec.Word2Vec(reduce(add, keywords_info['words']), min_count=1)

In [18]:
all_words = []
for words in keywords_info['words']:
    # combine tokens into sentence for each doc
    all_words += reduce(add, words)

In [19]:
# import package to define custom key for sorting
from functools import cmp_to_key
# sort word by similar to disclosure
sort_words = sorted(set(all_words), key=cmp_to_key(lambda x, y: model.wv.similarity(x, 'disclosure') - model.wv.similarity(y, 'disclosure')), reverse=True)
# get top 20 words
most_sim_words = sort_words[:21]

In [20]:
most_sim_words

['disclosure',
 'include',
 'allege',
 'order',
 'seek',
 'use',
 'payment',
 'involve',
 'trade',
 'million',
 'regional office',
 'university',
 'act',
 'work',
 'ocie',
 'compliance',
 'period',
 'district',
 'continue',
 'account',
 'control']

In [21]:
# get top 20 words weights
for i, keyword in enumerate(most_sim_words):
    if i > 0:
        w = pd.concat([w, (get_weights(keyword, keywords_info))], axis=0)
        w1 = pd.concat([w1, (get_weights_by_sklearn(keyword, keywords_info))], axis=0)
    else:
        w = get_weights(keyword, keywords_info)
        w1 = get_weights_by_sklearn(keyword, keywords_info)

In [22]:
w

Unnamed: 0,Count Doc1,Counts1,TF Doc1,TF_IDF Doc1,Count Doc2,Counts2,TF Doc2,TF_IDF Doc2,Count Doc3,Counts3,TF Doc3,TF_IDF Doc3,Count Doc4,Counts4,TF Doc4,TF_IDF Doc4,Count Doc5,Counts5,TF Doc5,TF_IDF Doc5
disclosure,2,2763,1.693147,1.693147,1,2448,1.0,1.0,2,974,1.693147,1.693147,4,1960,2.386294,2.386294,1,1131,1.0,1.0
include,12,2763,3.484907,3.484907,9,2448,3.197225,3.197225,5,974,2.609438,2.609438,13,1960,3.564949,3.564949,8,1131,3.079442,3.079442
allege,1,2763,1.0,1.0,10,2448,3.302585,3.302585,6,974,2.791759,2.791759,4,1960,2.386294,2.386294,6,1131,2.791759,2.791759
order,19,2763,3.944439,3.944439,4,2448,2.386294,2.386294,3,974,2.098612,2.098612,14,1960,3.639057,3.639057,5,1131,2.609438,2.609438
seek,3,2763,2.098612,2.098612,7,2448,2.94591,2.94591,5,974,2.609438,2.609438,6,1960,2.791759,2.791759,3,1131,2.098612,2.098612
use,8,2763,3.079442,3.079442,11,2448,3.397895,3.397895,2,974,1.693147,1.693147,8,1960,3.079442,3.079442,4,1131,2.386294,2.386294
payment,6,2763,2.791759,3.923721,0,2448,0.0,0.0,0,974,0.0,0.0,7,1960,2.94591,4.140374,3,1131,2.098612,2.949526
involve,7,2763,2.94591,3.483013,2,2448,1.693147,2.001844,3,974,2.098612,2.481235,5,1960,2.609438,3.085195,0,1131,0.0,0.0
trade,14,2763,3.639057,3.639057,2,2448,1.693147,1.693147,3,974,2.098612,2.098612,12,1960,3.484907,3.484907,8,1131,3.079442,3.079442
million,30,2763,4.401197,4.401197,6,2448,2.791759,2.791759,8,974,3.079442,3.079442,11,1960,3.397895,3.397895,13,1131,3.564949,3.564949


In [23]:
w1

Unnamed: 0,Count Doc1,Counts1,TF Doc1,TF_IDF Doc1,Count Doc2,Counts2,TF Doc2,TF_IDF Doc2,Count Doc3,Counts3,TF Doc3,TF_IDF Doc3,Count Doc4,Counts4,TF Doc4,TF_IDF Doc4,Count Doc5,Counts5,TF Doc5,TF_IDF Doc5
disclosure,2,2935,1.693147,0.010984,1,2485,1.0,0.007352,2,996,1.693147,0.030261,4,2028,2.386294,0.035823,1,1192,1.0,0.012912
include,12,2935,3.484907,0.065905,9,2485,3.197225,0.066164,5,996,2.609438,0.075653,13,2028,3.564949,0.116425,8,1192,3.079442,0.103297
allege,1,2935,1.0,0.005492,10,2485,3.302585,0.073516,6,996,2.791759,0.090784,8,2028,3.079442,0.071646,8,1192,3.079442,0.103297
order,38,2935,4.637586,0.2087,4,2485,2.386294,0.029406,3,996,2.098612,0.045392,15,2028,3.70805,0.134336,12,1192,3.484907,0.154945
seek,3,2935,2.098612,0.016476,7,2485,2.94591,0.051461,5,996,2.609438,0.075653,6,2028,2.791759,0.053734,3,1192,2.098612,0.038736
use,8,2935,3.079442,0.043937,11,2485,3.397895,0.080867,2,996,1.693147,0.030261,8,2028,3.079442,0.071646,4,1192,2.386294,0.051648
payment,6,2935,2.791759,0.032953,0,2485,0.0,0.0,0,996,0.0,0.0,7,2028,2.94591,0.06269,3,1192,2.098612,0.038736
involve,7,2935,2.94591,0.038445,2,2485,1.693147,0.014703,3,996,2.098612,0.045392,5,2028,2.609438,0.044779,0,1192,0.0,0.0
trade,14,2935,3.639057,0.076889,2,2485,1.693147,0.014703,3,996,2.098612,0.045392,12,2028,3.484907,0.107469,8,1192,3.079442,0.103297
million,30,2935,4.401197,0.164763,6,2485,2.791759,0.044109,8,996,3.079442,0.121046,12,2028,3.484907,0.107469,13,1192,3.564949,0.167857
