# Initial Modelling: LDA and SVM

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import lda
from sklearn import svm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
#import nltk
#nltk.download()

In [2]:
# Read in the statements
statements = pd.read_csv("statements_with_labels.csv")
statements.head()

Unnamed: 0.1,Unnamed: 0,date,text,labels
0,0,1994-02-04,Chairman Alan Greenspan announced today that t...,1
1,1,1994-03-22,Chairman Alan Greenspan announced today that t...,0
2,2,1994-04-18,Chairman Alan Greenspan announced today that t...,1
3,3,1994-05-17,The Federal Reserve today announced two action...,0
4,4,1994-08-16,The Federal Reserve announced today the follow...,0


In [3]:
# Tokenize the statments
statements_tokenized = [word_tokenize(i) for i in statements['text']]

### 1. LDA

In [4]:
# Create vocabulary from statements
words = []
for i in statements_tokenized:
    for j in i:
        words.append(j.lower())
vocab_base = list(set(words))

In [5]:
len(vocab_base)

1879

In [6]:
# Determine longest statements
max_length = max([len(i) for i in statements_tokenized])
max_length

1092

In [8]:
# Get tokens by finding vocab indices
embeddings = []
for d in statements_tokenized:
    stmt = []
    for h in d:
        stmt.append(vocab_base.index(h.lower()))
    embeddings.append(stmt)
    if len(d) < max_length:
        padding = max_length - len(d)
        stmt.extend(list(np.zeros(padding,dtype=np.int8)))
    

In [9]:
# Turn embeddings into np array
X = np.array([np.array(xi) for xi in embeddings])
print(X.shape)

(171, 1092)


In [9]:
# Run LDA model
model = lda.LDA(n_topics=5, n_iter=700, random_state=1)
model.fit(X)  # model.fit_transform(X) is also available
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 171
INFO:lda:vocab_size: 1092
INFO:lda:n_words: 75237140
INFO:lda:n_topics: 5
INFO:lda:n_iter: 700
INFO:lda:<0> log likelihood: -625306577
INFO:lda:<10> log likelihood: -566377507
INFO:lda:<20> log likelihood: -526344831
INFO:lda:<30> log likelihood: -521930268
INFO:lda:<40> log likelihood: -521096066
INFO:lda:<50> log likelihood: -521294667
INFO:lda:<60> log likelihood: -521426356
INFO:lda:<70> log likelihood: -521490812
INFO:lda:<80> log likelihood: -521353186
INFO:lda:<90> log likelihood: -521223351
INFO:lda:<100> log likelihood: -521131788
INFO:lda:<110> log likelihood: -521017657
INFO:lda:<120> log likelihood: -520857315
INFO:lda:<130> log likelihood: -520670322
INFO:lda:<140> log likelihood: -520508173
INFO:lda:<150> log likelihood: -520351314
INFO:lda:<160> log likelihood: -520207803
INFO:lda:<170> log likelihood: -520048442
INFO:lda:<180> log likelihood: -519911142
INFO:lda:<190> log likelihood: -519818859
INFO:lda:<200> log likelihood: -519698197
INFO:lda

Topic 0: trillion unusually normalization signify 1 having suggested 18
Topic 1: timing advances facilitating 3/4 holiday indicators over end
Topic 2: still-robust nearly top total data mcdonough dudley confirms
Topic 3: pay obtain strengthened risen so weakness choice growth
Topic 4: tools 25-basis reaffirmed and shortly observed in preferred


In [13]:
t = pd.DataFrame(topic_words)
t.to_csv("topic_words1.csv")

#### 1. (a) Remove numbers


In [10]:
import re, string
 
def clearup(s, chars):
    return re.sub('[%s]' % chars, '', s).lower()
 
s = 'This is %a t1e22st !st4ring6 w.it6h 87embed766ded punct,:ua-tion and nu=mbe]rS6.'
 
print(clearup(s, string.punctuation+string.digits))

statements_nonums = []
for j in statements_tokenized:
    state = []
    for i in j:
        i = clearup(i, string.punctuation+string.digits)
        if i:
            state.append(i)
    statements_nonums.append(state)

this is a test string with embedded punctuation and numbers


In [11]:
# Create new vocabulary
words = []
for i in statements_nonums:
    for j in i:
        words.append(j.lower())
vocab_nonums = list(set(words))

len(vocab_nonums)

1771

In [12]:
# How many words were lost?
print(str(len(vocab_base) - len(vocab_nonums)) + ' words removed from vocab')

108 words removed from vocab


In [13]:
# Determine longest statements
max_length = max([len(i) for i in statements_nonums])
max_length

967

In [14]:
# Get tokens by finding vocab indices
embeddings = []
for d in statements_nonums:
    stmt = []
    for h in d:
        stmt.append(vocab_nonums.index(h.lower()))
    embeddings.append(stmt)
    if len(d) < max_length:
        padding = max_length - len(d)
        stmt.extend(list(np.zeros(padding,dtype=np.int8)))
        
# Turn embeddings into np array
X = np.array([np.array(xi) for xi in embeddings])
print(X.shape)

(171, 967)


In [16]:
# Run LDA model
model2 = lda.LDA(n_topics=5, n_iter=200, random_state=1)
model2.fit(X)  # model.fit_transform(X) is also available
topic_word = model2.topic_word_  # model.components_ also works
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab_nonums)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 171
INFO:lda:vocab_size: 967
INFO:lda:n_words: 64027649
INFO:lda:n_topics: 5
INFO:lda:n_iter: 200
INFO:lda:<0> log likelihood: -526134615
INFO:lda:<10> log likelihood: -485313855
INFO:lda:<20> log likelihood: -442209375
INFO:lda:<30> log likelihood: -437375939
INFO:lda:<40> log likelihood: -437512488
INFO:lda:<50> log likelihood: -437686952
INFO:lda:<60> log likelihood: -437766753
INFO:lda:<70> log likelihood: -437769266
INFO:lda:<80> log likelihood: -437742172
INFO:lda:<90> log likelihood: -437690209
INFO:lda:<100> log likelihood: -437623648
INFO:lda:<110> log likelihood: -437565424
INFO:lda:<120> log likelihood: -437494498
INFO:lda:<130> log likelihood: -437436903
INFO:lda:<140> log likelihood: -437380204
INFO:lda:<150> log likelihood: -437318508
INFO:lda:<160> log likelihood: -437266535
INFO:lda:<170> log likelihood: -437205502
INFO:lda:<180> log likelihood: -437156619
INFO:lda:<190> log likelihood: -437099943
INFO:lda:<199> log likelihood: -437056814


Topic 0: dislocation necessary fail experience uncertain convincingly jerome nonetheless
Topic 1: timing persistent toll attributable reserve likelihood power kashkari
Topic 2: damping evans no fostered he stable quarter damped
Topic 3: shortfalls seizingup third check carry theprices conducive closer
Topic 4: judgment ensuring longerrun constrained taking must steps interestsensitive


### 2. SVM

In [17]:
df = statements['text']
y = statements['labels']

In [18]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
clf = svm.SVC()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
clf.score(X_test, y_test)

0.62857142857142856

In [20]:
predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

https://nlp.stanford.edu/IR-book/html/htmledition/support-vector-machines-and-machine-learning-on-documents-1.html
https://gate.ac.uk/sale/nle-svm/svm-ie.pdf
https://www.quora.com/How-do-I-train-a-SVM-classifier-from-text-examples
