# Initial Modelling: LDA and SVM

In [17]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import lda
from sklearn import svm
import matplotlib.pyplot as plt

import sklearn.model_selection as ms
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb

In [78]:
#import nltk
#nltk.download()

In [18]:
# Read in the statements
statements = pd.read_csv("statements_with_labels.csv")
statements.head()

Unnamed: 0.1,Unnamed: 0,date,text,labels
0,0,1994-02-04,Chairman Alan Greenspan announced today that t...,1
1,1,1994-03-22,Chairman Alan Greenspan announced today that t...,0
2,2,1994-04-18,Chairman Alan Greenspan announced today that t...,1
3,3,1994-05-17,The Federal Reserve today announced two action...,0
4,4,1994-08-16,The Federal Reserve announced today the follow...,0


In [19]:
# Tokenize the statments
statements_tokenized = [word_tokenize(i) for i in statements['text']]
print(statements_tokenized[0][0:5])

['Chairman', 'Alan', 'Greenspan', 'announced', 'today']


### 1. Functions to run on feature sets

#### 1. (a) Clustering Algorithms

In [20]:
# LDA
def run_LDA(X, vocab, n_topics=8, n_iter=10):
    """ Function to run LDA clustering on featurized input 
    
    Inputs: 
        - featurized matrix
        - vocabulary to match those features
        - n_topics: number of clusters
        - n_iter: number of iterations
        
    """
    model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=1)
    model.fit(X)  
    topic_word = model.topic_word_ 
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

#### 1. (b) Sentiment Analysis

In [21]:
# SVM
def run_SVM(X_train, y_train, X_test, y_test):
    """ Function to run SVM on feature set to predict sentiment of statement 
    
    Inputs:
        - featurized matrix
        - y: labels
        
    Returns: 
        - score of model
    
    """
    # Initialize model
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    # Return score
    return clf.score(X_test, y_test)

In [22]:
# NB
def run_NB(X_train, y_train, X_test, y_test):
    """ Function to run SVM on feature set to predict sentiment of statement 
    
    Inputs:
        - featurized matrix
        - y: labels
        
    Returns: 
        - score of model
    
    """
    # Initialize model
    bnb = ms.GridSearchCV(
            nb.BernoulliNB(),
            param_grid={'alpha': np.logspace(-2., 2., 50)})
    bnb.fit(X_train, y_train)
    # Return score
    return bnb.score(X_test, y_test)

## 2. Create feature sets and run models

#### 2. (a) Basic

In [23]:
# Create vocabulary from statements
words = []
for i in statements_tokenized:
    for j in i:
        words.append(j.lower())
vocab_base = list(set(words))

In [24]:
len(vocab_base)

1879

In [25]:
# Determine longest statements
max_length = max([len(i) for i in statements_tokenized])
max_length

1092

In [26]:
# Get tokens by finding vocab indices
embeddings = []
for d in statements_tokenized:
    stmt = []
    for h in d:
        stmt.append(vocab_base.index(h.lower()))
    embeddings.append(stmt)
    if len(d) < max_length:
        # pad the shorter statements with 0s to make them the same length
        padding = max_length - len(d)
        stmt.extend(list(np.zeros(padding,dtype=np.int8)))

In [27]:
# Turn embeddings into np array
X_base = np.array([np.array(xi) for xi in embeddings])
print(X_base.shape)

(171, 1092)


In [82]:
# Run LDA
run_LDA(X=X_base, vocab=vocab_base)

INFO:lda:n_documents: 171
INFO:lda:vocab_size: 1092
INFO:lda:n_words: 64870773
INFO:lda:n_topics: 8
INFO:lda:n_iter: 10
INFO:lda:<0> log likelihood: -569755844
INFO:lda:<9> log likelihood: -514249488


Topic 0: persistently 5-1/2 owing pressure suggest sharply markets reserve
Topic 1: growthmore persistently remained 28-day 5-1/2 websites pressure changed
Topic 2: growthmore persistently remained duke created ¼ determining pressure
Topic 3: shown show vote storm-related attributable action restrictive going
Topic 4: persistently owing measured.nonetheless be balance harvey quite gradually
Topic 5: growthmore 2014 solid remained jr extension respect tenders
Topic 6: persistently growthmore remained duke pressure 5-1/2 necessary subdued
Topic 7: foreign owing weigh measured.nonetheless persistently olson balance executed


In [13]:
#t = pd.DataFrame(topic_words)
#t.to_csv("topic_words1.csv")

In [28]:
# Run SVM

# Split data
X_train, X_test, y_train, y_test = ms.train_test_split(X_base, statements['labels'], test_size=0.2)

acc = run_SVM(X_train, y_train, X_test, y_test)
print("Model 1: SVM on Basic Vectorized Matrix")
print("Accuracy: " + str(acc))

Model 1: SVM on Basic Vectorized Matrix
Accuracy: 0.628571428571


In [29]:
acc2 = run_NB(X_train, y_train, X_test, y_test)
print("NB Accuracy: " + str(acc2))

NB Accuracy: 0.571428571429


### 1. Remove numbers


In [30]:
import re, string
 
def clearup(s, chars):
    return re.sub('[%s]' % chars, '', s).lower()
 
s = 'This is %a t1e22st !st4ring6 w.it6h 87embed766ded punct,:ua-tion and nu=mbe]rS6.'
 
print(clearup(s, string.punctuation+string.digits))

statements_nonums = []
for j in statements_tokenized:
    state = []
    for i in j:
        i = clearup(i, string.punctuation+string.digits)
        if i:
            state.append(i)
    statements_nonums.append(state)

this is a test string with embedded punctuation and numbers


In [31]:
# Create new vocabulary
words = []
for i in statements_nonums:
    for j in i:
        words.append(j.lower())
vocab_nonums = list(set(words))

len(vocab_nonums)

1771

In [32]:
# How many words were lost?
print(str(len(vocab_base) - len(vocab_nonums)) + ' words removed from vocab')

108 words removed from vocab


In [33]:
# Determine longest statements
max_length = max([len(i) for i in statements_nonums])
max_length

967

In [34]:
# Get tokens by finding vocab indices
embeddings = []
for d in statements_nonums:
    stmt = []
    for h in d:
        stmt.append(vocab_nonums.index(h.lower()))
    embeddings.append(stmt)
    if len(d) < max_length:
        padding = max_length - len(d)
        stmt.extend(list(np.zeros(padding,dtype=np.int8)))
        
# Turn embeddings into np array
X_nonums = np.array([np.array(xi) for xi in embeddings])
print(X_nonums.shape)

(171, 967)


In [35]:
# Run LDA
run_LDA(X=X_nonums, vocab=vocab_nonums)

INFO:lda:n_documents: 171
INFO:lda:vocab_size: 967
INFO:lda:n_words: 65237604
INFO:lda:n_topics: 8
INFO:lda:n_iter: 10
INFO:lda:<0> log likelihood: -566405708
INFO:lda:<9> log likelihood: -512556364


Topic 0: requests next targeted cleveland began extended believes strengthened
Topic 1: promising directive eased seeks fundamentals banks fostering foreign
Topic 2: than having range strained closely that most eroded
Topic 3: accompanying willing discussed directors confirms pay facilitate primary
Topic 4: next eased requests promising directive medium effect strengthened
Topic 5: directive pushed disparity goods stress overtime purchases mainly
Topic 6: promising fall region firming better meanwhile eased next
Topic 7: next requests targeted implied extended lacker rose exacerbated


In [36]:
# Run SVM

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_nonums, statements['labels'], test_size=0.2)

acc = run_SVM(X_train, y_train, X_test, y_test)
print("Model 1: SVM on matrix with numbers removed")
print("Accuracy: " + str(acc))

Model 1: SVM on matrix with numbers removed
Accuracy: 0.542857142857


### 2. tfidVectorizor

In [37]:
y = statements['labels']

# Implement vectorizer
tf = text.TfidfVectorizer()
X = tf.fit_transform(statements['text'])
print(X.shape)

(171, 1764)


In [38]:
p = 100 * X.nnz / float(X.shape[0] * X.shape[1])
print(f"Each sample has ~{p:.2f}% non-zero features.")

Each sample has ~11.18% non-zero features.


In [39]:
(X_train, X_test, y_train, y_test) = ms.train_test_split(X, y, test_size=.2)

In [40]:
bnb = ms.GridSearchCV(
    nb.BernoulliNB(),
    param_grid={'alpha': np.logspace(-2., 2., 50)})
bnb.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-02,   1.20679e-02,   1.45635e-02,   1.75751e-02,
         2.12095e-02,   2.55955e-02,   3.08884e-02,   3.72759e-02,
         4.49843e-02,   5.42868e-02,   6.55129e-02,   7.90604e-02,
         9.54095e-02,   1.15140e-01,   1.38950e-01,   1.67683e-01,
         2....    3.90694e+01,   4.71487e+01,   5.68987e+01,   6.86649e+01,
         8.28643e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [41]:
print(bnb.score(X_test, y_test))

0.571428571429


### 3. Bigrams

In [42]:
from sklearn.feature_extraction.text import CountVectorizer


In [43]:
# Build bigram
cv = CountVectorizer(analyzer='char_wb', ngram_range=(2,2), min_df = 0)
X = cv.fit_transform(statements['text']).toarray()
print(X.shape)

(171, 673)


In [44]:
tf_vec = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')

tf = tf_vec.fit_transform(statements['text'])
print(tf.shape)

(171, 1144)


In [45]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [48]:
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components=8, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda_model.fit(tf)

print("\nTopics in LDA model:")
tf_feature_names = tf_vec.get_feature_names()
print_top_words(lda, tf_feature_names, 10)


Topics in LDA model:
Topic #0: inflation action policy growth chairman basis approved price needed stability
Topic #1: central bank arrangements national reserve actions markets funding banks strains
Topic #2: inflation policy labor longer securities term conditions agency funds monetary
Topic #3: stability action board contained term decided evans losses points billion
Topic #4: inflation growth implied pressures chairman resource sustain policy donald warsh
Topic #5: inflation securities reserve policy funds agency conditions operations monetary range
Topic #6: action reserve approved growth board discount basis today banks 25
Topic #7: markets growth chairman jr credit policy donald monetary kohn period



https://nlp.stanford.edu/IR-book/html/htmledition/support-vector-machines-and-machine-learning-on-documents-1.html
https://gate.ac.uk/sale/nle-svm/svm-ie.pdf
https://www.quora.com/How-do-I-train-a-SVM-classifier-from-text-examples
