In [2]:
import os
import sys

# Make the project's 'src' directory importable from this notebook.
# os.pardir is the parent-directory token, so this is equivalent to
# os.path.join(os.getcwd(), '../src').
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Only add the entry once, so re-running this cell does not pile up
# duplicate sys.path entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Load the "autoreload" extension
%load_ext autoreload
# mode 2: reload all modules (except those excluded with "%aimport") before executing code
%autoreload 2



In [18]:
import gensim
import gzip
from features import preprocessor
from nltk.corpus import stopwords
import numpy as np
import math
from sklearn import metrics

In [34]:
def load_corpus(data_file, label_index, text_index):
    """Read a gzip-compressed, tab-separated corpus file.

    Each line is decoded as UTF-8, stripped of its line terminator and split
    on tabs; one column is taken as the document text, another as its label.

    Args:
        data_file: path to the gzip-compressed TSV file.
        label_index: zero-based column index of the label.
        text_index: zero-based column index of the document text.

    Returns:
        A two-element list ``[data, target]`` where ``data`` holds the text
        column and ``target`` the label column, both in file order.

    Raises:
        IndexError: if a line has fewer columns than the requested indices.
    """
    target = []
    data = []

    # The context manager closes the file; no explicit close() is needed.
    with gzip.open(data_file, 'rb') as f:
        for line in f:
            # Drop the line terminator so the last column does not carry a
            # trailing newline (a label in the last column would otherwise
            # never compare equal to '0' or '1').
            line_arr = line.decode('utf8').rstrip('\r\n').split('\t')
            data.append(line_arr[text_index])
            target.append(line_arr[label_index])

    return [data, target]

def docs2vecs(docs, dictionary):
    """Turn tokenised documents into bag-of-words vectors.

    docs: iterable of documents; each one is handed unchanged to
        ``dictionary.doc2bow``.
    dictionary: object exposing ``doc2bow`` (a gensim.corpora.Dictionary
        in this notebook).

    Returns a list holding one ``doc2bow`` result per document, in input order.
    """
    bow_vectors = []
    for tokens in docs:
        bow_vectors.append(dictionary.doc2bow(tokens))
    return bow_vectors

def save_topics(filename, topic_list=None):
    """Write topics to a text file, one tab-separated topic per line.

    Args:
        filename: path of the output file; it is opened in write mode, so an
            existing file is overwritten.
        topic_list: iterable of indexable items; element 0 and element 1 of
            each item are written as ``str`` separated by a tab. When omitted,
            the notebook-level ``topics`` variable is used, which keeps
            existing ``save_topics(filename)`` calls working.

    Raises:
        NameError: if ``topic_list`` is omitted and no notebook-level
            ``topics`` variable has been defined yet.
    """
    if topic_list is None:
        # Backward-compatible fallback to the notebook global.
        topic_list = topics

    # The context manager closes the file; no explicit close() is needed.
    with open(filename, 'w') as f:
        for topic in topic_list:
            f.write(str(topic[0]) + '\t' + str(topic[1]) + '\n')

def get_entropy(topic_prob_dist, num_topics):
    """Base-2 Shannon entropy of a topic distribution, divided by num_topics.

    Args:
        topic_prob_dist: iterable of ``(topic, probability)`` pairs.
        num_topics: divisor applied to the summed entropy.

    Returns:
        ``-(sum of p * log2(p)) / num_topics``. Entries with probability 0
        contribute nothing (the usual 0 * log 0 = 0 convention) instead of
        raising a math domain error.

    Raises:
        ValueError: if a probability is negative.
        ZeroDivisionError: if ``num_topics`` is 0.
    """
    entropy = 0
    for _topic, prob in topic_prob_dist:
        if prob == 0:
            # log(0) is undefined; a zero-probability topic adds no entropy.
            continue
        entropy += prob * math.log(prob, 2)
    return entropy / num_topics * -1.0

def corpus_ent_stats(entropy_list):
    """Summarise a collection of entropy values with NumPy.

    Returns a dict with the keys 'mean', 'med', 'std', 'var', 'max' and
    'min' (in that order), each computed over ``entropy_list``.
    """
    return {
        'mean': np.average(entropy_list),
        'med': np.median(entropy_list),
        'std': np.std(entropy_list),
        'var': np.var(entropy_list),
        'max': np.amax(entropy_list),
        'min': np.amin(entropy_list),
    }

def predict(entropy_list, threshold):
    """Map each entropy value to a string label.

    A value greater than or equal to ``threshold`` becomes '1'; anything
    else becomes '0'. The returned list follows the input order.
    """
    return ['1' if value >= threshold else '0' for value in entropy_list]



In [4]:
# Load the corpus used for topic modelling: column 0 is read as the label,
# column 1 as the document text.
data, labels = load_corpus('../data/train/objectivity-large.2-label.tsv.gz', 0, 1)

# Start from NLTK's English stop words, then add extra terms to filter out.
stop_words = stopwords.words('english')
extra_stop_words = [
    'would', 'said', 'say', 'year', 'day', 'also', 'first', 'last',
    'one', 'two', 'people', 'told', 'new', 'could', 'singapore', 'three',
    'may', 'like', 'world', 'since', 'mr', 'time', 'even', 'make',
    'many', 'take', 'still', 'well', 'get', 'want', 'made', 'go',
    'much', 'dr',
]
stop_words.extend(extra_stop_words)

In [5]:
# Clean every row with the project preprocessor (intended to remove stop
# words, punctuation, special characters etc.), then split the cleaned text
# on whitespace. Result: one list of words per corpus row.
cleaned_rows = (preprocessor.process_data(row, stop_words) for row in data)
pp_data = [cleaned.split() for cleaned in cleaned_rows]

In [6]:
# Build a gensim dictionary from the tokenised rows
data_dictionary = gensim.corpora.Dictionary(pp_data)
# Show the dictionary before any vocabulary filtering is applied
print(data_dictionary)



In [7]:
# Prune the vocabulary in place: keep only tokens that occur in at least 3
# documents (no_below) and in no more than 10% of all documents (no_above).
data_dictionary.filter_extremes(no_below=3, no_above=0.1)
print(data_dictionary)

# Convert every tokenised row into a bag-of-words vector using the pruned dictionary
data_vec = docs2vecs(pp_data, data_dictionary)



In [8]:
# Train a 15-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic: fixing random_state makes the topics (and the
# entropy statistics derived from them) reproducible across kernel restarts.
lda_model = gensim.models.LdaModel(corpus=data_vec, id2word=data_dictionary, num_topics=15, random_state=42)

In [9]:
# Fetch the 20 highest-weighted words of 15 topics and print each topic
# followed by blank lines.
topics = lda_model.show_topics(num_topics=15, num_words=20)
for topic in topics:
    print(topic, end='\n\n\n')

(0, '0.005*"transport" + 0.003*"growth" + 0.003*"health" + 0.003*"cent" + 0.002*"freedom" + 0.002*"govt" + 0.002*"beer" + 0.002*"jobs" + 0.002*"sector" + 0.002*"household" + 0.002*"poverty" + 0.002*"economy" + 0.002*"religious" + 0.002*"gay" + 0.002*"employment" + 0.002*"agency" + 0.002*"nac" + 0.002*"politics" + 0.002*"girl" + 0.002*"operation"')


(1, '0.012*"amos" + 0.006*"yee" + 0.005*"freedom" + 0.005*"tharman" + 0.003*"food" + 0.003*"employer" + 0.003*"letter" + 0.003*"lky" + 0.002*"democracy" + 0.002*"town" + 0.002*"prison" + 0.002*"growth" + 0.002*"university" + 0.002*"pink" + 0.002*"ahpetc" + 0.002*"dodwell" + 0.002*"manpower" + 0.002*"reformative" + 0.002*"cheng" + 0.002*"yeo"')


(2, '0.004*"amos" + 0.002*"jobs" + 0.002*"return" + 0.002*"cent" + 0.002*"growth" + 0.002*"mps" + 0.002*"reformative" + 0.002*"offence" + 0.002*"yee" + 0.002*"investment" + 0.002*"transport" + 0.002*"model" + 0.002*"teacher" + 0.002*"driver" + 0.002*"operation" + 0.002*"moe" + 0.002*"arrested" + 0.0

In [10]:
# Infer the topic distribution of every bag-of-words row in the corpus.
# NOTE(review): get_document_topics is called with its defaults, and the
# sample output lists a single topic for the first row, so low-probability
# topics appear to be omitted — confirm this is intended before computing
# entropy from these distributions.
topic_dist = [lda_model.get_document_topics(doc_bow) for doc_bow in data_vec]
print(topic_dist[0])

[(1, 0.99457363156275669)]


In [11]:
# Entropy of each row's topic distribution, in corpus order.
# The topic count is read from the trained model rather than repeated as a
# literal, so it cannot drift from the num_topics the model was built with.
topic_entropy = [get_entropy(dist, lda_model.num_topics) for dist in topic_dist]

In [36]:
# Convert the per-row entropies to a NumPy array and summarise them
# (mean, median, std, variance, max, min — see corpus_ent_stats).
topic_entropy = np.array(topic_entropy)
ent_stats =  corpus_ent_stats(topic_entropy)
print(ent_stats)

{'mean': 0.033342708102435004, 'med': 0.0029452243619548083, 'std': 0.041137961253838132, 'var': 0.0016923318561222873, 'max': 0.16642530860430718, 'min': 0.00018191246522387857}


In [38]:
# Use the corpus median entropy as the cut-off: rows at or above it are
# labelled '1', the rest '0'.
median_entropy = ent_stats['med']
predicted_labels = predict(topic_entropy, median_entropy)
print(len(predicted_labels))

465


In [39]:
# Score the entropy-based predictions against the corpus labels, treating
# '1' as the positive class.
label_pair = (labels, predicted_labels)
cm = metrics.confusion_matrix(*label_pair)
precision = metrics.precision_score(*label_pair, pos_label='1')
recall = metrics.recall_score(*label_pair, pos_label='1')
f1 = metrics.f1_score(*label_pair, pos_label='1', average='binary')

In [40]:
# Print the confusion matrix first, then one line per score.
print('confusion matrix: predicted (horizontal), true (vertical) \n\n%s\n' % cm)
for line_template, score in (
        ('precision: %s', precision),
        ('recall: %s', recall),
        ('binary f1: %s', f1)):
    print(line_template % score)

# confusion matrix: predicted (horizontal), true (vertical)

confusion matrix: predicted (horizontal), true (vertical) 

[[143 125]
 [ 89 108]]

precision: 0.463519313305
recall: 0.548223350254
binary f1: 0.502325581395


In [42]:
# manual run of the training script with 15 topics and a threshold of 50, to check precision/recall/F1
%run ../src/models/objectivity_train.py --data ../data/train/objectivity-large.2-label.tsv.gz --num_topics 15 --topics ../data/results/topics.txt --threshold 50 

{'mean': 0.036508058122546934, 'med': 0.011883416468149261, 'std': 0.042227443539393651, 'var': 0.0017831569878726788, 'max': 0.18830012855153253, 'min': 0.00012907556529956297}
confusion matrix: predicted (horizontal), true (vertical) 

[[153 115]
 [ 99  98]]

precision: 0.460093896714
recall: 0.497461928934
binary f1: 0.478048780488


## Automate topic number and percentile selection 
Finds the number of topics and the percentile threshold that give the best results, as measured by F1.

In [3]:
%run ../src/models/objectivity_train.py --data ../data/train/objectivity-large.2-label.tsv.gz --automate True --verbose True --num_topics_range '5,20' --threshold_range '20,30,40,50,60,70,80,90'

original dictionary
dictionary after reducing redundant features
num_topics_range -> ['5', '20']
thresholds -> ['20', '30', '40', '50', '60', '70', '80', '90']
Iteration no.: 1
372 / 465 or 0.8 labelled as objective.
{'num_topics': 5, 'threshold': 20, 'precision': 0.38709677419354838, 'recall': 0.73096446700507611, 'f1': 0.50615114235500869}
Iteration no.: 2
325 / 465 or 0.7 labelled as objective.
{'num_topics': 5, 'threshold': 30, 'precision': 0.38153846153846155, 'recall': 0.62944162436548223, 'f1': 0.47509578544061304}
Iteration no.: 3
279 / 465 or 0.6 labelled as objective.
{'num_topics': 5, 'threshold': 40, 'precision': 0.43727598566308246, 'recall': 0.61928934010152281, 'f1': 0.51260504201680679}
Iteration no.: 4
233 / 465 or 0.5 labelled as objective.
{'num_topics': 5, 'threshold': 50, 'precision': 0.47210300429184548, 'recall': 0.55837563451776651, 'f1': 0.51162790697674421}
Iteration no.: 5
186 / 465 or 0.4 labelled as objective.
{'num_topics': 5, 'threshold': 60, 'precision':

In [3]:
# manual run of the training script with 18 topics and a threshold of 50, to check precision/recall/F1
%run ../src/models/objectivity_train.py --data ../data/train/objectivity-large.2-label.tsv.gz --num_topics 18 --topics ../data/results/topics.txt --threshold 50 

original dictionary
dictionary after reducing redundant features
mean 0.0308858278396
med 0.0233144021413
std 0.0339182491461
var 0.00115044762514
max 0.144636974846
min 0.000104774847883
confusion matrix: predicted (horizontal), true (vertical) 

[[149 119]
 [ 83 114]]

precision: 0.489270386266
recall: 0.578680203046
binary f1: 0.53023255814
