In [2]:
from nltk.corpus import stopwords
import spacy
nlp = spacy.load("en", disable=['ner', 'parser'])
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [4]:
with open('abstracts.txt', 'r') as abs:
    data = abs.read().split('\n')


data_words = list(sent_to_words(data))
print(data_words[0])

['current', 'machine', 'learning', 'algorithms', 'can', 'be', 'easily', 'fooled', 'by', 'adversarial', 'examples', 'one', 'possible', 'solution', 'path', 'is', 'to', 'make', 'models', 'that', 'use', 'confidence', 'thresholding', 'to', 'avoid', 'making', 'mistakes', 'such', 'models', 'refuse', 'to', 'make', 'prediction', 'when', 'they', 'are', 'not', 'confident', 'of', 'their', 'answer', 'we', 'propose', 'to', 'evaluate', 'such', 'models', 'in', 'terms', 'of', 'tradeoff', 'curves', 'with', 'the', 'goal', 'of', 'high', 'success', 'rate', 'on', 'clean', 'examples', 'and', 'low', 'failure', 'rate', 'on', 'adversarial', 'examples', 'existing', 'untargeted', 'attacks', 'developed', 'for', 'models', 'that', 'do', 'not', 'use', 'confidence', 'thresholding', 'tend', 'to', 'underestimate', 'such', 'models', 'vulnerability', 'we', 'propose', 'the', 'maxconfidence', 'family', 'of', 'attacks', 'which', 'are', 'optimal', 'in', 'variety', 'of', 'theoretical', 'settings', 'including', 'one', 'realisti

In [20]:
len(data_words)

300

In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=2) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words])

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
print(data_words_nostops[0])
#
# # Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
data_words_trigrams = make_bigrams(data_words_nostops)
print(data_words_trigrams[0])



['current', 'machine', 'learning', 'algorithms', 'easily', 'fooled', 'adversarial', 'examples', 'one', 'possible', 'solution', 'path', 'make', 'models', 'confidence', 'thresholding', 'avoid', 'making', 'mistakes', 'models', 'refuse', 'make', 'prediction', 'confident', 'answer', 'propose', 'evaluate', 'models', 'terms', 'tradeoff', 'curves', 'goal', 'high', 'success', 'rate', 'clean', 'examples', 'low', 'failure', 'rate', 'adversarial', 'examples', 'existing', 'untargeted', 'attacks', 'developed', 'models', 'confidence', 'thresholding', 'tend', 'underestimate', 'models', 'vulnerability', 'propose', 'maxconfidence', 'family', 'attacks', 'optimal', 'variety', 'theoretical', 'settings', 'including', 'one', 'realistic', 'setting', 'attacks', 'linear', 'models', 'experiments', 'show', 'attack', 'attains', 'good', 'results', 'practice', 'show', 'simple', 'defenses', 'able', 'perform', 'well', 'mnist', 'cifar', 'contributing', 'previous', 'calls', 'mnist', 'retired', 'benchmarking', 'dataset',

In [11]:
# # Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])
#
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

# Human readable format of corpus (term-frequency)
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[['current', 'machine_learn', 'algorithm', 'easily', 'fool', 'adversarial_exampl', 'possible', 'solution', 'path', 'make', 'model', 'confidence', 'threshold', 'avoid', 'make', 'mistake', 'model', 'refuse', 'make', 'prediction', 'confident', 'answer', 'propose', 'evaluate', 'model', 'term', 'tradeoff', 'curve', 'goal', 'high_success', 'rate', 'clean_example', 'low', 'failure', 'rate', 'adversarial_exampl', 'exist', 'untargeted', 'attack', 'develop', 'model', 'confidence', 'thresholding', 'tend', 'underestimate', 'model', 'vulnerability', 'propose', 'maxconfidence', 'family', 'attack', 'optimal', 'variety', 'theoretical', 'setting', 'include', 'realistic', 'set', 'attack', 'linear', 'model', 'experiments_show', 'attack', 'attain', 'good', 'result', 'practice', 'show', 'simple', 'defense', 'able', 'perform_well', 'mnist_cifar', 'contribute', 'previous', 'call', 'mnist', 'retire', 'benchmark', 'dataset', 'adversarial', 'robustness', 'research', 'release', 'code', 'evaluation', 'part', 'cle

In [16]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.040*"model" + 0.023*"datum" + 0.015*"machine_learn" + '
  '0.013*"poisoning_attack" + 0.012*"access" + 0.011*"algorithm" + '
  '0.010*"training_data" + 0.009*"system" + 0.009*"privacy" + 0.009*"policy"'),
 (1,
  '0.023*"backdoor" + 0.013*"system" + 0.009*"audio" + 0.008*"trigger" + '
  '0.008*"voice_command" + 0.006*"exploit" + 0.006*"privacy" + 0.006*"build" + '
  '0.006*"channel" + 0.006*"human"'),
 (2,
  '0.031*"attack" + 0.028*"model" + 0.017*"adversarial_example" + '
  '0.015*"adversarial_exampl" + 0.009*"show" + 0.008*"detection" + '
  '0.008*"different" + 0.007*"propose" + 0.007*"approach" + 0.007*"example"'),
 (3,
  '0.014*"system" + 0.014*"dnn" + 0.012*"security" + 0.011*"model" + '
  '0.008*"attack" + 0.008*"face_recognition" + 0.007*"deep_learn" + '
  '0.007*"feature" + 0.007*"ml" + 0.006*"human"'),
 (4,
  '0.021*"object" + 0.019*"method" + 0.016*"problem" + 0.015*"cnn" + '
  '0.014*"lead" + 0.012*"agent" + 0.012*"segmentation" + 0.011*"image" + '
  '0.011*"exhaust

In [17]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -7.4581499371564215


In [18]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.41470010757513753


In [19]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
