In [1]:
import os
import gensim
import re
import json
from zipfile import ZipFile
from gensim.models import Word2Vec
import logging
import random
import numpy as np
from string import punctuation

# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

import spacy
nlp = spacy.load('en') # Installed with python -m spacy download en

# Switch on logging for gensim model training
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
DIM = 300
VOCAB_MAX = 10000
EPOCHS = 3
MODEL_NAME = '10K_300_txtfiles_newspapers.model' # If model exists it is loaded
USE_SPACY = False
USE_LEMMAS = False # Only valid with USE_SPACY = True
USE_ENTITIES = False # Only valid with USE_SPACY = True
REMOVE_PUNCTUATION = True

In [3]:
def load_archive_sources(pathfiles_dir, archives_dir):
    archives = []
    for pathfile_fname in os.listdir(pathfiles_dir):
        if pathfile_fname.endswith('.txt'):
            with open(pathfiles_dir + '/' + pathfile_fname) as pathfile:
                for line in pathfile.readlines():
                    # Strip trailing spaces, single quotes, comma, and newline
                    line_stripped = re.findall(r"'(.*?)'", line, re.DOTALL)
                    if line_stripped: # Strip empty lines
                        assert len(line_stripped) == 1
                        archive_fname = archives_dir + '/' + line_stripped[0]
                        if os.path.isfile(archive_fname):
                            archives += [archive_fname]
                        else:
                            raise FileExistsError('The following data file does not exist: ' + archive_fname)
    return archives
    
    
def load_from_txtfile_sources(txtfiles_dir):
    txtfiles = []
    for txtfile_fname in os.listdir(txtfiles_dir):
        if txtfile_fname.endswith('.txt'):
            txtfiles.append(txtfiles_dir + '/' + txtfile_fname)
    return txtfiles

In [4]:
# Jupyter download hack: run !tar chvfz ntext_files_clean.tar.gz text_files_clean in new notebook

# pathfiles_dir = 'corpora/filepaths'
# archives_dir = 'corpora/data-new'
# archives = load_archive_sources(pathfiles_dir, archives_dir)

# Files from 20181105_1452_us-newspapers-humanities-250-dedupe
txtfiles_dir = 'corpora/text_files_clean_newspapers_250'
txtfiles = load_from_txtfile_sources(txtfiles_dir)

In [5]:
class Sentences(object):
    def __init__(self, files, remove_punctuation=True, use_spacy=False, use_lemmas=False, use_entities=False):
        self.files = files
        # Unfortunately SpaCy is really slow when called for every text, so make it optional for testing
        self.use_spacy = use_spacy
        self.use_lemmas = use_lemmas
        self.use_entities = use_entities
        self.remove_punctuation = remove_punctuation
        self._punctuation_regex = re.compile('[%s]' % re.escape(punctuation))
 
    def __iter__(self):
        for fname in self.files:
            if fname.endswith('.zip'):
                # WE1S JSOn archive
                with ZipFile(fname, 'r') as archive:
                    for json_fname in archive.namelist():
                        if not json_fname.startswith('README'): # Exclude weird empty README files in archives
                            lines = archive.open(json_fname).readlines()
                            json_ = ''.join([line.decode('UTF-8') for line in lines])
                            text = json.loads(json_)['content']
                            for sentence in self.yield_sentences_from_text(text):
                                yield(sentence)
            elif fname.endswith('.txt'):
                # Mallet plain text
                with open(fname, 'r') as txtfile:
                    lines = txtfile.readlines()
                    text = ''.join(lines)
                    for sentence in self.yield_sentences_from_text(text):
                        yield(sentence)
            else:
                raise Exception('File seems to be neither Mallet plain text file or WE1S JSON archive.')
    
    def yield_sentences_from_text(self, text):
        if self.use_spacy:
            doc = nlp(text)

            # Detect and merge entitites
            if (self.use_entities):
                for ent in doc.ents:
                    ent.merge(tag=ent.root.tag_, lemma=ent.text, ent_type=ent.root.ent_type_)

            # Detect sentences
            for sentence in doc.sents: 
                words = []
                # Detect tokens
                for token in sentence:
                    # Add lowercase token to list if it is not punctuation or whitespace
                    if ((not token.is_punct) or (not self.remove_punctuation)) and (not token.is_space):
                        if (self.use_lemmas):
                            if token.lemma_ == '-PRON-':
                                word = token.text
                            else:
                                word = token.lemma_
                        else:
                            word = token.text
                        words.append(word.lower())
                yield words

        else:
            for sentence in text.split('.'):
                words = []
                if self.remove_punctuation:
                    sentence = self._punctuation_regex.sub('', sentence)
                sentence = sentence.lower()
                for word in sentence.split():
                    word = word.strip()
                    words.append(word)
                yield words

In [6]:
gen = Sentences(txtfiles, 
                use_spacy=USE_SPACY, 
                use_lemmas=USE_LEMMAS, 
                use_entities=USE_ENTITIES, 
                remove_punctuation=REMOVE_PUNCTUATION)

In [7]:
# Load or train model
model = None # Make sure we're not accidentally continuing training
if os.path.isfile(MODEL_NAME):
    model = model = Word2Vec.load(MODEL_NAME)
else:
    # https://radimrehurek.com/gensim/models/word2vec.html
    # max_final_vocab needs gensim >3.5.0 and min_count=1 to work
    # sg: use SkipGram
    # hs: use hierarchical softmax
    model = Word2Vec(gen, 
                     size=DIM, 
                     max_final_vocab=VOCAB_MAX, 
                     min_count=1, 
                     window=5, 
                     workers=8, 
                     iter=EPOCHS, 
                     sg=1, 
                     hs=1)
    model.save(MODEL_NAME)

2018-11-07 06:34:28,717 : INFO : collecting all words and their counts
2018-11-07 06:34:28,718 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-07 06:34:28,764 : INFO : PROGRESS: at sentence #10000, processed 122455 words, keeping 15962 word types
2018-11-07 06:34:28,810 : INFO : PROGRESS: at sentence #20000, processed 244529 words, keeping 23554 word types
2018-11-07 06:34:28,857 : INFO : PROGRESS: at sentence #30000, processed 366648 words, keeping 30501 word types
2018-11-07 06:34:28,905 : INFO : PROGRESS: at sentence #40000, processed 495772 words, keeping 35257 word types
2018-11-07 06:34:28,950 : INFO : PROGRESS: at sentence #50000, processed 615408 words, keeping 39687 word types
2018-11-07 06:34:28,994 : INFO : PROGRESS: at sentence #60000, processed 731235 words, keeping 43895 word types
2018-11-07 06:34:29,037 : INFO : PROGRESS: at sentence #70000, processed 844077 words, keeping 47198 word types
2018-11-07 06:34:29,085 : INFO : PROGRESS: at 

2018-11-07 06:34:32,290 : INFO : PROGRESS: at sentence #720000, processed 8935701 words, keeping 159844 word types
2018-11-07 06:34:32,347 : INFO : PROGRESS: at sentence #730000, processed 9064296 words, keeping 160843 word types
2018-11-07 06:34:32,390 : INFO : PROGRESS: at sentence #740000, processed 9171784 words, keeping 162123 word types
2018-11-07 06:34:32,444 : INFO : PROGRESS: at sentence #750000, processed 9302676 words, keeping 163320 word types
2018-11-07 06:34:32,493 : INFO : PROGRESS: at sentence #760000, processed 9424167 words, keeping 164447 word types
2018-11-07 06:34:32,537 : INFO : PROGRESS: at sentence #770000, processed 9533797 words, keeping 165671 word types
2018-11-07 06:34:32,590 : INFO : PROGRESS: at sentence #780000, processed 9661286 words, keeping 166688 word types
2018-11-07 06:34:32,642 : INFO : PROGRESS: at sentence #790000, processed 9796239 words, keeping 167984 word types
2018-11-07 06:34:32,691 : INFO : PROGRESS: at sentence #800000, processed 992120

2018-11-07 06:35:15,334 : INFO : EPOCH 2 - PROGRESS: at 98.60% examples, 337249 words/s, in_qsize 13, out_qsize 0
2018-11-07 06:35:15,423 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-11-07 06:35:15,429 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-11-07 06:35:15,463 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-07 06:35:15,493 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-07 06:35:15,515 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-07 06:35:15,518 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-07 06:35:15,519 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-07 06:35:15,527 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-07 06:35:15,527 : INFO : EPOCH - 2 : training on 10416851 raw words (7278223 effective words) took 21.5s, 338458 effective words/s
2018-11-07 06:35:16

In [8]:
print(len(model.wv.vocab))

9957


In [9]:
print(model.wv.most_similar('humanities', topn=10))

2018-11-07 06:35:35,441 : INFO : precomputing L2-norms of word weight vectors


[('sciences', 0.5206330418586731), ('socialsciences', 0.4732058048248291), ('mathematics', 0.4585806131362915), ('arts', 0.4397183954715729), ('behavioral', 0.40036070346832275), ('science', 0.3988550901412964), ('rockefellerfoundation', 0.3805062174797058), ('philosophy', 0.3769809305667877), ('bioethics', 0.37596845626831055), ('polytechnic', 0.3609108328819275)]


  if np.issubdtype(vec.dtype, np.int):


In [10]:
# Calculate cosine similarity between each possible pair of points in a cluster, 
# then average over total number of pairs
def cluster_density(cluster):
    total_similarity = 0
    for a in cluster:
        for b in cluster:
            if a != b:
                cosine_similarity = model.wv.n_similarity([a], [b])
                total_similarity += cosine_similarity
    average_similarity = total_similarity / (len(cluster)**2)
    return average_similarity

# Random entry from model vocabulary
def random_vocab():
    return random.choice(list(model.wv.vocab))

In [11]:
# Load topic model from keys
topics = []
with open('corpora/keys.txt', 'r') as f:
    for line in f.readlines():
        topic = line.split()
        topics.append(topic)

In [12]:
# Compare density of topics to random topic
for topic in topics:
    try:
        words = topic[3:] # First two are number and value
        random_topic = [random_vocab() for i in range(len(words))]
        density_topic = cluster_density(words)
        density_random_topic = cluster_density(random_topic)
        if density_topic < density_random_topic: # Only print when less dense than random topic
            print(topic)
            print(density_topic)
            print(random_topic)
            print(density_random_topic)
    # Because we are limiting teh vocabulary, in very rare cases we might encounter an out-of-vocabulary word
    # Usually the topic top-10 words should also be somehwhat common words and thus not be pruned
    except KeyError:
        pass

In [13]:
# Order topics with "humanities" by density
density_topic = {}
for topic in topics:
    try:
        words = topic[3:]
        if 'humanities' in words:
            density = cluster_density(words)
            density_topic[density] = topic
    # Result of max. vocabulary limit
    except KeyError:
        pass
for density in sorted(density_topic):
    print(density, density_topic[density])

0.14945123366505644 ['132', '0.02035', 'trump', 'president', 'white', 'house', 'center', 'protest', 'kennedy', 'charlottesville', 'protesters', 'year', 'american', 'friday', 'letter', 'rally', 'committee', 'members', 'week', 'humanities', 'event']
0.16588607743198489 ['40', '0.11178', 'center', 'event', 'public', 'humanities', 'series', 'events', 'conference', 'program', 'information', 'discussion', 'lecture', 'day', 'held', 'talk', 'open', 'council', 'year', 'call', 'free']
0.16706047476165825 ['164', '0.16792', 'officials', 'members', 'week', 'year', 'group', 'meeting', 'made', 'decision', 'humanities', 'asked', 'board', 'plan', 'month', 'called', 'director', 'letter', 'expected', 'announced', 'statement']
0.17458591692977482 ['62', '0.06347', 'arts', 'art', 'cultural', 'artists', 'culture', 'music', 'nea', 'organizations', 'theater', 'community', 'creative', 'support', 'dance', 'performing', 'humanities', 'city', 'people', 'center', 'artistic']
0.17609455501430749 ['74', '0.03046', 

In [14]:
# Plot with matplotlib
def plot_pyplot(vectors, words):
    
    print('Applying PCA')
    vectors = PCA(n_components=100).fit_transform(vectors)
    print('Applying T-SNE')
    vectors = TSNE(n_components=2, learning_rate=100, perplexity=50).fit_transform(vectors)
    
    fig_size = plt.rcParams["figure.figsize"]
    fig_size[0] = 100
    fig_size[1] = 100
    plt.rcParams["figure.figsize"] = fig_size

    def wordscatter(x, y, word, ax):
        color = 'black'
        if word in topicm100_4_noweights:
            color = 'red' 
        ax.annotate(word, xy=(x, y), xytext=(x, y), color=color, alpha=0.4)
        ax.update_datalim(np.column_stack([x, y]))
        ax.autoscale()

    fig, ax = plt.subplots()
    for i, word in enumerate(words):
        wordscatter(vectors[i,0], vectors[i,1], word, ax=ax)
    #ax.scatter(vectors[:,0], vectors[:,1])

    plt.savefig('tsne.png', dpi = 100)

# Prepare to be plotted with TensorBoard
def plot_tb(vectors, words):
    with open('data.tsv', 'w+') as f:
        for vector in vectors.tolist():
            for point in vector:
                f.write(str(point) + '\t')
            f.write('\n')

    with open('metadata.tsv', 'w+') as f:
        for word in words:
            f.write(word + '\n')

In [15]:
# Separate words and vectors
words = []
vectors = np.zeros((len(model.wv.vocab), DIM))
for i, word in enumerate(model.wv.index2word):
    vectors[i] = model.wv[word]
    words.append(word)

plot_tb(vectors, words)