In [28]:
import logging
import itertools
import numpy as np
import gensim

# Route log records (gensim reports its progress at INFO level) to the
# notebook output using a compact "LEVEL : message" layout.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers, so the
# level is also set explicitly. setLevel() is used instead of assigning the
# `.level` attribute so the logging module validates the value and can refresh
# any cached "is this level enabled" lookups.
logging.root.setLevel(logging.INFO)

def head(stream, n=10):
    """Return the first `n` elements of `stream` as a list.

    `stream` may be any iterable. If it is an iterator/generator it is
    advanced past the returned elements. Fewer than `n` elements are returned
    when the stream is shorter; `n=None` returns every element.
    """
    leading = itertools.islice(stream, n)
    return [element for element in leading]

In [29]:
from gensim.utils import smart_open, simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import json
import io

def tokenize(text):
    """Split `text` into tokens with `simple_preprocess`, dropping stopwords.

    Returns a list of the tokens, in their original order, that are not
    members of `STOPWORDS`.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

def iter_comment_dump(dump_file):
    """Yield each comment from a json comment dump, as a `(subreddit, tokens)` 2-tuple.

    `dump_file` is the path of a text file holding one JSON object per line;
    each object must provide the keys "body" and "subreddit". The body is
    tokenized with `tokenize`, and comments with fewer than 3 remaining
    tokens are skipped. Blank lines are ignored.

    Raises whatever `json.loads` raises for a malformed line and `KeyError`
    when a comment lacks one of the two keys.
    """
    # io.open decodes as UTF-8 on both Python 2 and 3 instead of relying on
    # the platform default; the `with` block closes the file once the
    # generator is exhausted or closed.
    with io.open(dump_file, encoding='utf-8') as dump:
        for line in dump:
            if not line.strip():
                # e.g. a trailing empty line at the end of the dump
                continue
            comment = json.loads(line)
            tokens = tokenize(comment["body"])
            if len(tokens) < 3:
                continue
            yield comment["subreddit"], tokens

In [30]:
# Preview the first ten comments of the training dump: the subreddit name
# followed by at most ten tokens of the comment.
# NOTE: head() advances `comment_stream`, so after this cell the generator no
# longer starts at the first comment of the dump.
comment_stream = iter_comment_dump('./all_years/obama-train.json')
for sub, tokens in head(comment_stream):
    # Explicit formatting prints the same text as the Python 2 statement
    # `print sub, tokens[:10]` while also being valid on Python 3.
    print("%s %s" % (sub, tokens[:10]))

AdviceAnimals [u'thanks', u'obama', u'responding']
SRSsucks [u'literally', u'assertion', u'true', u'true', u'women', u'created', u'patriarchy', u'hypothetical', u'way', u'literally']
worldnews [u'budget', u'cuts', u'thanks', u'obama']
Marvel [u'wow', u'going', u'restrain', u'clicking', u'night', u'thanks', u'obama', u'edit', u'clicked', u'questions']
Buttcoin [u'gt', u'thanks', u'obama', u'welcome', u'barrack', u'obama']
pics [u'baby', u'know', u'fucks', u'cookie', u'bake', u'shit', u'hope', u'hundreds', u'kids', u'flooding']
nba [u'oh', u'thanks', u'obama']
AskReddit [u'remember', u'super', u'secret', u'private', u'article', u'year', u'businesses', u'use', u'obamacare', u'excuse']
politics [u'important', u'thing', u'know', u'want', u'corporate', u'interests', u'shine', u'foreign', u'policy', u'domestic']
hearthstone [u'yeh', u'tried', u'play', u'yugioh', u'cards', u'drew', u'crayon', u'wouldnt', u'let', u'play']


In [31]:
doc_stream = (tokens for _, tokens in comment_stream)
%time id2word_comments = gensim.corpora.Dictionary(doc_stream)
print(id2word_comments)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(22618 unique tokens: [u'deferment', u'woods', u'spiders', u'hanging', u'woody']...)
INFO : adding document #20000 to Dictionary(32295 unique tokens: [u'deferment', u'woods', u'spiders', u'hanging', u'woody']...)
INFO : built Dictionary(35902 unique tokens: [u'deferment', u'woods', u'spiders', u'hanging', u'woody']...) from 25154 documents (total 450294 corpus positions)


CPU times: user 3.76 s, sys: 182 ms, total: 3.94 s
Wall time: 4.06 s
Dictionary(35902 unique tokens: [u'deferment', u'woods', u'spiders', u'hanging', u'woody']...)


In [32]:
# Drop tokens that appear in more than 80% of the documents; no_below=0 puts
# no lower bound on document frequency, so rare tokens are kept.
# The dictionary is modified in place.
id2word_comments.filter_extremes(no_below=0, no_above=0.8)
print(id2word_comments)

INFO : discarding 2 tokens: [(u'thanks', 25145), (u'obama', 24783)]...
INFO : keeping 35900 tokens which were in no less than 0 and no more than 20123 (=80.0%) documents
INFO : resulting dictionary: Dictionary(35900 unique tokens: [u'deferment', u'askew', u'woods', u'spiders', u'hanging']...)


Dictionary(35900 unique tokens: [u'deferment', u'askew', u'woods', u'spiders', u'hanging']...)


In [33]:
class CommentCorpus(object):
    """Streamed bag-of-words view of a JSON comment dump.

    Iterating yields one `dictionary.doc2bow(tokens)` vector per comment
    produced by `iter_comment_dump(dump_file)`, stopping after `clip`
    comments when `clip` is not None. As a side effect of iterating, the
    subreddit of every yielded comment is collected, in order, in
    `self.subreddits`; that list is reset whenever a new iteration starts.
    """

    def __init__(self, dump_file, dictionary, clip=None):
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip = clip
        # Defined up front so the attribute exists before the first iteration.
        self.subreddits = []
        # ((dump_file, clip), document count) remembered by __len__.
        self._length_cache = None

    def _comments(self):
        """Return a lazy iterator over at most `clip` `(subreddit, tokens)` pairs."""
        # islice() stops after `clip` items (never, when clip is None) without
        # first loading every tokenized comment into a list.
        return itertools.islice(iter_comment_dump(self.dump_file), self.clip)

    def __iter__(self):
        self.subreddits = []
        for sub, tokens in self._comments():
            self.subreddits.append(sub)
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents a full iteration yields.

        The count is obtained with one extra pass over the dump (no vectors
        are built) and remembered for the current `dump_file`/`clip` pair, so
        it does not notice changes made to the file afterwards. It never
        exceeds `clip`, and is smaller when the dump holds fewer comments.
        """
        key = (self.dump_file, self.clip)
        if self._length_cache is None or self._length_cache[0] != key:
            self._length_cache = (key, sum(1 for _ in self._comments()))
        return self._length_cache[1]

# Wrap the training dump as a streamed bag-of-words corpus (no clipping) and
# print the vector of its first document as a quick sanity check.
comment_corpus = CommentCorpus('./all_years/obama-train.json', id2word_comments)
vector = next(iter(comment_corpus))
print(vector)

[(11467, 1)]


In [34]:
# Stream the bag-of-words corpus to disk in Matrix Market format (timed).
%time gensim.corpora.MmCorpus.serialize('./topic_models/obama_bow.mm', comment_corpus)

INFO : storing corpus in Matrix Market format to ./topic_models/obama_bow.mm
INFO : saving sparse matrix to ./topic_models/obama_bow.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : PROGRESS: saving document #4000
INFO : PROGRESS: saving document #5000
INFO : PROGRESS: saving document #6000
INFO : PROGRESS: saving document #7000
INFO : PROGRESS: saving document #8000
INFO : PROGRESS: saving document #9000
INFO : PROGRESS: saving document #10000
INFO : PROGRESS: saving document #11000
INFO : PROGRESS: saving document #12000
INFO : PROGRESS: saving document #13000
INFO : PROGRESS: saving document #14000
INFO : PROGRESS: saving document #15000
INFO : PROGRESS: saving document #16000
INFO : PROGRESS: saving document #17000
INFO : PROGRESS: saving document #18000
INFO : PROGRESS: saving document #19000
INFO : PROGRESS: saving document #20000
INFO : PROGRESS: saving document #210

CPU times: user 4.78 s, sys: 272 ms, total: 5.05 s
Wall time: 5.26 s


In [35]:
# Open the serialized bag-of-words corpus from disk; every later model in
# this notebook reads its documents from `mm_corpus`.
mm_corpus = gensim.corpora.MmCorpus('./topic_models/obama_bow.mm')
print(mm_corpus)

INFO : loaded corpus index from ./topic_models/obama_bow.mm.index
INFO : initializing corpus reader from ./topic_models/obama_bow.mm
INFO : accepted corpus with 25164 documents, 35900 features, 332620 non-zero entries


MmCorpus(25164 documents, 35900 features, 332620 non-zero entries)


In [36]:
# First document as read back from disk; compare with the vector printed
# before serialization (same token id, count now stored as a float).
vector = next(iter(mm_corpus))
print(vector)

[(11467, 1.0)]


In [37]:
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)
%time lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=10, id2word=id2word_comments, passes=4)

INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 2.78551532033e-05
INFO : using serial LDA version on this node
INFO : running online LDA training, 10 topics, 4 passes over the supplied corpus of 4000 documents, updating model once every 2000 documents, evaluating perplexity every 4000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/4000
INFO : merging changes from 2000 documents into a model of 4000 documents
INFO : topic #3 (0.100): 0.019*"gt" + 0.009*"welcome" + 0.008*"barrack" + 0.006*"like" + 0.004*"griff" + 0.004*"marty" + 0.004*"com" + 0.004*"think" + 0.003*"http" + 0.003*"work"
INFO : topic #7 (0.100): 0.013*"com" + 0.013*"http" + 0.009*"www" + 0.008*"comments" + 0.008*"reddit" + 0.007*"funny" + 0.006*"people" + 0.006*"like" + 0.005*"self" + 0.004*"think"
INFO : topic #4 (0.100): 0.009*"com" + 0.008*"http" + 0.007*"people" + 0.006*"www" + 0.006*"like" + 0.005*"comments" + 0.005*"know" + 0.005*"reddit"

CPU times: user 35.5 s, sys: 1.32 s, total: 36.8 s
Wall time: 36.9 s


In [38]:
# print_topics() reports each topic through the logger, which is why the
# topics show up as INFO lines below. -1 is understood to request all topics
# (confirm against the gensim docs); the return value is bound to `_`,
# presumably so it is not echoed a second time as the cell's output.
_ = lda_model.print_topics(-1) # Print most important words in each topic

INFO : topic #0 (0.100): 0.013*"com" + 0.012*"reddit" + 0.011*"bitcoin" + 0.010*"http" + 0.010*"www" + 0.008*"comments" + 0.006*"link" + 0.003*"amp" + 0.003*"thread" + 0.003*"https"
INFO : topic #1 (0.100): 0.006*"yeah" + 0.005*"wait" + 0.005*"like" + 0.004*"fucking" + 0.004*"right" + 0.003*"going" + 0.003*"www" + 0.003*"doc" + 0.003*"reddit" + 0.003*"http"
INFO : topic #2 (0.100): 0.006*"like" + 0.004*"edit" + 0.003*"shit" + 0.003*"http" + 0.003*"imgur" + 0.003*"com" + 0.002*"oh" + 0.002*"germany" + 0.002*"gif" + 0.002*"blame"
INFO : topic #3 (0.100): 0.036*"gt" + 0.017*"welcome" + 0.013*"barrack" + 0.007*"meme" + 0.005*"com" + 0.005*"work" + 0.005*"http" + 0.004*"reason" + 0.004*"imgur" + 0.004*"mean"
INFO : topic #4 (0.100): 0.012*"people" + 0.010*"like" + 0.006*"know" + 0.006*"think" + 0.006*"good" + 0.005*"want" + 0.005*"time" + 0.004*"ve" + 0.004*"way" + 0.004*"things"
INFO : topic #5 (0.100): 0.004*"bad" + 0.003*"great" + 0.003*"think" + 0.003*"time" + 0.003*"edit" + 0.002*"sarc

In [39]:
# Stack transforms (Latent Semantic Analysis on TFIDF):
# 1. fit TF-IDF weights on the full bag-of-words corpus;
# 2. fit a 200-topic LSI model on the TF-IDF-weighted corpus.
%time tfidf_model = gensim.models.TfidfModel(mm_corpus, id2word=id2word_comments)
%time lsi_model = gensim.models.LsiModel(tfidf_model[mm_corpus], id2word=id2word_comments, num_topics=200)

INFO : collecting document frequencies
INFO : PROGRESS: processing document #0
INFO : PROGRESS: processing document #10000
INFO : PROGRESS: processing document #20000
INFO : calculating IDF weights for 25164 documents and 35899 features (332620 matrix non-zeros)
INFO : using serial LSI version on this node
INFO : updating model with new documents


CPU times: user 1.95 s, sys: 67 ms, total: 2.02 s
Wall time: 2.05 s


INFO : preparing a new chunk of documents
INFO : using 100 extra samples and 2 power iterations
INFO : 1st phase: constructing (35900, 300) action matrix
INFO : orthonormalizing (35900, 300) action matrix
INFO : 2nd phase: running dense svd on (300, 20000) matrix
INFO : computing the final decomposition
INFO : keeping 200 factors (discarding 14.576% of energy spectrum)
INFO : processed documents up to #20000
INFO : topic #0(19.796): 0.606*"barrack" + 0.590*"welcome" + 0.529*"gt" + 0.041*"davemills" + 0.019*"corncobjohnson" + 0.016*"lesweden" + 0.012*"bot" + 0.008*"says" + 0.007*"ftfy" + 0.006*"com"
INFO : topic #1(11.256): -0.728*"bot" + -0.324*"yeah" + -0.116*"comments" + -0.112*"like" + -0.104*"subreddit" + -0.103*"wait" + -0.103*"automoderator" + -0.103*"pu" + -0.103*"automatically" + -0.103*"performed"
INFO : topic #2(10.908): -0.865*"yeah" + 0.388*"bot" + -0.061*"oh" + 0.060*"automoderator" + 0.060*"pu" + 0.059*"performed" + 0.059*"fr" + 0.059*"moderators" + 0.059*"compose" + 0.05

CPU times: user 38.1 s, sys: 4.02 s, total: 42.1 s
Wall time: 23.9 s


In [40]:
# Pass the first document through TF-IDF and then LSI, and print the
# resulting (topic id, weight) vector.
print(next(iter(lsi_model[tfidf_model[mm_corpus]])))

[(0, 0.0002179994335000898), (1, 0.0029780594872817752), (2, 0.0003011833383196768), (3, 0.001672098518136828), (4, 0.00017254863369103708), (5, -1.9106118729552668e-05), (6, 0.0018322795580526165), (7, 0.00063517318736097976), (8, 0.00062298387650565562), (9, 0.00046996789007005499), (10, 9.0512447555181314e-05), (11, 0.00025975223270507963), (12, -0.0003896589029783597), (13, 0.00065972817648281973), (14, -0.0026826231190943827), (15, 0.0018707953086169547), (16, 1.4688116895818741e-05), (17, -7.0408977442287476e-05), (18, -0.00016534848327647367), (19, 0.0043034298662401535), (20, -0.0011401639865943173), (21, 0.0015329014737278019), (22, -0.0015337308362312484), (23, -0.0021223233416963957), (24, 0.0028289524189796687), (25, 0.0018589414655918726), (26, -0.001069223586516228), (27, 0.0023942593193543495), (28, 0.0032779610753700976), (29, -0.0021313935290583836), (30, 0.00083526876208800762), (31, 0.00028467599416923182), (32, -0.0029140069770172412), (33, -0.00068461216448099558),

In [41]:
# Serialize the transformed versions of the whole corpus in Matrix Market
# format: TF-IDF, LSI (applied on top of TF-IDF) and LDA. The LDA model was
# trained on the first 4000 documents only but is applied to every document here.
%time gensim.corpora.MmCorpus.serialize('./topic_models/obama_tfidf.mm', tfidf_model[mm_corpus])
%time gensim.corpora.MmCorpus.serialize('./topic_models/obama_lsa.mm', lsi_model[tfidf_model[mm_corpus]])
%time gensim.corpora.MmCorpus.serialize('./topic_models/obama_lda.mm', lda_model[mm_corpus])

INFO : storing corpus in Matrix Market format to ./topic_models/obama_tfidf.mm
INFO : saving sparse matrix to ./topic_models/obama_tfidf.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : PROGRESS: saving document #4000
INFO : PROGRESS: saving document #5000
INFO : PROGRESS: saving document #6000
INFO : PROGRESS: saving document #7000
INFO : PROGRESS: saving document #8000
INFO : PROGRESS: saving document #9000
INFO : PROGRESS: saving document #10000
INFO : PROGRESS: saving document #11000
INFO : PROGRESS: saving document #12000
INFO : PROGRESS: saving document #13000
INFO : PROGRESS: saving document #14000
INFO : PROGRESS: saving document #15000
INFO : PROGRESS: saving document #16000
INFO : PROGRESS: saving document #17000
INFO : PROGRESS: saving document #18000
INFO : PROGRESS: saving document #19000
INFO : PROGRESS: saving document #20000
INFO : PROGRESS: saving document 

CPU times: user 6.68 s, sys: 432 ms, total: 7.11 s
Wall time: 7.4 s


INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : PROGRESS: saving document #4000
INFO : PROGRESS: saving document #5000
INFO : PROGRESS: saving document #6000
INFO : PROGRESS: saving document #7000
INFO : PROGRESS: saving document #8000
INFO : PROGRESS: saving document #9000
INFO : PROGRESS: saving document #10000
INFO : PROGRESS: saving document #11000
INFO : PROGRESS: saving document #12000
INFO : PROGRESS: saving document #13000
INFO : PROGRESS: saving document #14000
INFO : PROGRESS: saving document #15000
INFO : PROGRESS: saving document #16000
INFO : PROGRESS: saving document #17000
INFO : PROGRESS: saving document #18000
INFO : PROGRESS: saving document #19000
INFO : PROGRESS: saving document #20000
INFO : PROGRESS: saving document #21000
INFO : PROGRESS: saving document #22000
INFO : PROGRESS: saving document #23000
INFO : PROGRESS: saving document #24000
INFO : PROGRE

CPU times: user 45.2 s, sys: 2.95 s, total: 48.2 s
Wall time: 50.7 s


INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : PROGRESS: saving document #4000
INFO : PROGRESS: saving document #5000
INFO : PROGRESS: saving document #6000
INFO : PROGRESS: saving document #7000
INFO : PROGRESS: saving document #8000
INFO : PROGRESS: saving document #9000
INFO : PROGRESS: saving document #10000
INFO : PROGRESS: saving document #11000
INFO : PROGRESS: saving document #12000
INFO : PROGRESS: saving document #13000
INFO : PROGRESS: saving document #14000
INFO : PROGRESS: saving document #15000
INFO : PROGRESS: saving document #16000
INFO : PROGRESS: saving document #17000
INFO : PROGRESS: saving document #18000
INFO : PROGRESS: saving document #19000
INFO : PROGRESS: saving document #20000
INFO : PROGRESS: saving document #21000
INFO : PROGRESS: saving document #22000
INFO : PROGRESS: saving document #23000
INFO : PROGRESS: saving document #24000
INFO : PROGRESS: saving document #25000
INFO : sa

CPU times: user 33.6 s, sys: 1.07 s, total: 34.6 s
Wall time: 34.9 s


In [42]:
# Reopen the three serialized corpora and print their summaries to check the
# document and feature counts of each representation.
tfidf_corpus = gensim.corpora.MmCorpus('./topic_models/obama_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./topic_models/obama_lsa.mm')
lda_corpus = gensim.corpora.MmCorpus('./topic_models/obama_lda.mm')
print(tfidf_corpus)
print(lsi_corpus)
print(lda_corpus)

INFO : loaded corpus index from ./topic_models/obama_tfidf.mm.index
INFO : initializing corpus reader from ./topic_models/obama_tfidf.mm
INFO : accepted corpus with 25164 documents, 35900 features, 332620 non-zero entries
INFO : loaded corpus index from ./topic_models/obama_lsa.mm.index
INFO : initializing corpus reader from ./topic_models/obama_lsa.mm
INFO : accepted corpus with 25164 documents, 200 features, 5014593 non-zero entries
INFO : loaded corpus index from ./topic_models/obama_lda.mm.index
INFO : initializing corpus reader from ./topic_models/obama_lda.mm
INFO : accepted corpus with 25164 documents, 10 features, 201500 non-zero entries


MmCorpus(25164 documents, 35900 features, 332620 non-zero entries)
MmCorpus(25164 documents, 200 features, 5014593 non-zero entries)
MmCorpus(25164 documents, 10 features, 201500 non-zero entries)


In [43]:
# Convert an unseen example sentence to bag-of-words with the training
# dictionary and show which of its tokens the dictionary knows.
text = "Trump won and now Putin controls the government. Thanks, Obama!"
bow_vector = id2word_comments.doc2bow(tokenize(text))
# `token_id` rather than `id`: on Python 2 a list-comprehension variable leaks
# into the notebook namespace, where `id` would shadow the builtin.
print([(id2word_comments[token_id], count) for token_id, count in bow_vector])


[(u'putin', 1), (u'won', 1), (u'government', 1), (u'controls', 1), (u'trump', 1)]


In [44]:
# Topic mixture of the example document as (topic id, weight) pairs.
lda_vector = lda_model[bow_vector]
print(lda_vector)
# Pick the pair with the largest weight and print that topic's description.
dominant_topic = max(lda_vector, key=lambda pair: pair[1])
print(lda_model.print_topic(dominant_topic[0]))

[(0, 0.016670444548322127), (1, 0.4289209421753305), (2, 0.01666839986469169), (3, 0.016668436573002401), (4, 0.016671940902791098), (5, 0.016668705112579035), (6, 0.016668703670370075), (7, 0.016667957609206887), (8, 0.016670069614534089), (9, 0.43772439992917206)]
0.008*"people" + 0.006*"like" + 0.005*"bot" + 0.004*"better" + 0.004*"got" + 0.003*"know" + 0.003*"years" + 0.003*"country" + 0.003*"need" + 0.003*"seriously"


In [45]:
# Re-weight the example document's bag-of-words counts with the fitted
# TF-IDF model.
tfidf_vector = tfidf_model[bow_vector]
print(tfidf_vector)

[(9630, 0.4178697047896979), (12449, 0.30007103723758366), (19947, 0.25953702681478913), (24292, 0.5004726335535591), (34258, 0.6461501042375802)]


In [46]:
# Persist the trained models and the dictionary under ./topic_models/.
lda_model.save('./topic_models/lda_obama.model')
lsi_model.save('./topic_models/lsi_obama.model')
tfidf_model.save('./topic_models/tfidf_obama.model')
id2word_comments.save('./topic_models/obama.dictionary')

INFO : saving LdaState object under ./topic_models/lda_obama.model.state, separately None
INFO : saved ./topic_models/lda_obama.model.state
INFO : saving LdaModel object under ./topic_models/lda_obama.model, separately ['expElogbeta', 'sstats']
INFO : not storing attribute id2word
INFO : storing np array 'expElogbeta' to ./topic_models/lda_obama.model.expElogbeta.npy
INFO : not storing attribute state
INFO : not storing attribute dispatcher
INFO : saved ./topic_models/lda_obama.model
INFO : saving Projection object under ./topic_models/lsi_obama.model.projection, separately None
INFO : saved ./topic_models/lsi_obama.model.projection
INFO : saving LsiModel object under ./topic_models/lsi_obama.model, separately None
INFO : not storing attribute projection
INFO : not storing attribute dispatcher
INFO : saved ./topic_models/lsi_obama.model
INFO : saving TfidfModel object under ./topic_models/tfidf_obama.model, separately None
INFO : saved ./topic_models/tfidf_obama.model
INFO : saving Dic

In [47]:
# Collect the 50 highest-ranked words of every LDA topic
# (one list per topic, `lda_model.num_topics` lists in total).
top_words = []
for topicno in range(lda_model.num_topics):
    top_words.append([word for word, _ in lda_model.show_topic(topicno, topn=50)])
print(top_words)

[[u'com', u'reddit', u'bitcoin', u'http', u'www', u'comments', u'link', u'amp', u'thread', u'https', u'bot', u'government', u'snowden', u'xb', u'karpeles', u'illegal', u'courtesy', u'jojo', u'mojo', u'bomb', u'anymore', u'youtube', u'stuff', u'says', u'donating', u'summon', u'time', u'watch', u'joseph', u'rules', u'message', u'space', u'hiring', u'kill', u'pony', u'progress', u'np', u'focus', u'noodles', u'apos', u'linked', u'links', u'respect', u'compose', u'gov', u'oh', u'glock', u'vote', u'follow', u'programming'], [u'yeah', u'wait', u'like', u'fucking', u'right', u'going', u'www', u'doc', u'reddit', u'http', u'happened', u'think', u'com', u'time', u'won', u'change', u'oh', u'trying', u'rage', u'watch', u'know', u'okay', u'marty', u'believe', u'wouldn', u'point', u'added', u'sure', u'came', u'comments', u'people', u'app', u'freedom', u'look', u'government', u'away', u'work', u'great', u'amazon', u'th', u'twice', u'woman', u'thing', u'youtube', u'wasn', u'maybe', u'sides', u'gun', u'

In [48]:
# Pool the top-50 words of all topics into one large set.
all_words = set(itertools.chain.from_iterable(top_words))

print("Can you spot the misplaced word in each topic?")

# Only the first NUM_SHOWN words of each topic are printed, so the intruder
# has to be planted among them.
NUM_SHOWN = 10

# for each topic, pick a random position for the intruder, to make it more interesting
replace_index = np.random.randint(0, NUM_SHOWN, lda_model.num_topics)

replacements = []
for topicno, words in enumerate(top_words):
    # Candidates are top words of other topics that are not among this
    # topic's own top words.
    other_words = all_words.difference(words)
    replacement = np.random.choice(list(other_words))
    # Edit a copy of the displayed words: `top_words` stays intact, so
    # re-running this cell starts from the real topics every time instead of
    # replacing words in lists that already contain an intruder.
    shown = list(words[:NUM_SHOWN])
    position = replace_index[topicno]
    replacements.append((shown[position], replacement))
    shown[position] = replacement
    print("%i: %s" % (topicno, ' '.join(shown)))

Can you spot the misplaced word in each topic?
0: com reddit bitcoin dream www comments link amp thread https
1: yeah add like fucking right going www doc reddit http
2: like edit shit http china com oh germany gif blame
3: gt welcome barrack meme wasn work http reason imgur mean
4: people like obamacare think good want time ve way things
5: bad mojo think time edit sarcastic training change word news
6: insurance time year old care like pay ve got people
7: http com reddit www comments funny gifs yr like medical
8: gt imgur wow like ve right oh thought work government
9: genuine like bot better got know years country need seriously


In [49]:
# Reveal the answers as (topic number, (original word, intruder word)) pairs.
print("Actual replacements were:")
print(list(enumerate(replacements)))

Actual replacements were:
[(0, (u'http', u'dream')), (1, (u'wait', u'add')), (2, (u'imgur', u'china')), (3, (u'com', u'wasn')), (4, (u'know', u'obamacare')), (5, (u'great', u'mojo')), (6, (u'health', u'old')), (7, (u'ago', u'medical')), (8, (u'people', u'imgur')), (9, (u'people', u'genuine'))]


In [50]:
# NOTE: this evaluation turned out not to be very informative, because the
# individual documents (comments) are very short.

# Evaluate on documents **not** used in LDA training (about 1k, per the original note).
doc_stream = (tokens for _, tokens in iter_comment_dump('./all_years/obama-test.json'))  # generator
test_docs = list(doc_stream)
def intra_inter(model, test_docs, num_pairs=10000):
    """Print a split-half similarity check for `model`.

    Each token list in `test_docs` is cut into a first and a second half.
    Both halves are converted to bag-of-words with the notebook-level
    `id2word_comments` dictionary and transformed with `model[...]`. Two
    averages of `gensim.matutils.cossim` are then printed:

    * between the two halves of the same document (higher is better);
    * between `num_pairs` randomly drawn (first half, second half)
      combinations, which may come from different documents (lower is better).

    Parameters
    ----------
    model : object supporting `model[bow]`, e.g. a trained LDA or LSI model.
    test_docs : list of token lists.
    num_pairs : int, number of random pairs to sample.

    Returns None; the results are only printed.
    """
    # split each test document into two halves and compute topics for each half.
    # Floor division keeps the split index an integer on Python 3 too, where
    # `/` would produce a float that cannot be used as a slice index.
    part1 = [model[id2word_comments.doc2bow(tokens[: len(tokens) // 2])] for tokens in test_docs]
    part2 = [model[id2word_comments.doc2bow(tokens[len(tokens) // 2 :])] for tokens in test_docs]

    # print computed similarities (uses cossim)
    print("average cosine similarity between corresponding parts (higher is better):")
    print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))

    # each row holds an index into part1 and an index into part2
    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    # report the number of pairs actually sampled (prints "10,000" for the default)
    print("average cosine similarity between {:,} random parts (lower is better):".format(num_pairs))
    print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))

# Run the split-half evaluation for both models on the same test documents.
print("LDA results:")
intra_inter(lda_model, test_docs)
print("LSI results:")
intra_inter(lsi_model, test_docs)

LDA results:
average cosine similarity between corresponding parts (higher is better):
0.464171597579
average cosine similarity between 10,000 random parts (lower is better):
0.376446967811
LSI results:
average cosine similarity between corresponding parts (higher is better):
0.0813486492936
average cosine similarity between 10,000 random parts (lower is better):
0.0158405848284
