In [1]:
import stop_words
import logging
import pickle
# English stop-word list from the `stop_words` package.
# NOTE(review): `stop_w` is not referenced by any cell shown in this notebook.
stop_w = stop_words.get_stop_words("english")
from sklearn.datasets import fetch_20newsgroups
import numpy as np

from lda2vec_old import preprocess, Corpus

# Configure the root logger with the default handler so log records are shown.
logging.basicConfig()

# Fetch data: the 'train' subset of 20 Newsgroups, asking the loader to strip
# the parts of each post named in `remove`.
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset='train', remove=remove).data
# Any whitespace-delimited token containing one of these substrings is dropped
bad = {"ax>", '`@("', '---', '===', '^^^'}


def clean(line):
    """Return `line` with every token that contains a `bad` substring removed.

    The line is split on whitespace, offending tokens are discarded and the
    survivors are re-joined with single spaces.
    """
    kept = []
    for token in line.split():
        if any(marker in token for marker in bad):
            # Token carries one of the junk markers: skip it entirely
            continue
        kept.append(token)
    return ' '.join(kept)

# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Run every document through clean() and convert to unicode
# (spaCy only works with unicode). `texts` is rebound to the cleaned list.
texts = [unicode(clean(d)) for d in texts]


In [2]:
from gensim.models.word2vec import KeyedVectors

# Pretrained word2vec embeddings, stored on disk in the binary word2vec format
fn_wordvc = "GoogleNews-vectors-negative300.bin"
w2v = KeyedVectors.load_word2vec_format(fname=fn_wordvc, binary=True)

In [3]:
# Gather every distinct whitespace-delimited token found in the documents
uniq = set()
for text in texts:
    uniq.update(unicode(text).split())

In [4]:
# Materialise the embedding model's vocabulary as a plain set
w2v_vocab = set(w2v.vocab)

# Words the embedding model knows that never occur in the documents
oov = w2v_vocab - uniq

# Top-100 neighbours of "iphone"; each entry is unpacked downstream as a pair
tup = w2v.similar_by_word("iphone", topn=100)

In [5]:

# Build two synthetic documents from the "iphone" neighbours: the first from
# entries 0-39, the second from the rest, keeping only words found in `oov`.
text1 = unicode(" ".join(word for word, _ in tup[:40] if word in oov))
text2 = unicode(" ".join(word for word, _ in tup[40:] if word in oov))

docs = [text1, text2]
    

In [6]:
# Put the two synthetic documents at the front of the corpus.
# The guard keeps this cell idempotent: executing it a second time would
# otherwise stack another copy of text1/text2 on top of the existing ones.
if texts[:2] != [text1, text2]:
    texts = [text1, text2] + texts

In [7]:
# Rebuild each document using only the words that have a word2vec entry
docs = [
    unicode(" ".join(word for word in text.split() if word in w2v.vocab))
    for text in texts
]

In [8]:

# Tokenize the filtered documents with preprocess.tokenize. `max_length` is the
# per-document word limit set in the first cell; `merge` and `n_threads` are
# handed to the tokenizer as-is.
tokens, vocab = preprocess.tokenize(docs, max_length, merge=False,
                                    n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
# NOTE(review): this rebinds `clean`, which until here named the token-filter
# function defined in the first cell. The subsampled result is not read by any
# later cell shown in this notebook (flattening operates on `pruned`).
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
# (the flattening call itself lives in the following cell)


In [9]:
# Flatten the pruned matrix to 1D, passing one id per row so that the ids
# returned alongside it stay associated with the flattened words.
flattened, (doc_ids,) = corpus.compact_to_flat(pruned,
                                               np.arange(pruned.shape[0]))
# Sanity check: no negative indices may survive the flattening
assert flattened.min() >= 0
# Look up pretrained vectors for the vocabulary from the word2vec file
n_dim = 300
fn_wordvc = 'GoogleNews-vectors-negative300.bin'
vectors, s, f = corpus.compact_word_vectors(vocab, filename=fn_wordvc)


2 <SKIP>  -->  SKIP
26 .  -->  上
1177 !  -->  上
1789 batf  -->  Hatf
2563 svga  -->  sega
2861 nra  -->  Kıraç
2999 '  -->  上
3133 vram  -->  Cram
3288 isbn  -->  ison
3395 sunos  -->  sumos
3474 fpu  -->  feu
3476 irq  -->  iri
3549 atf  -->  etf
3645 qur'an  -->  Qur'an
3753 ok.  -->  Lk.
3794 mfm  -->  Xfm
3805 gfci  -->  Sci
3845 us.  -->  usâ_€
4155 wsh  -->  wjh
4305 ssf  -->  nsf
4748 jpl  -->  Ipl
4809 truetype  -->  teletype
4857 rsi  -->  tsi
4900 ncd  -->  İnce
4997 nejm  -->  Sejm
5174 xor  -->  Hor
5245 uart  -->  Cart
5350 wwii  -->  wii
5371 :  -->  上
5483 ssto  -->  sto
5677 rll  -->  rall
5726 ...  -->  В.В.
5907 gnd  -->  andÂ
6061 mmwr  -->  mawr
6106 isdn  -->  isda
6115 fips  -->  fins
6217 )  -->  上
6247 8-  -->  8Â_½
6332 ncsa  -->  nsa
6507 ..  -->  В.В.
6640 sspx  -->  spx
6670 sdio  -->  Odio
6856 usaf  -->  Ausaf
6880 vlsi  -->  visi
6888 xdm  -->  xpm
6911 clv  -->  clm
6951 ssrt  -->  sert
7042 pkp  -->  pnp
7124 svr4  -->  snr
7134 cview  -->  cView
7534 o

In [10]:
# Save all of the preprocessed files.
# The pickles are written through `with` blocks so each handle is flushed and
# closed as soon as the dump finishes, and in binary mode ('wb') because
# pickle output is a byte stream that text mode may corrupt or reject.
with open('new_new_final/vocab.pkl', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)
with open('new_new_final/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
# np.save appends the ".npy" extension to each of these paths
np.save("new_new_final/flattened", flattened)
np.save("new_new_final/doc_ids", doc_ids)
np.save("new_new_final/pruned", pruned)
np.save("new_new_final/bow", bow)
np.save("new_new_final/vectors", vectors)

In [7]:
# Inspect the first entry of `texts`; once the prepend cell has run this is
# the synthetic document `text1`.
texts[0]

u'Iphone ipad iPhone iphone4 Ipad apple_iphone iphones iPhone_3GS 3gs iPhone4 Iphone_3G htc android_phones droid_x ipod ipod_touch iphone_3g samsung 3GS iPad itouch mytouch tmo iPhone_3gs iPhone_3g MyTouch_4G iphone_3gs Apple_Iphone iPhone_3G appstore iphone_3G sony_ericsson iphone_3GS iphone_4g Motorola_Atrix_4G HTC_Droid_Incredible Apple_ipad Iphone_OS treo'

In [8]:
# Inspect the second entry of `texts`; once the prepend cell has run this is
# the synthetic document `text2`.
texts[1]

u'Moto_Droid Droid_X. Iphones verizon Blackberry_Storm Tmobile 3Gs HTC_HD2 iphone_ipad razr HTC_EVO_4G Ipod_Touch Droid iPhone_#/#G ipads Atrix_4G HTC_Thunderbolt Nokia_N8 symbian Droid_X HTC_Hero iPhone_4g macbook itunes xoom HTC_Desire_HD Palm_Pre Nexus_One blackberry_pearl macbook_pro iphone_unlock IPhone XOOM Droid_Incredible ios_#.#.# Motorola_Droid_X apple_ipad iphone_ipod tmobile Samsung_Omnia MT4G nokia Nokia_N##_Mini locked_bootloader EVO_4G Iphone_3GS iPhone_3Gs gizmodo Motorola_DROID webos iPhones iPhone_3G_3Gs iPod_Touch_4G Motorola_Dext iOS ipad2 cingular Treo_###w'