In [284]:
#Turn on Logging
import logging
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [285]:
from gensim import corpora, models, similarities
import pandas as pd
import os
os.getcwd()
#os.chdir()

'/Users/whs/Documents/Fun With ML/Gensim Tutorials'

# Import Raw Text

In [293]:
#Read in 'documents'

raw1 = 'The cute kitten purred and watched the Stark girl.'
raw2 = ' After losing interest the cute furry cat purred and meowed.'
raw3 = ' Sly yet silent, the cute kitten meowed and she noticed.' 
raw4 = ' The loud furry dog ran and bit at air.'

raw_text = str(raw1 + raw2 + raw3 + raw4)
raw_text
raw_text2 = str(raw1 + raw4)

# Process text w/ SpaCy

## Initialize 'en' tokenization pipeline

In [287]:
import spacy
from spacy.en import English
nlp = spacy.load("en")

## Push all documents through it

In [296]:
docs = nlp(raw_text, parse=True)
docs2 = nlp(raw_text2, parse=True)
docs

The cute kitten purred and watched the Stark girl. After losing interest the cute furry cat purred and meowed. Sly yet silent, the cute kitten meowed and she noticed. The loud furry dog ran and bit at air.

## Interesting Text Analysis Stuff

In [118]:
#Now with our text all spacy tokened, we can do cool stuff
#https://github.com/cytora/pycon-nlp-in-10-lines

# Get first token of the processed document
token = docs[0]
print(token)

# Print sentences (one sentence per line)
for sent in docs.sents:
    print(sent)


The
The cute kitten purred and watched the Stark girl.
After losing interest the cute furry cat purred and meowed.
Sly yet silent, the cute kitten meowed and she noticed.
The loud furry dog ran and bit at air.


In [305]:
# For each token, print corresponding part of speech tag
for token in docs[0:15]:
    print('{} - {}'.format(token, token.pos_))

The - DET
cute - ADJ
kitten - NOUN
purred - VERB
and - CCONJ
watched - VERB
the - DET
Stark - PROPN
girl - NOUN
. - PUNCT
After - ADP
losing - VERB
interest - NOUN
the - DET
cute - ADJ


In [120]:
# Print all named entities with named entity types

for ent in docs.ents:
    print('{} - {}'.format(ent, ent.label_))

Stark - PERSON
Sly - PERSON


In [121]:
# For a given document, calculate similarity between 'cat', 'kitten', and 'dog'
kitten = docs[2]
cat = docs[16]
dog = docs[36]
yet = docs[22]
print(cat.similarity(kitten))
print(cat.similarity(dog))
print(kitten.similarity(dog))
print(yet.similarity(dog))

0.821555381691
0.801685591074
0.703533825482
0.280267313562


# Create Corpus and Dictionary w/ Gensim

In [124]:
sentences = [sentence.orth_ for sentence in docs.sents]
print("There were {} sentences found. Here's a sample:".format(len(sentences)))
pd.DataFrame(sentences[0:5])
sentences

There were 4 sentences found. Here's a sample:


['The cute kitten purred and watched the Stark girl.',
 'After losing interest the cute furry cat purred and meowed.',
 'Sly yet silent, the cute kitten meowed and she noticed.',
 'The loud furry dog ran and bit at air.']

## Tokenize Each Word by Sentence

In [125]:
# Tokenizing Each Word
texts, article = [], []
for w in docs:
    # if it's not a stop word or punctuation mark, add it to our article!
    if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
        # we add the lematized version of the word
        article.append(w.lemma_)
        print(article)
        #texts.append(article)
    # if it's a new line, it means we're onto our next document
    if w.tag_ == '.' :
        texts.append(article)
        #print(article) #Print out each word to test
        article = []


['cute']
['cute', 'kitten']
['cute', 'kitten', 'purr']
['cute', 'kitten', 'purr', 'watch']
['cute', 'kitten', 'purr', 'watch', 'stark']
['cute', 'kitten', 'purr', 'watch', 'stark', 'girl']
['lose']
['lose', 'interest']
['lose', 'interest', 'cute']
['lose', 'interest', 'cute', 'furry']
['lose', 'interest', 'cute', 'furry', 'cat']
['lose', 'interest', 'cute', 'furry', 'cat', 'purr']
['lose', 'interest', 'cute', 'furry', 'cat', 'purr', 'meow']
['sly']
['sly', 'silent']
['sly', 'silent', 'cute']
['sly', 'silent', 'cute', 'kitten']
['sly', 'silent', 'cute', 'kitten', 'meow']
['sly', 'silent', 'cute', 'kitten', 'meow', 'notice']
['loud']
['loud', 'furry']
['loud', 'furry', 'dog']
['loud', 'furry', 'dog', 'run']
['loud', 'furry', 'dog', 'run', 'bit']
['loud', 'furry', 'dog', 'run', 'bit', 'air']


In [126]:
texts

[['cute', 'kitten', 'purr', 'watch', 'stark', 'girl'],
 ['lose', 'interest', 'cute', 'furry', 'cat', 'purr', 'meow'],
 ['sly', 'silent', 'cute', 'kitten', 'meow', 'notice'],
 ['loud', 'furry', 'dog', 'run', 'bit', 'air']]

In [129]:
#Combine all lines into one list
#for sentances in document for sentences 
texts_single = [item for sublist in texts for item in sublist]

#This list comprehension is doing the same as:
#for sublist in texts:
#    for item in sublist:
#        flat_list.append(item)

In [303]:
texts_single[0:5]

['cute', 'kitten', 'purr', 'watch', 'stark']

## Create Dictionary

In [132]:
#Make Dictionary 
dictionary = corpora.Dictionary(texts)
#dictionary.save(('dovel1.dict'))  # store the dictionary, for future reference
print(dictionary.token2id)
print(dictionary)

{'cute': 0, 'kitten': 1, 'purr': 2, 'watch': 3, 'stark': 4, 'girl': 5, 'lose': 6, 'interest': 7, 'furry': 8, 'cat': 9, 'meow': 10, 'sly': 11, 'silent': 12, 'notice': 13, 'loud': 14, 'dog': 15, 'run': 16, 'bit': 17, 'air': 18}
Dictionary(19 unique tokens: ['cute', 'kitten', 'purr', 'watch', 'stark']...)


## Create Matrix Corpus

In [159]:
#Create corpus
corpus = [dictionary.doc2bow(text) for text in texts]
#corpora.MmCorpus.serialize('dovel1.mm', corpus)  # store to disk, for later use
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(0, 1), (2, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(0, 1), (1, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(8, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]]

# Transformations

## TFIDF 

In [180]:
#initilize
tfidf = models.TfidfModel(corpus)
tfidfN = models.TfidfModel(corpus, normalize=False)
print(tfidf)

TfidfModel(num_docs=4, num_nnz=25)


### Transform each Sentence

In [181]:
s1 = corpus[0]
s2 = corpus[1]
s3 = corpus[2]
s4 = corpus[3]

tf1=tfidf[s1]
tf2=tfidf[s2]
tf3=tfidf[s3]
tf4=tfidf[s4]

tf1N=tfidfN[s1]
tf2N=tfidfN[s2]
tf3N=tfidfN[s3]
tf4N=tfidfN[s4]


In [309]:
print(tf1)
print(tf2)
print(tf1N)
print(tf2N)

[(0, 0.11024726933725056), (1, 0.2656320682560318), (2, 0.2656320682560318), (3, 0.5312641365120636), (4, 0.5312641365120636), (5, 0.5312641365120636)]
[(0, 0.10655215922886847), (2, 0.25672899295608276), (6, 0.5134579859121655), (7, 0.5134579859121655), (8, 0.25672899295608276), (9, 0.5134579859121655), (10, 0.25672899295608276)]
[(0, 0.4150374992788437), (1, 1.0), (2, 1.0), (3, 2.0), (4, 2.0), (5, 2.0)]
[(0, 0.4150374992788437), (2, 1.0), (6, 2.0), (7, 2.0), (8, 1.0), (9, 2.0), (10, 1.0)]


## Word 2 Vec

In [355]:
#In spaCy there is functionality to create the word2vec model at the word
    #"The default English model installs vectors for one million vocabulary entries, 
    #using the 300-dimensional vectors trained on the Common Crawl corpus using the GloVe algorithm. 
    #The GloVe common crawl vectors have become a de facto standard for practical NLP."
#https://spacy.io/docs/usage/word-vectors-similarities

print('Spacy Token: ', docs[1])
print('Vector Size: ', len(cute_vector))
cute_vector = docs[1].vector
print(cute_vector[0:50], '...')

print()

print('Spacy Token: ', docs[20])
print('Vector Size: ', len(cute_vector))
period_vector = docs[20].vector
print(period_vector[0:50], '...')


Spacy Token:  cute
Vector Size:  300
[-0.35642001 -0.12153    -0.60569    -0.062242   -0.12673    -0.02612
 -0.058334   -0.59749001  0.080791    1.00230002 -0.30669999 -0.49897999
 -0.16244    -0.31716001 -0.38573     0.03942    -0.26468     1.22529995
  0.19279     0.005312   -0.12395    -0.30101001 -0.17156    -0.42899001
  0.034108    0.43832999  0.18667001 -0.73627001  0.25948     0.031607
 -0.39974001 -0.16317999  0.17473     0.33381999  0.24716    -0.57972997
 -0.020651   -0.041078   -0.49728999 -0.10925    -0.43551001 -0.021357
 -0.13062    -0.21269999  0.35229999 -0.25628999 -0.62704998  0.073671
  0.26864001 -0.45034   ] ...

Spacy Token:  .
Vector Size:  300
[ 0.012001    0.20750999 -0.12578    -0.59324998  0.12525     0.15975
  0.13748001 -0.33157    -0.13694     1.78929996 -0.47093999  0.70433998
  0.26673001 -0.089961   -0.18167999  0.067226    0.053347    1.55949998
 -0.25409999  0.038413   -0.01409     0.056774    0.023434    0.024042
  0.31703001  0.19024999 -0.37505001

In [367]:
s4_spcy = docs[33:43]
s4_spcy

The loud furry dog ran and bit at air.

In [368]:
s1_spcy = docs[0:10]
s1_vect = s1_spcy.vector
s2_spcy = docs[10:21]
s2_vect = s2_spcy.vector
s3_spcy = docs[21:33]
s3_vect = s3_spcy.vector
s4_spcy = docs[33:43]
s4_vect = s4_spcy.vector

print(s1_spcy)
print('Vector Length of ', len(sen1_vector))
print(sen1_vector[0:50], '...')

print()

print(s2_spcy)
print('Vector Length of ', len(sen2_vector))
print(sen2_vector[0:50], '...')


The cute kitten purred and watched the Stark girl.
Vector Length of  300
[-0.0394899  -0.0187946  -0.2313281  -0.07501449 -0.0034923  -0.03543136
 -0.08365459 -0.1366701   0.07988711  1.59997201 -0.22350159 -0.01495151
  0.01841    -0.1657521  -0.14396301  0.07315359 -0.0264913   0.67474192
 -0.152256   -0.06521721 -0.04318    -0.16186132 -0.12671219 -0.10123821
  0.0410289   0.0970646  -0.112487   -0.14183018  0.01335313 -0.04647293
 -0.14982264  0.0205042  -0.0090121   0.168327    0.182576   -0.17904079
  0.0069503  -0.05956352 -0.15230086 -0.0496251   0.01015183  0.0113903
  0.00332989 -0.16249254  0.0382682  -0.05392995 -0.27333999  0.06541368
  0.0911456  -0.227585  ] ...

After losing interest the cute furry cat purred and meowed.
Vector Length of  300
[-0.07320991  0.11364457 -0.18259937 -0.04985264 -0.018801   -0.03465935
 -0.037993   -0.12700583  0.02468536  1.65511811 -0.21201697  0.05384245
 -0.07783545 -0.07012409 -0.05808636 -0.06650855 -0.05374527  0.72631359
 -0.14617108

In [329]:
#Same can be done at the document level
print(docs)
docs_vector = docs.vector
print(docs_vector[0:50])
print(len(docs_vector))

The cute kitten purred and watched the Stark girl. After losing interest the cute furry cat purred and meowed. Sly yet silent, the cute kitten meowed and she noticed. The loud furry dog ran and bit at air.
[ -2.96052452e-02   1.07662223e-01  -1.49132133e-01  -7.46969581e-02
   1.32708149e-02   3.17049772e-02  -1.09451404e-02  -1.61642089e-01
   2.65648309e-02   1.74241495e+00  -1.46555364e-01   5.43590356e-03
  -2.56856922e-02  -1.58875853e-01  -1.62996352e-01   2.48137470e-02
  -3.87500040e-02   7.81989813e-01  -2.00948134e-01  -7.90675804e-02
  -5.39194196e-02  -6.40327632e-02  -5.70502989e-02  -1.32878065e-01
   5.21605499e-02   5.05217910e-02  -1.22933485e-01  -1.20797276e-01
   7.05222115e-02  -8.14056993e-02  -1.35169148e-01   1.08062522e-03
  -5.00328802e-02   1.31003201e-01   1.24342784e-01  -1.46289453e-01
   1.29621802e-03  -3.99797671e-02  -1.01229399e-01  -3.95646989e-02
   5.10285832e-02   2.97673959e-02   1.50560727e-02  -1.08856298e-01
   9.05850902e-02  -1.38813816e-02 

In [260]:
import multiprocessing
import gensim.models.word2vec as w2v


In [261]:
#ONCE we have vectors
#step 3 - build model
#3 main tasks that vectors help with
#DISTANCE, SIMILARITY, RANKING

# Rank - find most relevant documents relating to a topic

# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 300
# Minimum word count threshold.
min_word_count = 2

# Number of threads to run in parallel.
#more workers, faster we train
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
#random number generator
#deterministic, good for debugging
seed = 1

# select 1 for Skip-Gram Model, 0 for Continuous Bag of Words

sg = 1 

In [262]:
#Initilize Model
doc2v = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)


In [264]:
doc2v.build_vocab(sentences)

RuntimeError: cannot sort vocabulary after model weights already initialized.

In [270]:
token_count = len(doc2v.wv.vocab)
print("Word2Vec vocabulary length:", len(doc2v.wv.vocab))

Word2Vec vocabulary length: 24


In [271]:
doc2v.train(sentences, total_examples = token_count, epochs = doc2v.iter)

156

In [277]:
# vector weight matrix
all_word_vectors_matrix = doc2v.wv.syn0
print(all_word_vectors_matrix.shape)
all_word_vectors_matrix[0]

(24, 300)


array([  3.52896168e-04,   1.10605743e-03,  -1.51454855e-03,
        -1.57380666e-04,  -6.06522080e-04,  -6.79342658e-04,
         1.68168708e-03,  -1.05058437e-03,  -4.99991351e-04,
        -1.26442651e-03,  -1.08079088e-03,   1.12917065e-03,
         5.85765301e-05,  -1.93516654e-03,  -1.48352329e-03,
        -4.94567386e-04,   5.51221747e-05,  -7.83927622e-04,
        -1.09044416e-03,   1.43233268e-03,   1.22664811e-03,
         1.83905271e-04,  -1.56069372e-03,  -1.30228244e-03,
         4.17706207e-04,   2.78462161e-04,   7.54722394e-04,
        -1.28461979e-03,   1.32872094e-03,  -1.20788650e-03,
        -6.61480590e-04,  -1.26445142e-03,   1.69577776e-03,
         1.26981537e-03,   3.57229641e-04,  -1.30495685e-03,
        -4.09013533e-04,   1.23142847e-03,  -1.15701626e-03,
        -1.69484731e-04,   3.67263099e-04,   1.52529997e-03,
         1.25181305e-05,   1.46791659e-04,   9.38934332e-04,
         1.48172630e-03,  -1.07844837e-03,  -3.50329618e-04,
         7.66346697e-04,

In [248]:
texts_raw = []
for token in docs:
    #print(token)
    texts_raw.append(token)
texts_raw[0:5]

[The, cute, kitten, purred, and]

In [246]:
#Push Sentences through it
w2v2.build_vocab(sentences)

In [258]:
?build_vocab()

Object `build_vocab` not found.


In [231]:
print(texts_single)

['cute', 'kitten', 'purr', 'watch', 'stark', 'girl', 'lose', 'interest', 'cute', 'furry', 'cat', 'purr', 'meow', 'sly', 'silent', 'cute', 'kitten', 'meow', 'notice', 'loud', 'furry', 'dog', 'run', 'bit', 'air']


In [252]:
wv1=w2v2[s1]
#tf2=tfidf[s2]
#tf3=tfidf[s3]
#tf4=tfidf[s4]

#tf1N=tfidfN[s1]
#tf2N=tfidfN[s2]
#tf3N=tfidfN[s3]
#tf4N=tfidfN[s4]

TypeError: not all arguments converted during string formatting

# Puting it all together

In [185]:
cdf = pd.DataFrame()

## Add Raw Text

In [186]:
cdf['Sentence'] = [raw1, raw2, raw3, raw4]

## Add Tokenization Form

In [187]:
cdf['Tokens'] = [texts[0], texts[1], texts[2],texts[3]]

## Add Corpus Matrix

In [188]:
cdf['Corpus'] = [s1, s2, s3, s4]

## Add TFIDF Vectorization

In [189]:
cdf['TFIDF'] = [tf1, tf2, tf3, tf4]

In [192]:
cdf['TFIDF Not Normalized'] = [tf1N, tf2N, tf3N, tf4N]

## Add Sen2Vec Vectorization

In [371]:
cdf['Sen2Vec'] = [s1_vect, s2_vect, s3_vect, s4_vect]

In [372]:
cdf

Unnamed: 0,Sentence,Tokens,Corpus,TFIDF,TFIDF Not Normalized,Sen2Vec
0,The cute kitten purred and watched the Stark g...,"[cute, kitten, purr, watch, stark, girl]","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]","[(0, 0.11024726933725056), (1, 0.2656320682560...","[(0, 0.4150374992788437), (1, 1.0), (2, 1.0), ...","[-0.0394899, -0.0187946, -0.231328, -0.0750145..."
1,After losing interest the cute furry cat purr...,"[lose, interest, cute, furry, cat, purr, meow]","[(0, 1), (2, 1), (6, 1), (7, 1), (8, 1), (9, 1...","[(0, 0.10655215922886847), (2, 0.2567289929560...","[(0, 0.4150374992788437), (2, 1.0), (6, 2.0), ...","[-0.0732099, 0.113645, -0.182599, -0.0498526, ..."
2,"Sly yet silent, the cute kitten meowed and sh...","[sly, silent, cute, kitten, meow, notice]","[(0, 1), (1, 1), (10, 1), (11, 1), (12, 1), (1...","[(0, 0.11024726933725056), (1, 0.2656320682560...","[(0, 0.4150374992788437), (1, 1.0), (10, 1.0),...","[0.00436454, 0.100187, -0.162217, -0.0844011, ..."
3,The loud furry dog ran and bit at air.,"[loud, furry, dog, run, bit, air]","[(8, 1), (14, 1), (15, 1), (16, 1), (17, 1), (...","[(8, 0.2182178902359924), (14, 0.4364357804719...","[(8, 1.0), (14, 2.0), (15, 2.0), (16, 2.0), (1...","[-0.0125192, 0.236509, -0.0144205, -0.0900632,..."


In [None]:
#Testing
#From https://radimrehurek.com/gensim/models/tfidfmodel.html
    #weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i})
    = 

# 'loud' in row 3 TF = 1; D = 4; DF = 1  -> TF/DF = 1 * log_2(4/1)
# 'cute' in row 0 sentance TF = 1 & DF = 3