In [1]:
# Turn on INFO-level logging so that library log messages show up in the notebook.
import logging

LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [2]:
import math
import os

import pandas as pd
from gensim import corpora, models, similarities
# Last expression of the cell, so the kernel's current working directory is displayed.
os.getcwd()
# To run from a different directory, uncomment the call below and pass the target path.
#os.chdir()

Using TensorFlow backend.
2017-09-08 09:17:16,503 : INFO : 'pattern' package not found; tag filters are not available for English


'/Users/whs/Documents/Fun With ML/Gensim Tutorials'

# Import Raw Text

In [3]:
# Toy corpus: four short sentences that serve as the 'documents'.
# raw2-raw4 begin with a space so that plain concatenation yields well-formed running text.
raw1 = 'The cute kitten purred and watched the Stark girl.'
raw2 = ' After losing interest the cute furry cat purred and meowed.'
raw3 = ' Sly yet silent, the cute kitten meowed and she noticed.'
raw4 = ' The loud furry dog ran and bit at air.'

# All four sentences as one text. The operands are already strings, so no str() wrapper
# is needed.
raw_text = raw1 + raw2 + raw3 + raw4
# Shorter variant made of the first and last sentence only.
raw_text2 = raw1 + raw4

# Process Text with spaCy

## Initialize 'en' tokenization pipeline

In [4]:
import spacy
# NOTE(review): `English` is not referenced anywhere else in the visible notebook, and the
# `spacy.en` module path appears to be specific to older spaCy releases - confirm before
# upgrading spaCy.
from spacy.en import English
# Load the pipeline registered under the "en" name; `nlp` is later called on raw text.
nlp = spacy.load("en")

## Push all documents through it

In [5]:
# Run both raw strings through the pipeline; each call yields a processed document.
# NOTE(review): `parse=True` presumably requests the dependency parse (which the later
# `docs.sents` iteration would rely on) - confirm the keyword is accepted by the
# installed spaCy version.
docs = nlp(raw_text, parse=True)
# NOTE(review): `docs2` is not used again in the visible part of the notebook.
docs2 = nlp(raw_text2, parse=True)
docs  # last expression of the cell, so the processed text is displayed

The cute kitten purred and watched the Stark girl. After losing interest the cute furry cat purred and meowed. Sly yet silent, the cute kitten meowed and she noticed. The loud furry dog ran and bit at air.

## Interesting Text Analysis Stuff

In [6]:
# With the text processed by spaCy we can inspect it in several ways.
# See https://github.com/cytora/pycon-nlp-in-10-lines

# Show the first token of the processed document.
token = docs[0]
print(token)

# Show every detected sentence on its own line.
for sentence_span in docs.sents:
    print(sentence_span)


The
The cute kitten purred and watched the Stark girl.
After losing interest the cute furry cat purred and meowed.
Sly yet silent, the cute kitten meowed and she noticed.
The loud furry dog ran and bit at air.


In [7]:
# Part-of-speech tag of each of the first 15 tokens, printed as "<token> - <tag>".
for token in docs[:15]:
    print(token, '-', token.pos_)

The - DET
cute - ADJ
kitten - NOUN
purred - VERB
and - CCONJ
watched - VERB
the - DET
Stark - PROPN
girl - NOUN
. - PUNCT
After - ADP
losing - VERB
interest - NOUN
the - DET
cute - ADJ


In [8]:
# Every named entity found in the document, printed as "<entity> - <entity type>".
for ent in docs.ents:
    print(ent, '-', ent.label_)

Stark - PERSON
Sly - PERSON


In [9]:
# For the processed document, calculate the similarity between 'cat', 'kitten' and 'dog'
# (plus 'yet' as an unrelated word for contrast).
#
# Tokens are looked up by their text instead of by hard-coded positions, so the
# comparison keeps pointing at the intended words if the raw text is edited.
def first_token(doc, text):
    """Return the first token of `doc` whose text equals `text`.

    Raises:
        ValueError: if no token in `doc` has that text.
    """
    for tok in doc:
        if tok.text == text:
            return tok
    raise ValueError('token {!r} not found in document'.format(text))


kitten = first_token(docs, 'kitten')
cat = first_token(docs, 'cat')
dog = first_token(docs, 'dog')
yet = first_token(docs, 'yet')

print(cat.similarity(kitten))
print(cat.similarity(dog))
print(kitten.similarity(dog))
print(yet.similarity(dog))

0.821555381691
0.801685591074
0.703533825482
0.280267313562


# Create Corpus and Dictionary w/ Gensim

In [10]:
# Collect the text of every sentence detected in the document.
sentences = [sentence.orth_ for sentence in docs.sents]
print("There were {} sentences found. Here's a sample:".format(len(sentences)))
# Show at most the first five sentences, matching the "sample" announced above.
# The earlier un-assigned `pd.DataFrame(sentences[0:5])` was not the cell's last
# expression, so it was built and discarded without ever being displayed.
sentences[:5]

There were 4 sentences found. Here's a sample:


['The cute kitten purred and watched the Stark girl.',
 'After losing interest the cute furry cat purred and meowed.',
 'Sly yet silent, the cute kitten meowed and she noticed.',
 'The loud furry dog ran and bit at air.']

## Tokenize Each Word by Sentence

In [11]:
# Build one list of lemmas per sentence: `texts` collects the finished sentences,
# `article` accumulates the lemmas of the sentence currently being read.
texts, article = [], []
for w in docs:
    # Keep only content words: skip newline tokens, stop words, punctuation marks
    # and number-like tokens.
    if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
        # Store the lemmatized form of the word.
        article.append(w.lemma_)
        print(article)  # trace of the sentence as it is being accumulated
    # A token whose fine-grained tag is '.' (presumably sentence-final punctuation)
    # closes the current sentence; start a fresh list for the next one.
    if w.tag_ == '.':
        texts.append(article)
        article = []

# If the text does not end with a '.'-tagged token, the lemmas collected after the
# last boundary would otherwise be lost - keep them as a final sentence.
if article:
    texts.append(article)
    article = []


['cute']
['cute', 'kitten']
['cute', 'kitten', 'purr']
['cute', 'kitten', 'purr', 'watch']
['cute', 'kitten', 'purr', 'watch', 'stark']
['cute', 'kitten', 'purr', 'watch', 'stark', 'girl']
['lose']
['lose', 'interest']
['lose', 'interest', 'cute']
['lose', 'interest', 'cute', 'furry']
['lose', 'interest', 'cute', 'furry', 'cat']
['lose', 'interest', 'cute', 'furry', 'cat', 'purr']
['lose', 'interest', 'cute', 'furry', 'cat', 'purr', 'meow']
['sly']
['sly', 'silent']
['sly', 'silent', 'cute']
['sly', 'silent', 'cute', 'kitten']
['sly', 'silent', 'cute', 'kitten', 'meow']
['sly', 'silent', 'cute', 'kitten', 'meow', 'notice']
['loud']
['loud', 'furry']
['loud', 'furry', 'dog']
['loud', 'furry', 'dog', 'run']
['loud', 'furry', 'dog', 'run', 'bit']
['loud', 'furry', 'dog', 'run', 'bit', 'air']


In [12]:
# Display the per-sentence lemma lists collected by the tokenizing loop.
texts

[['cute', 'kitten', 'purr', 'watch', 'stark', 'girl'],
 ['lose', 'interest', 'cute', 'furry', 'cat', 'purr', 'meow'],
 ['sly', 'silent', 'cute', 'kitten', 'meow', 'notice'],
 ['loud', 'furry', 'dog', 'run', 'bit', 'air']]

In [13]:
# Flatten the per-sentence token lists into one list of tokens,
# keeping the tokens in their order of appearance.
texts_single = []
for sentence_tokens in texts:
    texts_single.extend(sentence_tokens)

In [14]:
# Peek at the first five tokens of the flattened list.
texts_single[0:5]

['cute', 'kitten', 'purr', 'watch', 'stark']

## Create Dictionary

In [15]:
# Build the gensim Dictionary from the tokenized sentences; it assigns an integer id
# to each unique token.
dictionary = corpora.Dictionary(texts)
#dictionary.save(('dovel1.dict'))  # store the dictionary, for future reference
print(dictionary.token2id)  # token -> id mapping
print(dictionary)  # short summary of the dictionary

2017-09-08 09:17:21,761 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-09-08 09:17:21,763 : INFO : built Dictionary(19 unique tokens: ['cute', 'kitten', 'purr', 'watch', 'stark']...) from 4 documents (total 25 corpus positions)


{'cute': 0, 'kitten': 1, 'purr': 2, 'watch': 3, 'stark': 4, 'girl': 5, 'lose': 6, 'interest': 7, 'furry': 8, 'cat': 9, 'meow': 10, 'sly': 11, 'silent': 12, 'notice': 13, 'loud': 14, 'dog': 15, 'run': 16, 'bit': 17, 'air': 18}
Dictionary(19 unique tokens: ['cute', 'kitten', 'purr', 'watch', 'stark']...)


## Create Matrix Corpus

In [16]:
# Turn every tokenized sentence into its bag-of-words vector, i.e. a list of
# (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, texts))
#corpora.MmCorpus.serialize('dovel1.mm', corpus)  # store to disk, for later use
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(0, 1), (2, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(0, 1), (1, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(8, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]]

# TFIDF Transformation

## Initialize Model

In [17]:
# Initialize two TF-IDF models on the bag-of-words corpus:
# `tfidf` uses the default settings, `tfidfN` is built with normalize=False.
tfidf = models.TfidfModel(corpus)
tfidfN = models.TfidfModel(corpus, normalize=False)
print(tfidf)

2017-09-08 09:17:21,788 : INFO : collecting document frequencies
2017-09-08 09:17:21,790 : INFO : PROGRESS: processing document #0
2017-09-08 09:17:21,793 : INFO : calculating IDF weights for 4 documents and 18 features (25 matrix non-zeros)
2017-09-08 09:17:21,794 : INFO : collecting document frequencies
2017-09-08 09:17:21,796 : INFO : PROGRESS: processing document #0
2017-09-08 09:17:21,797 : INFO : calculating IDF weights for 4 documents and 18 features (25 matrix non-zeros)


TfidfModel(num_docs=4, num_nnz=25)


## Transform each Sentence

In [18]:
# Bag-of-words vector of each of the four sentences.
s1, s2, s3, s4 = (corpus[i] for i in range(4))

# Weights produced by the default TF-IDF model ...
tf1, tf2, tf3, tf4 = (tfidf[bow] for bow in (s1, s2, s3, s4))

# ... and by the model built with normalize=False.
tf1N, tf2N, tf3N, tf4N = (tfidfN[bow] for bow in (s1, s2, s3, s4))


## Vectorized Output

In [19]:
# First two sentences: weights from the default model, then from the normalize=False model.
for weights in (tf1, tf2, tf1N, tf2N):
    print(weights)

[(0, 0.11024726933725056), (1, 0.2656320682560318), (2, 0.2656320682560318), (3, 0.5312641365120636), (4, 0.5312641365120636), (5, 0.5312641365120636)]
[(0, 0.10655215922886847), (2, 0.25672899295608276), (6, 0.5134579859121655), (7, 0.5134579859121655), (8, 0.25672899295608276), (9, 0.5134579859121655), (10, 0.25672899295608276)]
[(0, 0.4150374992788437), (1, 1.0), (2, 1.0), (3, 2.0), (4, 2.0), (5, 2.0)]
[(0, 0.4150374992788437), (2, 1.0), (6, 2.0), (7, 2.0), (8, 1.0), (9, 2.0), (10, 1.0)]


# Putting It All Together

In [20]:
# Start from an empty DataFrame; the following cells add one column per processing stage.
cdf = pd.DataFrame()

## Add Raw Text

In [21]:
# raw2-raw4 start with a space (it separates the sentences when they are concatenated
# into one text); strip it so the column holds clean sentence text.
cdf['Sentence'] = [raw.strip() for raw in (raw1, raw2, raw3, raw4)]

## Add Tokenization Form

In [22]:
# Lemma list of each of the four sentences, in sentence order.
cdf['Tokens'] = [texts[i] for i in range(4)]

## Add Corpus Matrix

In [23]:
# Bag-of-words vector, (token_id, count) pairs, of each sentence.
cdf['Corpus'] = [s1, s2, s3, s4]

## Add TFIDF Vectorization

In [24]:
# Weights of each sentence from the default TF-IDF model (`tfidf`).
cdf['TFIDF'] = [tf1, tf2, tf3, tf4]

In [25]:
# Weights of each sentence from the model built with normalize=False (`tfidfN`).
cdf['TFIDF Not Normalized'] = [tf1N, tf2N, tf3N, tf4N]

In [26]:
# Display the assembled table: one row per sentence, one column per processing stage.
cdf

Unnamed: 0,Sentence,Tokens,Corpus,TFIDF,TFIDF Not Normalized
0,The cute kitten purred and watched the Stark g...,"[cute, kitten, purr, watch, stark, girl]","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]","[(0, 0.11024726933725056), (1, 0.2656320682560...","[(0, 0.4150374992788437), (1, 1.0), (2, 1.0), ..."
1,After losing interest the cute furry cat purr...,"[lose, interest, cute, furry, cat, purr, meow]","[(0, 1), (2, 1), (6, 1), (7, 1), (8, 1), (9, 1...","[(0, 0.10655215922886847), (2, 0.2567289929560...","[(0, 0.4150374992788437), (2, 1.0), (6, 2.0), ..."
2,"Sly yet silent, the cute kitten meowed and sh...","[sly, silent, cute, kitten, meow, notice]","[(0, 1), (1, 1), (10, 1), (11, 1), (12, 1), (1...","[(0, 0.11024726933725056), (1, 0.2656320682560...","[(0, 0.4150374992788437), (1, 1.0), (10, 1.0),..."
3,The loud furry dog ran and bit at air.,"[loud, furry, dog, run, bit, air]","[(8, 1), (14, 1), (15, 1), (16, 1), (17, 1), (...","[(8, 0.2182178902359924), (14, 0.4364357804719...","[(8, 1.0), (14, 2.0), (15, 2.0), (16, 2.0), (1..."


In [27]:
# Testing: check the un-normalized TF-IDF weights (model `tfidfN`) against the formula
# given at https://radimrehurek.com/gensim/models/tfidfmodel.html
#     weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i})
# where D is the number of documents.
#
# 'loud' in row 3:  TF = 1, DF = 1, D = 4  ->  1 * log_2(4 / 1) = 2.0
# 'cute' in row 0:  TF = 1, DF = 3, D = 4  ->  1 * log_2(4 / 3) ~= 0.415
num_docs = len(corpus)

for word, bow, weights in [('loud', s4, tf4N), ('cute', s1, tf1N)]:
    token_id = dictionary.token2id[word]
    term_freq = dict(bow)[token_id]       # count of the word in this sentence
    doc_freq = dictionary.dfs[token_id]   # number of sentences containing the word
    expected = term_freq * math.log2(num_docs / doc_freq)
    actual = dict(weights)[token_id]
    # Fail loudly if gensim's weight disagrees with the hand-computed value.
    assert math.isclose(actual, expected), (word, actual, expected)
    print('{}: expected {:.4f}, got {:.4f}'.format(word, expected, actual))