In [1]:
""" Article Spinner
We are going to build a probability model for each word in Amazon reviews, conditioned on its 2 neighbours (context).
Thus it is called the trigram model.
Then we are going to use this probability model to substitute some words in the article with others that
have appeared in the same context. E.g. say 'Dog chase Cat' and 'Dog lick cat' are both appearing in the reviews
then 'lick' can be substituted with 'chase'
"""

" Article Spinner\nWe are going to build a probability model for each word in Amazon reviews, conditioned on its 2 neighbours (context).\nThus it is called the trigram model.\nThen we are going to use this probability model to substitute some words in the article with others that\nhave appeared in the same context. E.g. say 'Dog chase Cat' and 'Dog lick cat' are both appearing in the reviews\nthen 'lick' can be substituted with 'chase'\n"

In [2]:
"""
Order of operations
1. Read in the Amazon reviews, and tokenize
2. Make the Context dictionary
3. Make the probability tables for each Trigram
4. Do the actual spinning
"""

'\nOrder of operations\n1. Read in the Amazon reviews, and tokenize\n2. Make the Context dictionary\n3. Make the probability tables for each Trigram\n4. Do the actual spinning\n'

In [3]:
import numpy as np
import nltk
from bs4 import BeautifulSoup
import pdb

In [4]:
""" No stop word removal , no stemming (for grammatical correctness) """
pos_review = BeautifulSoup(open('electronics/positive.review').read())
pos_review = pos_review.findAll('review_text')
tokenized_rev =[]
for r in pos_review:
    r = r.text
    r = r.lower()
    tokenized_rev.append(nltk.tokenize.word_tokenize(r))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [5]:
""" Make the trigram dictionary
the key is the context, and the value is the words. for e.g. if we have 'Dog chase cat' and 'dog lick cat'
then the dictionary will be {(dog,cat):[chase,lick]}

"""
trigrams = {} 
for r in tokenized_rev:
    len_r = len(r)
    for i in range(1,len_r-2): # we want the 'center' word to be last but one, 
        context=(r[i-1],r[i+1])# in python indexing this comes to len-1
        w = r[i]
        if context not in trigrams: # incase this context hasn't been seen before, make the dictionar key a 
            trigrams[context] = []  # empty list 
        trigrams[context].extend([w]) # extend the list with this word
        

In [6]:
""" see the trigram dictionary"""
i = 0
for k,v in trigrams.iteritems(): # you can't do 'for k,v in trigrams' , you have to use iteritems
    if len(v) > 1:
        print '(',k,':',trigrams[k],')',
        i += 1
    if i==10:
        break


( (u'i', u'running') : [u"'m", u'am', u"'m", u'was', u"'m"] ) ( (u'have', u'made') : [u'been', u'also'] ) ( (u'use', u'battery') : [u'the', u'the'] ) ( (u'want', u'you') : [u'everytime', u'optical'] ) ( (u'think', u'are') : [u'there', u'these'] ) ( (u'.', u'prefer') : [u'i', u'i'] ) ( (u'at', u'times') : [u'all', u'three', u'all', u'all'] ) ( (u"'", u'their') : [u'on', u'down'] ) ( (u'played', u'on') : [u'fine', u'once'] ) ( (u'--', u'i') : [u'if', u'-'] )


In [15]:
""" make the trigram model """
trigrams_model = {}
ctr = 0
for context,words in trigrams.iteritems():
    nwords_in_context = len(words)
    if nwords_in_context > 1:
        word_probs = {}
        for w in words:
            if w not in word_probs:
                word_probs[w] = 0.
            word_probs[w] += 1./nwords_in_context
        trigrams_model[context] = word_probs
        #break


In [21]:
""" see the trigrams model"""
i = 0
for k,v in trigrams_model.iteritems(): # you can't do 'for k,v in trigrams' , you have to use iteritems
    print '(',k,':',trigrams_model[k],')\n'
    i += 1
    if i==10:
        break


( (u',', u'which') : {u"''": 0.3333333333333333, u'and': 0.3333333333333333, u'(': 0.3333333333333333} )

( (u'to', u'i') : {u'wherever': 0.09090909090909091, u',': 0.09090909090909091, u'bed': 0.09090909090909091, u'.': 0.2727272727272727, u'say': 0.18181818181818182, u'but': 0.09090909090909091, u'which': 0.09090909090909091, u'it': 0.09090909090909091} )

( (u'in', u'usb') : {u'the': 0.6666666666666666, u'another': 0.3333333333333333} )

( (u'you', u'have') : {u'wont': 0.037037037037037035, u'do': 0.037037037037037035, u"'ll": 0.2222222222222222, u'would': 0.037037037037037035, u'to': 0.037037037037037035, u'may': 0.07407407407407407, u'always': 0.037037037037037035, u'never': 0.037037037037037035, u'should': 0.037037037037037035, u'will': 0.1111111111111111, u'also': 0.037037037037037035, u'only': 0.037037037037037035, u'definitely': 0.037037037037037035, u'can': 0.037037037037037035, u'now': 0.037037037037037035, u'might': 0.1111111111111111, u'must': 0.037037037037037035} )

( (u

In [85]:
def spin(context):
    r = np.random.random() # get a random number between 0-1
    try:
        word_probs = trigrams_model[context]
        cumsum = 0.
        for w in word_probs:
            cumsum += word_probs[w]
            if r<cumsum:
                return w
    except:
        return None
print trigrams_model.keys()[1],spin(trigrams_model.keys()[1])

(u'to', u'i') say


In [92]:
"""
keep a probability of spinning, say 0.2
"""
pspin = 0.8
rev = tokenized_rev[0]
spun = [rev[0]]

for i in range(1,len(rev)-1):
    spin_flag = np.random.random()< pspin
    if spin_flag:
        w = spin((rev[i-1],rev[i+1]))
        if w is None:
            w = rev[i]
    else:
        w = rev[i]
    spun.extend([w])
print 'Original\n',' '.join(rev)
print '\nSpun\n',' '.join(spun)
        

Original
i purchased this unit due to frequent blackouts in my area and 2 power supplies going bad . it will run my cable modem , router , pc , and lcd monitor for 5 minutes . this is more than enough time to save work and shut down . equally important , i know that my electronics are receiving clean power . i feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply . as always , amazon had it to me in < 2 business days

Spun
i bought this unit prior to frequent blackouts in my area and crank power supplies going strong . i will run my wireless box , router , pc , and the monitor for 5 minutes . this gets higher than expected storage to save work and cools down . equally important , i feel if my electronics are receiving clean power . i found . this case is minor compared to the speakers of valuable data or the back of equipment attached to dedicated power spike or an irregular power outlet . 