In [2]:
import spacy
nlp = spacy.load("en")

In [3]:
filename = "reviews.txt"
document = unicode(open(filename).read().decode('utf8'))
document = nlp(document)

In [58]:
[x for x in dir(document) if "_"not in x][:3] ## Just few of the properties 

['doc', 'ents', 'mem']

In [59]:
# first token of the doc 
print document[0] 

# # last token of the doc 
print document[len(document)-5]


Nice


In [60]:
# List of sentences of our doc
list(document.sents)[:3]

[Nice place Better than some reviews give it credit for.,
 Overall, the rooms were a bit small but nice.,
 Everything was clean, the view was wonderful and it is very well located (the Prudential Center makes shopping and eating easy and the T is nearby for jaunts out and about the city).]

In [61]:
# all tags
all_tags = {w.pos: w.pos_ for w in document}
print all_tags 

# all tags of first sentence of our document
for word in list(document.sents)[0]: 
    print (word, word.tag_),

{97: u'SYM', 98: u'VERB', 99: u'X', 101: u'SPACE', 82: u'ADJ', 83: u'ADP', 84: u'ADV', 87: u'CCONJ', 88: u'DET', 89: u'INTJ', 90: u'NOUN', 91: u'NUM', 92: u'PART', 93: u'PRON', 94: u'PROPN', 95: u'PUNCT'}
(Nice, u'JJ') (place, u'NN') (Better, u'NNP') (than, u'IN') (some, u'DT') (reviews, u'NNS') (give, u'VBP') (it, u'PRP') (credit, u'NN') (for, u'IN') (., u'.')


In [29]:
# define some parameters 
noisy_pos_tags = ["PROP"]
min_token_length = 2

# Function to check if the token is a noise or not 
def isNoise(token):
    is_noise = False
    if token.pos_ in noisy_pos_tags:
        is_noise = True 
    elif token.is_stop == True:
        is_noise = True 
    elif len(token.string) <= min_token_length:
        is_noise = True 
    return is_noise

def cleanup(token, lower = True):
    if lower:
        token = token.lower()
    return token.strip()

# top unigrams used in the reviews
from collections import Counter	
cleaned_list = [cleanup(word.string) for word in document if not isNoise(word)]
Counter(cleaned_list).most_common(5)

[(u'hotel', 683),
 (u'room', 652),
 (u'great', 300),
 (u'sheraton', 285),
 (u'location', 271)]

In [35]:
# Entity Detection
labels = set([w.label_ for w in document.ents])
for label in labels:
    entities = [cleanup(e.string, lower=False) for e in document.ents if label==e.label_]
    entities = list(set(entities))
    print label, entities

ORDINAL [u'29th', u'Thoroughly', u'3yo', u'2nd', u'second', u'28th', u'1st', u'11th', u'fifth', u'North', u'3rd', u'27th', u'5th', u'14th', u'19th', u'25th', u'15th', u'first', u'50th', u'sixth', u'26th', u'4th', u'3pm', u'Firstly', u'last', u'24th', u'10-month', u'First', u'Secondly', u'15-minute', u'17th', u'22nd', u'third', u'6th', u'Second', u'18th', u'9th', u'10th', u'16th', u'20th', u'8th']
LOC [u'', u'North Tower', u'Europe', u'Long Island Iced Tea', u'North', u'New England', u'WiFi', u'Fenway Park', u'New England Aquarium', u'Back Bay', u'the Charles River', u'the South Wing', u'Last', u'mid September', u'the South Tower', u'South Tower', u'the Back Bay Fens', u'the Back Bay', u'St Charles River', u'the Turning Point Lounge', u'Charles River', u'the South End', u'St Charles', u'hotels\u9225?web']
PRODUCT [u'Cheescake', u'Great location', u'Centre', u'225.00', u'Concierge', u'Motel 6', u'Suite', u'3.30pm', u'Radisson']
LAW [u'Room 2916', u'e.g.', u'the Duck Tour - it', u'the She

In [37]:
# extract all review sentences that contains the term - hotel
hotel = [sent for sent in document.sents if 'hotel' in sent.string.lower()]

# create dependency tree
sentence = hotel[2]
for word in sentence: 
    print (word, ': ', str(list(word.children)) 

A :  []
cab :  [A, from]
from :  [airport, to]
the :  []
airport :  [the]
to :  [hotel]
the :  []
hotel :  [the]
can :  []
be :  [cab, can, cheaper, .]
cheaper :  [than]
than :  [shuttles]
the :  []
shuttles :  [the, depending]
depending :  [time]
what :  []
time :  [what, of]
of :  [day]
the :  []
day :  [the, go]
you :  []
go :  [you]
. :  []


In [62]:
# check all adjectives used with a word
def pos_words (sentence, token, ptag):
    sentences = [sent for sent in sentence.sents if token in sent.string]
    pwrds = []
    for sent in sentences: 
        for word in sent: 
            if token in word.string:
                pwrds.extend([child.string.strip() for child in word.children if child.pos_ == ptag])
    return Counter(pwrds).most_common(10)

print pos_words(document, 'hotel', "ADJ")

[(u'other', 20), (u'great', 10), (u'good', 7), (u'better', 6), (u'nice', 6), (u'different', 5), (u'many', 5), (u'best', 4), (u'my', 4), (u'wonderful', 3)]


In [50]:
# Generate Noun Phrases
doc = nlp(u'I love data science on analytics vidhya')
for np in doc.noun_chunks:
    print np.text, np.root.dep_, np.root.head.text

I nsubj love
data science dobj love
analytics pobj on


In [63]:
# Word Vectors 
from spacy.en import English
parser = English()

from numpy import dot
from numpy.linalg import norm

# Generate word vector of the word - apple 
apple = parser.vocab[u'apple']

# Cosine similarity function
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))

others = list({w for w in parser.vocab if w.has_vector and w.orth_.islower() and w.lower_ != unicode("apple")})

# sort by similarity score
others.sort(key=lambda w: cosine(w.vector, apple.vector))
others.reverse()

print "top most similar words to apple:"
for word in others[:10]:   
    print word.orth_,

top most similar words to apple:
apples iphone fruit juice cherry lemon banana pie mac orange
