In [7]:
import pandas as pd
import spacy 
from spacy import displacy
import nltk


# Get Data from JSON File

In [8]:
reviews = pd.read_json('data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)
reviews.head(5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11


# Get 50 Random 1 Star Reviews (1 per business)

In [10]:
def getReviews(numReviews, numStars):
    """
    Takes in 2 arguments, the number of reviews and the number of stars
    Returns the specified number of reviews of the specified stars from unique businesses
    """
    ## get all ratings of numStars
    rating = reviews.loc[reviews['stars'] == numStars]
    ## get 1 review per business
    ratingUnique = rating.groupby('business_id').apply(lambda df: df.sample(1))
    ## select numReviews reviews
    randomState = 294
    ratingUniqueBis = ratingUnique.sample(numReviews, random_state = randomState) 
    return ratingUniqueBis

## get fifty one star reviews from unique businesses
rating1FiftyUniqueBis = getReviews(50, 1)
print(rating1FiftyUniqueBis.shape)

(50, 9)


# Separate text into sentences for analysis

In [11]:
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [8]:
## data preparation
def visualiseData(listOfReviews):
    """
    takes in list of reviews
    shows a visualisation of the reviews
    returns a list of sentences in the reviews
    """
    ## get a list of text from the reviews
    text = list(listOfReviews['text'])
    ## convert list to string
    stringText = ''.join(text)
    ## tokenize string
    doc = nlp(stringText)
    ## split string into sentences
    sentences = list(doc.sents)
    ## visualise data
    displacy.render(sentences, style="dep")
    return sentences

    
    
# ## get a list of text from the reviews
# text = list(rating1FiftyUniqueBis['text'])
# print(len(text))
# ## convert list to string
# stringText = ''.join(text)
# stringText

In [12]:
# ## tokenize string
# doc = nlp(stringText)
# ## split string into sentences
# analysisData1Star = list(doc.sents)
# ## visualise data 
# displacy.render(analysisData1Star, style="dep")

## visualise sentences in one star reviews
oneStarSentences = visualiseData(rating1FiftyUniqueBis)


# Find Noun Adjective pairs

In [10]:

nounAdjPairs1 = []

## function to get nounAdjPairs
def getNounAdjPairs(listOfSentences):
    """
    takes in list of sentences
    returns a list of noun adjective pairs
    """
    nounAdjPairs = []
    for sentence in listOfSentences:
        for word in sentence:
            ## type1: check for amod relationship and keep if word.head is a noun nouns only
            if word.dep_ == "amod" and word.head.pos_ == 'NOUN' :
                nounAdjPairs.append((word.head.text,word.text))
            elif word.dep_ == "conj" and word.head.dep_ == 'amod':
                nounAdjPairs.append((word.head.head.text, word.text))
            ## type 2
            elif word.dep_ == "acomp":
                for child in word.head.children:
                    if child.dep_=='nsubj':
                        nounAdjPairs.append((child.text, word.text))
            elif word.dep_ == "conj" and word.head.dep_ == 'acomp':
                for child in word.head.head.children:
                    if child.dep_ == 'nsubj':
                        nounAdjPairs.append((child.text, word.text))
    return nounAdjPairs

## get noun adjective pairs in one star reviews
nounAdjPairs1 = getNounAdjPairs(oneStarSentences)              

print(len(nounAdjPairs1), "pairs")
nounAdjPairs1

377 pairs


[('window', 'Smashed'),
 ('car', 'new'),
 ('experience', 'painful'),
 ('cat', 'lethargic'),
 ('dose', 'inadequate'),
 ('dose', 'second'),
 ('dose', 'painful'),
 ('signals', 'mixed'),
 ('infection', 'severe'),
 ('infection', 'upper'),
 ('infection', 'respiratory'),
 ('vet', 'usual'),
 ('it', 'pink'),
 ('it', 'mushy'),
 ('taste', 'terrible'),
 ('place', 'disturbing'),
 ('bowl', 'smaller'),
 ('piece', 'small'),
 ('fact', 'insulting'),
 ('service', 'good'),
 ('kid', 'online'),
 ('shop', 'full'),
 ('people', 'knowledgeable'),
 ('time', 'second'),
 ('service', 'horrible'),
 ('service', 'rude'),
 ('girl', 'nice'),
 ('girl', 'cashier'),
 ('She', 'mad'),
 ('ropes', 'fancy'),
 ('things', 'good'),
 ('that', 'true'),
 ('jumpsuits', 'tight'),
 ('we', 'underdressed'),
 ('clothes', 'skimpy'),
 ('expectations', 'high'),
 ('establishments', 'nice'),
 ('set', 'full'),
 ('job', 'awful'),
 ('time', 'whole'),
 ('it', 'bad'),
 ('tips', 'crooked'),
 ('servira', 'vous'),
 ('quelques', 'vous'),
 ('fromage', "d

In [11]:
## get most common noun adjective pairs for onestar reviews

from collections import Counter
c = Counter(nounAdjPairs1)
print (c.most_common(23))

[(('time', 'first'), 9), (('time', 'last'), 4), (('thing', 'same'), 3), (('buffet', 'Indian'), 3), (('car', 'new'), 2), (('service', 'rude'), 2), (('it', 'bad'), 2), (('restaurant', 'Chinese'), 2), (('style', 'dry'), 2), (('I', 'sure'), 2), (('food', 'good'), 2), (('experience', 'Terrible'), 2), (('chance', 'second'), 2), (('desk', 'front'), 2), (('week', 'last'), 2), (('pool', 'green'), 2), (('bag', 'plastic'), 2), (('window', 'Smashed'), 1), (('experience', 'painful'), 1), (('cat', 'lethargic'), 1), (('dose', 'inadequate'), 1), (('dose', 'second'), 1), (('dose', 'painful'), 1)]


## get noun adjective pairs for 2,3,4,5 star reviews

### 2 star reviews

In [12]:
## get twenty two star reviews from unique businesses
rating2TwentyUniqueBis = getReviews(20, 2)
## visualise sentences in two star reviews
twoStarSentences = visualiseData(rating2TwentyUniqueBis)

In [13]:
## get noun adjective pairs in two star reviews
nounAdjPairs2 = getNounAdjPairs(twoStarSentences)
## get most common noun adjective pairs for two star reviews
c = Counter(nounAdjPairs2)
print (c.most_common(10))

[(('time', 'last'), 2), (('night', 'last'), 2), (('I', 'impressed'), 2), (('place', 'dirty'), 2), (('time', 'first'), 2), (('places', 'other'), 2), (('restaurant', 'Turkish'), 2), (('they', 'defensive'), 2), (('office', 'front'), 2), (('I', 'excited'), 1)]


### 3 star reviews

In [14]:
## get twenty three star reviews from unique businesses
rating3TwentyUniqueBis = getReviews(20, 3)
## visualise sentences in three star reviews
threeStarSentences = visualiseData(rating3TwentyUniqueBis)


In [15]:
## get noun adjective pairs in three star reviews
nounAdjPairs3 = getNounAdjPairs(threeStarSentences)
## get most common noun adjective pairs for three star reviews
c = Counter(nounAdjPairs3)
print (c.most_common(10))

[(('service', 'good'), 3), (('places', 'other'), 3), (('it', 'good'), 2), (('sandwich', 'whole'), 1), (('sandwich', 'underwhelming'), 1), (('sandwich', 'small'), 1), (('place', 'overall'), 1), (('fries', 'large'), 1), (('burgers', 'vegetarian'), 1), (('style', 'Mexican'), 1)]


### 4 star reviews

In [16]:
## get twenty four star reviews from unique businesses
rating4TwentyUniqueBis = getReviews(20, 4)
## visualise sentences in four star reviews
fourStarSentences = visualiseData(rating4TwentyUniqueBis)


In [17]:
## get noun adjective pairs in four star reviews
nounAdjPairs4 = getNounAdjPairs(fourStarSentences)
## get most common noun adjective pairs for four star reviews
c = Counter(nounAdjPairs4)
print (c.most_common(10))

[(('food', 'good'), 4), (('thing', 'good'), 3), (('service', 'good'), 2), (('opening', 'soft'), 2), (('it', 'worth'), 2), (('roll', 'Alaskan'), 2), (('weekend', 'Memorial'), 1), (('4miles', 'approx'), 1), (('We', 'able'), 1), (('streets', 'smaller'), 1)]


### 5 star reviews

In [18]:
## get twenty five star reviews from unique businesses
rating5TwentyUniqueBis = getReviews(20, 5)
## visualise sentences in five star reviews
fiveStarSentences = visualiseData(rating5TwentyUniqueBis)


In [19]:
## get noun adjective pairs in five star reviews
nounAdjPairs5 = getNounAdjPairs(fiveStarSentences)
## get most common noun adjective pairs for five star reviews
c = Counter(nounAdjPairs5)
print (c.most_common(10))

[(('food', 'good'), 3), (('time', 'first'), 2), (('food', 'Korean'), 2), (('place', 'great'), 2), (('game', 'hockey'), 2), (('tender', 'worst'), 2), (('time', 'long'), 2), (('food', 'Amazing'), 1), (('it', 'clear'), 1), (('it', 'homemade'), 1)]
