In [8]:
import spacy

In [9]:
nlp = spacy.blank('en')

In [10]:
doc = nlp("Tea is healthy and calming, don't you think?")

In [11]:
print("Token\t\tLemma\t\t Stopword")
print('-'*40)
for token in doc:
    print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")

Token		Lemma		 Stopword
----------------------------------------
Tea		Tea		False
is		is		True
healthy		healthy		False
and		and		True
calming		calming		False
,		,		False
do		do		True
n't		not		True
you		you		True
think		think		False
?		?		False


In [12]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

In [13]:
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']

In [14]:
patterns = [nlp(text) for text in terms]
patterns

[Galaxy Note, iPhone 11, iPhone XS, Google Pixel]

In [15]:
matcher.add("TerminologyList", None, *patterns)

In [16]:
text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.") 

In [17]:
matches = matcher(text_doc)

In [18]:
matches

[(3766102292120407359, 17, 19),
 (3766102292120407359, 22, 24),
 (3766102292120407359, 30, 32),
 (3766102292120407359, 33, 35)]

In [19]:
match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_doc[start:end])

TerminologyList iPhone 11


==========================================================

The business owner suggested you use diner reviews from the Yelp website to determine which dishes people liked and disliked. You pulled the data from Yelp. Before you get to analysis, run the code cell below for a quick look at the data you have to work with.

In [20]:
import pandas as pd

In [21]:
data = pd.read_json("restaurant.json")

In [22]:
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
109,lDJIaF4eYRF4F7g6Zb9euw,lb0QUR5bc4O-Am4hNq9ZGg,r5PLDU-4mSbde5XekTXSCA,4,2,0,0,I used to work food service and my manager at ...,2013-01-27 17:54:54
1013,vvIzf3pr8lTqE_AOsxmgaA,MAmijW4ooUzujkufYYLMeQ,r5PLDU-4mSbde5XekTXSCA,4,0,0,0,We have been trying Eggplant sandwiches all ov...,2015-04-15 04:50:56
1204,UF-JqzMczZ8vvp_4tPK3bQ,slfi6gf_qEYTXy90Sw93sg,r5PLDU-4mSbde5XekTXSCA,5,1,0,0,Amazing Steak and Cheese... Better than any Ph...,2011-03-20 00:57:45
1251,geUJGrKhXynxDC2uvERsLw,N_-UepOzAsuDQwOUtfRFGw,r5PLDU-4mSbde5XekTXSCA,1,0,0,0,Although I have been going to DeFalco's for ye...,2018-07-17 01:48:23
1354,aPctXPeZW3kDq36TRm-CqA,139hD7gkZVzSvSzDPwhNNw,r5PLDU-4mSbde5XekTXSCA,2,0,0,0,"Highs: Ambience, value, pizza and deserts. Thi...",2018-01-21 10:52:58


In [23]:
#The owner also gave you this list of menu items and common alternate spellings.
menu = ["Cheese Steak", "Cheesesteak", "Steak and Cheese", "Italian Combo", "Tiramisu", "Cannoli",
        "Chicken Salad", "Chicken Spinach Salad", "Meatball", "Pizza", "Pizzas", "Spaghetti",
        "Bruchetta", "Eggplant", "Italian Beef", "Purista", "Pasta", "Calzones",  "Calzone",
        "Italian Sausage", "Chicken Cutlet", "Chicken Parm", "Chicken Parmesan", "Gnocchi",
        "Chicken Pesto", "Turkey Sandwich", "Turkey Breast", "Ziti", "Portobello", "Reuben",
        "Mozzarella Caprese",  "Corned Beef", "Garlic Bread", "Pastrami", "Roast Beef",
        "Tuna Salad", "Lasagna", "Artichoke Salad", "Fettuccini Alfredo", "Chicken Parmigiana",
        "Grilled Veggie", "Grilled Veggies", "Grilled Vegetable", "Mac and Cheese", "Macaroni",  
         "Prosciutto", "Salami"]


In [24]:
# Create a list of tokens for each item in the menu
menu_tokens_list = [nlp(item) for item in menu]

In [25]:
matcher.add("MENU", None, *menu_tokens_list)
from collections import defaultdict

# item_ratings is a dictionary of lists. If a key doesn't exist in item_ratings,
# the key is added with an empty list as the value.
item_ratings = defaultdict(list)

for idx, review in data.iterrows():
    doc = nlp(review.text)
    # Using the matcher from the previous exercise
    matches = matcher(doc)
    
    # Create a set of the items found in the review text
    found_items = set([doc[match[1]:match[2]]for match in matches])
    
    # Update item_ratings with rating for each item in found_items
    # Transform the item strings to lowercase to make it case insensitive
    for item in found_items:
        item_ratings[str(item).lower()].append(review.stars)



In [45]:
dict_meanratings = {}
for item in item_ratings.items():
    dict_meanratings[item[0]] = [(sum(item[1])/len(item[1])) , len(item[1])]

In [47]:
dict_meanratings

{'chicken parmigiana': [4.444444444444445, 18],
 'eggplant': [3.968421052631579, 95],
 'pizza': [4.304469273743017, 358],
 'steak and cheese': [4.888888888888889, 9],
 'meatball': [4.079754601226994, 163],
 'cannoli': [4.337078651685394, 89],
 'pasta': [4.392156862745098, 255],
 'prosciutto': [4.619047619047619, 63],
 'purista': [4.641791044776119, 67],
 'cheese steak': [4.454545454545454, 88],
 'cheesesteak': [4.335616438356165, 146],
 'calzone': [4.263636363636364, 110],
 'italian combo': [3.909090909090909, 22],
 'tiramisu': [4.2592592592592595, 27],
 'chicken spinach salad': [4.5, 2],
 'italian beef': [4.0, 29],
 'salami': [4.21875, 32],
 'chicken parm': [4.155172413793103, 58],
 'ziti': [4.230769230769231, 26],
 'turkey sandwich': [3.8, 5],
 'chicken cutlet': [3.5454545454545454, 11],
 'tuna salad': [4.0, 5],
 'chicken pesto': [4.566666666666666, 30],
 'lasagna': [4.409638554216867, 83],
 'artichoke salad': [5.0, 5],
 'fettuccini alfredo': [5.0, 6],
 'pizzas': [4.393939393939394, 

In [48]:
dict_meanratings.values()

dict_values([[4.444444444444445, 18], [3.968421052631579, 95], [4.304469273743017, 358], [4.888888888888889, 9], [4.079754601226994, 163], [4.337078651685394, 89], [4.392156862745098, 255], [4.619047619047619, 63], [4.641791044776119, 67], [4.454545454545454, 88], [4.335616438356165, 146], [4.263636363636364, 110], [3.909090909090909, 22], [4.2592592592592595, 27], [4.5, 2], [4.0, 29], [4.21875, 32], [4.155172413793103, 58], [4.230769230769231, 26], [3.8, 5], [3.5454545454545454, 11], [4.0, 5], [4.566666666666666, 30], [4.409638554216867, 83], [5.0, 5], [5.0, 6], [4.393939393939394, 33], [5.0, 1], [4.552631578947368, 38], [4.5, 6], [4.444444444444445, 18], [4.021739130434782, 46], [3.8536585365853657, 41], [4.2105263157894735, 57], [4.111111111111111, 18], [4.8, 5], [4.6875, 16], [4.488888888888889, 45], [4.238095238095238, 21], [4.166666666666667, 6], [4.666666666666667, 6], [4.142857142857143, 7], [5.0, 2]])

In [43]:
# Find the worst item, and write it as a string in worst_text. This can be multiple lines of code if you want.
min_val = min(dict_meanratings.values())
lst = [key for key in dict_meanratings.keys() if dict_meanratings[key] == min_val]
worst_item = str(lst[0])

In [44]:
worst_item

'turkey breast'

In [40]:
# Find the worst item, and write it as a string in worst_text. This can be multiple lines of code if you
max_val = max(dict_meanratings.values())
lst = [key for key in dict_meanratings.keys() if dict_meanratings[key] == max_val]
best_item = str(lst[0])
best_item

'fettuccini alfredo'

#sorting a dictionary
sorted(counts, #dictionary_name
       key=counts.get, # sort by key
       reverse=True) # sort in descending order