In [1]:
import spacy 
from spacy.matcher import PhraseMatcher
import pandas as pd
from collections import defaultdict
import numpy as np
from spacy.util import minibatch
import random

In [None]:
nlp = spacy.load('en_core_web_lg')
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

In [2]:
doc = nlp("Tea is healthy and calming, don't you think?")

for token in doc:
    print(token)
    
print(f'Token \t\tLemma \t\tStopword'.format('Token', 'Lemma', 'Stopword'))
print('-'*40)
for token in doc:
    print(f'{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}')

Tea
is
healthy
and
calming
,
do
n't
you
think
?
Token 		Lemma 		Stopword
----------------------------------------
Tea		tea		False
is		be		True
healthy		healthy		False
and		and		True
calming		calming		False
,		,		False
do		do		True
n't		not		True
you		-PRON-		True
think		think		False
?		?		False


In [3]:
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]
matcher.add('TerminologyList', None, *patterns)

text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.")

matches = matcher(text_doc)
print(matches)

match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_doc[start:end])

[(3766102292120407359, 17, 19), (3766102292120407359, 22, 24), (3766102292120407359, 30, 32), (3766102292120407359, 33, 35)]
TerminologyList iPhone 11


In [4]:
data = pd.read_json('data/restaurant.json')
menu = ["Cheese Steak", "Cheesesteak", "Steak and Cheese", "Italian Combo", "Tiramisu", "Cannoli",
        "Chicken Salad", "Chicken Spinach Salad", "Meatball", "Pizza", "Pizzas", "Spaghetti",
        "Bruchetta", "Eggplant", "Italian Beef", "Purista", "Pasta", "Calzones",  "Calzone",
        "Italian Sausage", "Chicken Cutlet", "Chicken Parm", "Chicken Parmesan", "Gnocchi",
        "Chicken Pesto", "Turkey Sandwich", "Turkey Breast", "Ziti", "Portobello", "Reuben",
        "Mozzarella Caprese",  "Corned Beef", "Garlic Bread", "Pastrami", "Roast Beef",
        "Tuna Salad", "Lasagna", "Artichoke Salad", "Fettuccini Alfredo", "Chicken Parmigiana",
        "Grilled Veggie", "Grilled Veggies", "Grilled Vegetable", "Mac and Cheese", "Macaroni",  
         "Prosciutto", "Salami"]

In [5]:
index = 14
test_text = data.text.iloc[index]
review_doc = nlp(test_text)

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
menu_tokens_list = [nlp(items) for items in menu]

matcher.add('MENU', None, *menu_tokens_list)
matches = matcher(review_doc)

# print(f"Token number {matches[1]}: {review_doc[matches[1]:matches[2]]}")
for match in matches:
    print(f"{match[1]}:\t", review_doc[match[1]:match[2]])

2:	 Purista
16:	 prosciutto
58:	 meatball


In [7]:
item_ratings = defaultdict(list)

for idx, review in data.iterrows():
    doc = nlp(review.text)
    matches = matcher(doc)
    
    found_items = [doc[match[1]:match[2]] for match in matches]
    for item in found_items:
        item_ratings[str(item).lower()].append(review.stars)
        

In [41]:
average_item_ratings = defaultdict(list)
for key in item_ratings:
    average_item_ratings[key] = np.mean(item_ratings[key])

for key, value in reversed(sorted(average_item_ratings.items(), key=lambda item: item[1])):
    print("{:25}: {:3.2f}".format(key, value))
    
worst_item = sorted(average_item_ratings, key=average_item_ratings.get)[0]

corned beef              : 5.00
turkey breast            : 5.00
fettuccini alfredo       : 5.00
artichoke salad          : 5.00
steak and cheese         : 4.89
reuben                   : 4.80
pastrami                 : 4.69
chicken salad            : 4.67
purista                  : 4.64
prosciutto               : 4.62
chicken pesto            : 4.57
calzones                 : 4.55
grilled veggie           : 4.50
chicken spinach salad    : 4.50
gnocchi                  : 4.49
cheese steak             : 4.45
mac and cheese           : 4.44
chicken parmigiana       : 4.44
lasagna                  : 4.41
pizzas                   : 4.39
pasta                    : 4.39
cannoli                  : 4.34
cheesesteak              : 4.34
pizza                    : 4.30
calzone                  : 4.26
tiramisu                 : 4.26
chicken parmesan         : 4.24
ziti                     : 4.23
salami                   : 4.22
italian sausage          : 4.21
macaroni                 : 4.17
chicken 

In [44]:
counts = {item: len(ratings) for item, ratings in item_ratings.items()}
item_counts = sorted(counts, key=counts.get, reverse=True)
for item in item_counts:
    print(f"{item:25}{counts[item]:>5}")
    

pizza                      358
pasta                      255
meatball                   163
cheesesteak                146
calzone                    110
eggplant                    95
cannoli                     89
cheese steak                88
lasagna                     83
purista                     67
prosciutto                  63
chicken parm                58
italian sausage             57
garlic bread                46
gnocchi                     45
spaghetti                   41
calzones                    38
pizzas                      33
salami                      32
chicken pesto               30
italian beef                29
tiramisu                    27
ziti                        26
italian combo               22
chicken parmesan            21
chicken parmigiana          18
mac and cheese              18
portobello                  18
pastrami                    16
chicken cutlet              11
steak and cheese             9
roast beef                   7
fettucci

In [53]:
sorted_ratings = sorted(average_item_ratings, key=average_item_ratings.get)
print('Worst rated menu items:')
for item in sorted_ratings[:10]:
    print(f'{item:25} AR: {average_item_ratings[item]:3.2f} \tC: {counts[item]}')
    
print('\nBest rated menu items:')
for item in reversed(sorted_ratings[-10:]):
    print(f'{item:25} AR: {average_item_ratings[item]:3.2f} \tC: {counts[item]}')
    

Worst rated menu items:
chicken cutlet            AR: 3.55 	C: 11
turkey sandwich           AR: 3.80 	C: 5
spaghetti                 AR: 3.85 	C: 41
italian combo             AR: 3.91 	C: 22
eggplant                  AR: 3.97 	C: 95
italian beef              AR: 4.00 	C: 29
tuna salad                AR: 4.00 	C: 5
garlic bread              AR: 4.02 	C: 46
meatball                  AR: 4.08 	C: 163
portobello                AR: 4.11 	C: 18

Best rated menu items:
corned beef               AR: 5.00 	C: 2
turkey breast             AR: 5.00 	C: 1
fettuccini alfredo        AR: 5.00 	C: 6
artichoke salad           AR: 5.00 	C: 5
steak and cheese          AR: 4.89 	C: 9
reuben                    AR: 4.80 	C: 5
pastrami                  AR: 4.69 	C: 16
chicken salad             AR: 4.67 	C: 6
purista                   AR: 4.64 	C: 67
prosciutto                AR: 4.62 	C: 63


In [59]:
sorted_counts = sorted(counts, key=counts.get)
print('Least reviewed menuitems:')
for item in sorted_counts[:10]:
    print(f'{item:25} C: {counts[item]:<7} AR: {average_item_ratings[item]:3.2f}')
    
print('\nMost reviwed menu items:')
for item in reversed(sorted_counts[-10:]):
    print(f'{item:25} C: {counts[item]:<7} AR: {average_item_ratings[item]:3.2f}')

Least reviewed menuitems:
turkey breast             C: 1       AR: 5.00
chicken spinach salad     C: 2       AR: 4.50
corned beef               C: 2       AR: 5.00
turkey sandwich           C: 5       AR: 3.80
tuna salad                C: 5       AR: 4.00
artichoke salad           C: 5       AR: 5.00
reuben                    C: 5       AR: 4.80
fettuccini alfredo        C: 6       AR: 5.00
grilled veggie            C: 6       AR: 4.50
macaroni                  C: 6       AR: 4.17

Most reviwed menu items:
pizza                     C: 358     AR: 4.30
pasta                     C: 255     AR: 4.39
meatball                  C: 163     AR: 4.08
cheesesteak               C: 146     AR: 4.34
calzone                   C: 110     AR: 4.26
eggplant                  C: 95      AR: 3.97
cannoli                   C: 89      AR: 4.34
cheese steak              C: 88      AR: 4.45
lasagna                   C: 83      AR: 4.41
purista                   C: 67      AR: 4.64


In [2]:
spam = pd.read_csv('data/spam.csv')
nlp = spacy.blank('en')

textcat = nlp.create_pipe('textcat', config={'exclusive_classes': True, 'architechture': 'bow'})
nlp.add_pipe(textcat)

textcat.add_label('ham')
textcat.add_label('spam')

train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham', 'spam': label == 'spam'}}
                for label in spam['label']]

train_data = list(zip(train_texts, train_labels))

In [3]:
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

batches = minibatch(train_data, size=8)

for batch in batches:
    text, labels = zip(*batch)
    nlp.update(text, labels, sgd=optimizer)
    

In [4]:
random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    batches = minibatch(train_data, size=8)
    for batch in batches:
        text, labels = zip(*batch)
        nlp.update(text, labels, sgd=optimizer, losses=losses)
    print(losses)
    

{'textcat': 0.21975726346467503}
{'textcat': 0.318620875587339}
{'textcat': 0.39770351629094536}
{'textcat': 0.43069386248883706}
{'textcat': 0.45787767076693225}
{'textcat': 0.45834652515703533}
{'textcat': 0.45834678759302133}
{'textcat': 0.4583468100206913}
{'textcat': 0.4583468193062615}
{'textcat': 0.45834682363745427}


In [6]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]

docs = [nlp.tokenizer(text) for text in texts]

textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

predicted_lables = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_lables])

[[9.9999940e-01 6.2277752e-07]
 [3.3817263e-04 9.9966180e-01]]
['ham', 'spam']


In [2]:

def load_data(csv_file, split=0.9):
    data = pd.read_csv(csv_file)
    
    # Shuffle data
    train_data = data.sample(frac=1, random_state=7)
    
    texts = train_data.text.values
    labels = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)}
              for y in train_data.sentiment.values]
    split = int(len(train_data) * split)
    
    train_labels = [{"cats": labels} for labels in labels[:split]]
    val_labels = [{"cats": labels} for labels in labels[split:]]
    
    return texts[:split], train_labels, texts[split:], val_labels

train_texts, train_labels, val_texts, val_labels = load_data('data/yelp_ratings.csv')

print('Texts from training data\n------')
print(train_texts[:2])
print('\nLabels from training data\n------')
train_labels[:2]

Texts from training data
------
["Some of the best sushi I've ever had....and I come from the East Coast.  Unreal toro, have some of it's available."
 "One of the best burgers I've ever had and very well priced. I got the tortilla burger and is was delicious especially with there tortilla soup!"]

Labels from training data
------


[{'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}}]

In [3]:
nlp = spacy.blank('en')

textcat = nlp.create_pipe('textcat', config={'exclusive_classes': True, 'architechture': 'bow'})
nlp.add_pipe(textcat)

textcat.add_label('NEGATIVE')
textcat.add_label('POSITIVE')

def train(model, train_data, optimizer):
    
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    
    batches = minibatch(train_data, size=8)
    for batch in batches:
        texts, labels = zip(*batch)
        model.update(texts, labels, sgd=optimizer, losses=losses)
    return losses
        
spacy.util.fix_random_seed(1)
random.seed(1)

optimizer = nlp.begin_training()
train_data = list(zip(train_texts, train_labels))
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])


8.859361211946009


In [25]:
text = "This tea cup was full of holes. Do not recommend."
doc = nlp(text)
print(doc.cats)

{'NEGATIVE': 0.9836472868919373, 'POSITIVE': 0.016352694481611252}


In [45]:
def predict(model, texts):
    docs = [model.tokenizer(text) for text in texts]
    
    textcat = model.get_pipe('textcat')
    scores, _ = textcat.predict(docs)

    predicted_labels = scores.argmax(axis=1)
    return predicted_labels

texts = val_texts[34:38]
predictions = predict(nlp, texts)

for p, t in zip(predictions, texts):
    print(f"{textcat.labels[p]}: {t} \n")
    

POSITIVE: Came over and had their "Pick 2" lunch combo and chose their best selling 1/2 chicken sandwich with quinoa.  Both were tasty, the chicken salad is a bit creamy but was perfect with quinoa on the side.  This is a good lunch joint, casual and clean! 

POSITIVE: Went here last night and got oysters, fried okra, fries, and onion rings. I cannot complain. The portions were great and tasty!!! I will definitely be back for more. I cannot wait to try the crawfish boudin and soft shell crab. 

POSITIVE: This restaurant was fantastic! 
The concept of eating without vision was intriguing. The dinner was filled with laughs and good conversation. 

We were lead in a line to our table and each person to their seat. This was not just dark but you could not see something right in front of your face. 

The waiters/waitresses were all blind and allowed us to see how aware you need to be without the vision. 

Taking away one sense is said to increase your other senses so as taste and hearing wh

In [48]:
def evaluate(model, tt, ll):
    """ Returns the accuracy of a TextCategorizer model. 

        Arguments
        ---------
        model: ScaPy model with a TextCategorizer
        texts: Text samples, from load_data function
        labels: True labels, from load_data function

    """
    predicted_class = predict(model, tt)
    true_class = [int(each['cats']['POSITIVE']) for each in ll]
    correct_predictions = predicted_class == true_class
        
    accuracy = correct_predictions.mean()
    return accuracy
    

accuracy = evaluate(nlp, val_texts, val_labels)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9499


In [None]:
epochs = 5
for i in range(epochs):
    losses = train(nlp, train_data, optimizer)
    accuracy = evaluate(nlp, val_texts, val_labels)
    print(f"Loss: {losses['textcat']:.3f} \t Accuracy: {accuracy:.3f}")
    


Loss: 3.685 	 Accuracy: 0.946
