# Import Benepar


In [1]:
import benepar
benepar.download("benepar_en2")
parser = benepar.Parser("benepar_en2")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package benepar_en2 to
[nltk_data]     C:\Users\jyzho\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en2 is already up-to-date!


# Preprocess data

In [10]:
def preprocess(reviews, brandlist, sample_size=2000, validation_size=0.1, 
               test_size=0.25, verbose=0, **kwargs):
    """Function that generates the dataset for Spacy training. 
    
    Input: Yelp dataset
    Output: train/test CSV for ER model training
    Parameters:
    - reviews: pandas dataframe of reviews
    - brandlist: pandas dataframe containing list of products/brands
    - sample_size: total number of reviews to subset
    - validation_size: proportion of total sample_size to validate on
    - test_size: proportion of total sample_size that will serve as the test set
    
    NOTE 
    ----
    config.data_path: workspace/data
    You should use workspace/data to put data to working on.  Let's say
    you have workspace/data/iris.csv, which you downloaded from:
    https://archive.ics.uci.edu/ml/datasets/iris. You will generate
    the following:
    + workspace/data/test.csv
    + workspace/data/train.csv
    + workspace/data/validation.csv
    + other files
    With these files you can train your model!
    """
    if verbose == 1:
      print("==> GENERATING DATASETS FOR TRAINING YOUR MODEL")

    # Convert brands in brand list to lowercase
    brandlist.word = brandlist.word.str.lower()

    # Extract a sample of reviews to generate training/validation/test data from
    sample = reviews.sample(n=sample_size)

    # Convert reviews to format relevant for spacy training
    if verbose == 1:
      print("   ===> CONVERTING DATA FOR SPACY")
    train_data = []
    print("LENGTH OF DATASET: ", len(sample))
    for index, row in tqdm(sample.iterrows()):
        # print(index)
        brands_tmp = []
        for brand in brandlist.word:
            text = row.text.lower()
            start_index = 0
            while start_index < len(text):
                start_index = text.find(brand, start_index)
                end_index = start_index + len(brand)
                if start_index == -1:
                    break
                if not text[start_index-1].isalpha() and (end_index == len(text) or not text[end_index].isalpha()):
                    if brand not in ['place', 'restaurant', 'cafe', 'establishment', 'diner']:
                        brands_tmp.append((start_index, end_index, "PRODUCT"))
                    else:
                        brands_tmp.append((start_index, end_index, "PRODUCT"))

                start_index += len(brand)
        train_data.append((row.review_id, row.text, brands_tmp))

    result = pd.DataFrame(train_data, columns=['review_id', 'text', 'entities'])

    # Split processed data into train/validation/test sets
    if verbose == 1:
      print("   ===> SPLITTING INTO TRAIN/VALIDATION/TEST SETS")
    train_validation, test = train_test_split(result, test_size=test_size)
    train, validation = train_test_split(train_validation, test_size=validation_size / (1-test_size))

    # Output to CSV in data folder
    train.to_csv('../workspace/data/train.csv')
    validation.to_csv('../workspace/data/validation.csv')
    test.to_csv('../workspace/data/test.csv')
    
    if verbose == 1:
      print("==> DATASETS GENERATED")
    
    return train, validation, test

# Training entity recognition model function

In [11]:
from __future__ import unicode_literals, print_function
import ast 
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import time
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


def create_train_data(df):
    train_data = []
    newnlp = spacy.load("en_core_web_sm")

    for i in range(len(df)):
        doc = newnlp(df['text'].iloc[i])
        entity_list = df['entities_clean'].iloc[i]
        for ent in doc.ents:
            entity_list.append((ent.start_char, ent.end_char, ent.label_))
        entity_dict = {"entities": entity_list}
        train_data.append((df['text'].iloc[i], entity_dict))
    return train_data

def create_test_data(df):
    test_data = []
    newnlp = spacy.load("en_core_web_sm")

    for i in range(len(df)):
        doc = newnlp(df['text'].iloc[i])
        entity_list = df['entities_clean'].iloc[i]
        for ent in doc.ents:
            entity_list.append((ent.start_char, ent.end_char, ent.label_))
        entity_dict = {"entities": entity_list}
        test_data.append((df['text'].iloc[i], entity_dict))
    return test_data


# new entity label
def train(train_data, test_data, LABEL, model='en_core_web_sm', new_model_name="product", output_dir='../ermodel', n_iter=1):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch the examples using spaCy's minibatch
        start = time.time()
        for itn in range(n_iter):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)            
            #print("Training Recall:",nlp.evaluate(random.sample(TRAIN_DATA,200)).ents_r)
            #print("Test Recall:",nlp.evaluate(TEST_DATA).ents_p) #COMMENT: isn't this precision?
            #COMMENT: so test data here is evaluating test_data which has the format 
            # of e.g. ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}) right
            #print("Training Losses", losses)
        end = time.time()
    print("Total training time:",end-start)

    # test the trained model (small sample test)
    for i in range(10):
        test_text = test_data[i][0]
        doc = nlp(test_text)
        print("Entities in '%s'" % test_text)
        for ent in doc.ents:
            print(ent.label_, ent.text)

    # TODO: Abstract to another function
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # COMMENT: Abstract to another function 
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
    return nlp


def run_training(file_name = "../workspace/data/train.csv", 
                 output_dir = '../workspace/models/er_model'):

    print("   ==> CONFIGURING FORMAT FOR SPACY TRAINING")

    df = pd.read_csv(file_name)
    df['entities_clean']=[ast.literal_eval(i) for i in df['entities']]
    #train_df, test_df = train_test_split(df, test_size = .2)
    all_train, _ = train_test_split(df, train_size=250)
    train_df, test_df = train_test_split(all_train, test_size=.2)
    
    # new entity label
    LABEL = "PRODUCT"
    
    TRAIN_DATA = create_train_data(train_df)
    TEST_DATA = create_test_data(test_df)

    print("   ==> TRAINING...")

    model = train(TRAIN_DATA, TEST_DATA, LABEL=LABEL, output_dir=output_dir)




In [12]:
def main_train(**kwargs):
    """Function that will run your model, be it a NN, Composite indicator
    or a Decision tree, you name it.

    NOTE
    ----
    config.models_path: workspace/models
    config.data_path: workspace/data

    As convention you should use workspace/data to read your dataset,
    which was build from generate() step. You should save your model
    binary into workspace/models directory.
    """
    print("==> TRAINING YOUR SPACY MODEL!")

    # TODO: Load data from workspace/data
    # TODO: Save trained model to workspace/models
    run_training()

# Retrieve Entity Function

In [13]:
def get_entities(nlp_model, text):
    """
    Input nlp_model and text, retrieve a list of unique entities from the text.
    """
    doc = nlp_model(text)
    entities = set()
    for ent in doc.ents:
        if ent.label_ == "PRODUCT":
            entities.add(ent.text)
    return list(entities)

# Sentiment Helper Code

In [14]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import collections
from tqdm import tqdm
from collections import defaultdict
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import benepar
import re
import spacy
from joblib import Parallel, delayed
import multiprocessing

class Predictor:
    def __init__(self,
                 sentiment_package = "vader",
                 parse_package = "benepar",
                 model_dir = "../workspace/models/er_model"):
        
        self.sentiment_package = sentiment_package
        self.nlp = spacy.load(model_dir)
        self.num_cores = multiprocessing.cpu_count()
        
        if parse_package == 'benepar':
            try:
                self.parser = benepar.Parser("benepar_en2") 
            except LookupError:
                benepar.download('benepar_en2')
                self.parser = benepar.Parser("benepar_en2")
        elif parse_package == 'stanford':   
            pass
        else:
            raise Exception('incorrect parse package')
        
    def _remove_nestings(self, lst): 
        output = []
        
        def _remove_nestings_recursive(l):
            for i in l: 
                if type(i) == list: 
                    _remove_nestings_recursive(i) 
                else: 
                    output.append(i)
        
        _remove_nestings_recursive(lst)
        
        return output
    
    def _continue_splitting(self, review, list_of_dividers):    
        temp = list_of_dividers.copy()
        l = [review]
        while len(temp) > 0:
            divider = temp.pop(0)
            l_new = []
            for i in l:
                l_new += i.split(divider)
            l = l_new
        return l
    
    
    def join_clause(self, review, list_of_split_clauses, list_of_dividers):
        output = []
        loc_of_split_clauses = []
        for clause in list_of_split_clauses:
            loc_of_split_clauses.append(review.find(clause))
        for divider in list_of_dividers:
            print(divider)
            loc_div = review.find(divider)
            print(loc_div)
            for i in range(len(loc_of_split_clauses)):
                if loc_div > loc_of_split_clauses[i]:
                    print(loc_div,loc_of_split_clauses[i])
    
    
    def join_partitions(self, long_review,entity_with_review):
        loclist = []
        for (_, clause) in entity_with_review:
            loclist.append((long_review.find(clause),long_review.find(clause)+len(clause)))
        starts = {i for (i,j) in loclist}
        ends = {j for (i,j) in loclist}
        starts.add(len(long_review))
        newends = {}
        for i in ends:
            newends[i] = min([x for x in starts if x >= i])
        for i in newends:
            pass
        new_entity_with_review = []
        for i in range(len(loclist)):
            tup = loclist[i]
            entity = entity_with_review[i][0]
            st = tup[0]
            en = newends[tup[1]]
            new_entity_with_review.append((entity,long_review[st:en]))
        return new_entity_with_review
    
    
    def split_long_string(self, review):
        num = len(review)
        split_list = []
        start = 0
        end = 0
        while num != end:
            #if one step away from end of review
            if num - end < 1000:
                end = num
                split_list.append(review[start:end])
            
            #otherwise, find the last full stop
            else:
                end = review[start:(start+1000)].rfind('.') + start
                if end == -1:
                    end = review[start:(start+1000)].rfind(' ') + start #if no '.', space will do
                    if end == -1:
                        end = min(start + 1000,num) + start #if there still isn't, then we simply split
                split_list.append(review[start:end])
                start = end
        return(split_list)
    
    
    def split_very_long_string(self, review):
        num = len(review)
        split_list = []
        start = 0
        end = 0
        while num != end:
            #if one step away from end of review
            if num - end < 1000:
                end = num
                split_list.append(review[start:end])
            
            #otherwise, find the last full stop
            else:
                end = review[start:(start+400)].rfind('.') + start
                if end == -1:
                    end = review[start:(start+400)].rfind(' ') + start #if no '.', space will do
                    if end == -1:
                        end = min(start + 400,num) + start #if there still isn't, then we simply split
                split_list.append(review[start:end])
                start = end
        return(split_list)
    
    
    def split_review_naive(self, review,entities):
        clauses = re.split('[.?!]',review)
        lenlist = [len(x) for x in clauses]
        clauses = [x for _, x in sorted(zip(lenlist,clauses),reverse=False)]
        entity_with_clause = []
        for entity in entities:
            for clause in clauses:
                if entity in clause:
                    entity_with_clause.append((entity,clause))
                    break
        return(self.join_partitions(review,entity_with_clause))
    
    
    def min_tree(self, review, entities, output = 'minimum'):
        
        #review is string, entities is list of strings, parser is parser object
        #possible outputs: no_parse, minimum, partition, all
        
        if output == 'no_parse':
            return(self.split_review_naive(review,entities))
            
        treelist = []
        lenlist = []
        temp = review.split('\n')
        
        if len(review) > 1000:
            split_reviews = self.split_long_string(review)
        else:
            split_reviews = [i for i in temp if len(i) > 1 and len(i) <= 1000 ]
        
        #if output is partition, we need to keep track of the full review
        if output == 'partition':
            full_review = ''
        
        #constituency parsers
        
        for rev in split_reviews:
            if rev and rev.strip():
                u = self.parser.parse(rev) # tree 
    
                if type(u) == str:
                    u = nltk.Tree.fromstring(u)
    
                for s in u.subtrees(): # subtrees 
                    if s.label() == 'S': # if sentence
                        treelist += [s]
                        lenlist += [len(s.leaves())] # how long clause
                            
                if output == 'partition':
                    full_review += ' '.join(u.leaves()) + ' '
    
        treelist = [x for _, x in sorted(zip(lenlist,treelist),reverse=False)] # sort by lenlisit
        clauses = [' '.join(tree.leaves()) for tree in treelist]
        
        #If there is no sentences detected, then the full review is the only clause.
        if not clauses:
            if output == 'partition':
                clauses.append(full_review)
            else:
                clauses.append(review)
        entity_with_clause = []
        
        if output == 'all':
            for entity in entities:
                clauselist = []
                for clause in clauses:
                    if entity in clause:
                        clauselist.append(clause)
                entity_with_clause.append((entity,clauselist))
        
        #TODO: create rules and test them
        elif output == 'minimum':
            for entity in entities:
                for clause in clauses:
                    if entity in clause:
                        entity_with_clause.append((entity,clause))
                        break
                        
        elif output == 'partition':
            #first find minimal clause
            for entity in entities:
                for clause in clauses:
                    if entity in clause:
                        entity_with_clause.append((entity,clause))
                        break
            #get location of minimal clause in review
            
            entity_with_clause = self.join_partitions(full_review,entity_with_clause)
        
        return entity_with_clause
    
    
    def dependency_tree(self, review, entities, output = 'split_min'):
        #possible output = split_min, split_all, tree_min, tree_all -> split only uses sentence splitter, while tree takes into account tree structure
        doc = self.parser(review)
        
        if output == 'split_min' or output == 'split_all' or output == 'split_part':
            clauses = list(doc.sents)
        #length of every clause
        
        lenlist = [len(str(x)) for x in clauses]
            
        #sort
        clauses = [str(x) for _, x in sorted(zip(lenlist,clauses),reverse=False)]
        
        
        entity_with_clause = []
        
        if output == 'split_min':
            for entity in entities:
                for clause in clauses:
                    if entity in clause:
                        entity_with_clause.append((entity,clause))
                        break
                        
        if output == 'split_all':
            for entity in entities:
                clauselist = []
                for clause in clauses:
                    if entity in clause:
                        clauselist.append(clause)
                entity_with_clause.append((entity,clauselist))
        
        if output == 'split_part':
            for entity in entities:
                for clause in clauses:
                    if entity in clause:
                        entity_with_clause.append((entity,clause))
                        break
            #get location of minimal clause in review
            
            entity_with_clause = self.join_partitions(review,entity_with_clause)
                
        
        return(entity_with_clause)


    def vader_sentiment(self, entity_with_clause):
        analyzer = SentimentIntensityAnalyzer()
        entity_with_sentiment = []
        for entity, clause in entity_with_clause:
            sentiment = analyzer.polarity_scores(clause)['compound']
            entity_with_sentiment.append((entity,sentiment))
        return(entity_with_sentiment)   


    def sentiment_analysis(self, entity_with_review, 
                           sentiment_package = 'stanford'):
        #takes in list of tuples
        if sentiment_package == 'stanford':
            return stanford_sentiment(entity_with_review)
        elif sentiment_package == 'vader':
            return self.vader_sentiment(entity_with_review)
        else:
            raise Exception('incorrect sentiment package')


    def sentiment_analysis_indiv(self, clause,sentiment_package = 'stanford'):
        #takes in a single review
        if sentiment_package == 'stanford':
            stanford_sentiment_start()
            result = nlp.annotate(clause,
                       properties={
                           'annotators': 'sentiment',
                           'outputFormat': 'json'
                       })
            return np.dot(result['sentences'][0]['sentimentDistribution'], [-2, -1, 0, 1, 2])
        elif sentiment_package == 'vader':
            analyzer = SentimentIntensityAnalyzer()
            return analyzer.polarity_scores(clause)['compound']
        else:
            raise Exception('incorrect sentiment package')

    
    def rule_1(self, review, entities):
        entity_with_review = self.min_tree(review, entities, output = 'minimum')
        entity_with_sentiment = self.sentiment_analysis(entity_with_review, 
                                                        self.sentiment_package)
        return entity_with_sentiment
    
    
    def rule_2(self, review, entities):
        entity_with_review = self.min_tree(review, entities, output = 'all')
        entity_with_sentiment = []
        sentiment = 0
        for ent, revlist in entity_with_review:
            for clause in revlist:
                sentiment = self.sentiment_analysis_indiv(clause,self.sentiment_package)
                if self.sentiment_package == 'vader' and sentiment != 0:
                    break
                elif self.sentiment_package == 'stanford' and abs(sentiment) > 0.5:
                    break
                    #if sentiment is not neutral, stop. If sentiment is neutral, keep going up tree.                    
            entity_with_sentiment.append((ent,sentiment))
        return entity_with_sentiment
    
    
    def rule_3(self, review, entities):
        entity_with_review = self.min_tree(review, entities, output = 'all')
        
        entity_with_sentiment = []
        for ent, revlist in entity_with_review:
            sentiment_list = []
            for clause in revlist:
                sentiment = self.sentiment_analysis_indiv(clause,self.sentiment_package)
                sentiment_list.append(sentiment)
            if not sentiment_list:
                sentiment_list.append(0)
            entity_with_sentiment.append((ent,np.mean(sentiment_list)))
        
        return entity_with_sentiment
    
    
    def rule_4(self, review, entities):
        entity_with_review = self.min_tree(review, entities, output = 'partition')
        entity_with_sentiment = self.sentiment_analysis(entity_with_review, self.sentiment_package)
        return entity_with_sentiment
    
    
    def rule_5(self, review, entities):
        entity_with_review = self.min_tree(review, entities, output = 'minimum')
        entity_with_review_p = self.min_tree(review, entities, output = 'partition')
        
        entity_with_sentiment = self.sentiment_analysis(entity_with_review, self.sentiment_package)
        for i in range(len(entity_with_sentiment)):
            sent = entity_with_sentiment[i][1]
            if self.sentiment_package == 'vader' and sent != 0:
                entity_with_sentiment[i] = (entity_with_sentiment[i][0],
                                            self.sentiment_analysis_indiv(entity_with_review_p[i][1],
                                                                          self.sentiment_package))
            elif self.sentiment_package == 'stanford' and abs(sent) > 0.5:
                entity_with_sentiment[i] = (entity_with_sentiment[i][0],
                                            self.sentiment_analysis_indiv(entity_with_review_p[i][1],
                                                                          self.sentiment_package))
    
        return entity_with_sentiment
    
    
    def rule_6(self, review, entities):
        entity_with_review = self.min_tree(review, entities, output = 'no_parse')
        return entity_with_review
    
    def rule_7(self, review, entities):
        self.parser = spacy.load("en_core_web_sm")
        entity_with_review = self.dependency_tree(review, entities, output = 'split_min')
        entity_with_sentiment = self.sentiment_analysis(entity_with_review, 
                                                        self.sentiment_package)
        return entity_with_sentiment
    
    def rule_8(self, review, entities):
        self.parser = spacy.load("en_core_web_sm")
        
        entity_with_review = self.dependency_tree(review, entities, output = 'split_all')
        new_entity_with_review = []
        entity_with_sentiment = []
        sentiment = 0
        for ent, revlist in entity_with_review:
            for clause in revlist:
                sentiment = self.sentiment_analysis_indiv(clause,self.sentiment_package)
                if self.sentiment_package == 'vader' and sentiment != 0:
                    new_entity_with_review.append((ent,clause))
                    break
                elif self.sentiment_package == 'stanford' and abs(sentiment) > 0.5:
                    new_entity_with_review.append((ent,clause))
                    break
                    #if sentiment is not neutral, stop. If sentiment is neutral, keep going up tree.                    
            entity_with_sentiment.append((ent,sentiment))
            
        entity_with_review = new_entity_with_review
        entity_with_sentiment = self.sentiment_analysis(entity_with_review, 
                                                        self.sentiment_package)
        return entity_with_sentiment 
        
    def rule_9(self, review, entities):
        self.parser = spacy.load("en_core_web_sm")
        
        entity_with_review = self.dependency_tree(review, entities, output = 'split_min')
        entity_with_review_p = self.dependency_tree(review, entities, output = 'split_part')
        
        entity_with_sentiment = self.sentiment_analysis(entity_with_review, self.sentiment_package)
        for i in range(len(entity_with_sentiment)):
            sent = entity_with_sentiment[i][1]
            if self.sentiment_package == 'vader' and sent != 0:
                entity_with_review[i] = entity_with_review_p[i]
            elif self.sentiment_package == 'stanford' and abs(sent) > 0.5:
                entity_with_review[i] = entity_with_review_p[i]
                
        return entity_with_sentiment
    
    
    
    def kill_host(self):
        if self.sentiment_package == "stanford":
            self.parser.kill_host()
        else:
            print("Stanford server not initialized")
            
            
    def get_entities(self, text):
        """
        Input nlp_model and text, retrieve a list of unique entities from the text.
        """
        doc = self.nlp(text)
        entities = set()
        for ent in doc.ents:
            if ent.label_ == "PRODUCT":
                entities.add(ent.text)
        return list(entities)
    
    
    def _parallelize_default(self, review):
        entities = self.get_entities(review)    
        result = self.rule_2(review, entities)
        return result
    
    
    def parallelize_predict(self, input_data):
        input_data = self.assert_list_form(input_data)
        entities_with_sentiment = Parallel(n_jobs=self.num_cores)(delayed(self._parallelize_default)(i) for i in input_data)
        return entities_with_sentiment
    
    # default using rule 2 for prediction
    def defaultPredict(self, input_data):
        entities_with_sentiment = []

        input_data = self.assert_list_form(input_data)

        for review in tqdm(input_data):
            entities = self.get_entities(review)    
            result = self.rule_2(review, entities)
            entities_with_sentiment.append(result)
        return entities_with_sentiment
        
    def assert_list_form(self, input_data):
        if not isinstance(input_data, list):
            input_data = [input_data]

        assert isinstance(input_data, list)
        assert isinstance(input_data[0], str) 

        return input_data
    
    def customPredict(self, input_data, rule_number=2):
        entities_with_sentiment = []

        input_data = self.assert_list_form(input_data)

        for review in input_data:
            entities = self.get_entities(review)
            if rule_number == 1:
                result = self.rule_1(review, entities)
            elif rule_number == 2:
                result = self.rule_2(review, entities)
            elif rule_number == 3:
                result = self.rule_3(review, entities)
            elif rule_number == 4:
                result = self.rule_4(review, entities)
            elif rule_number == 5:
                result = self.rule_5(review, entities)
            elif rule_number == 6:
                result = self.rule_6(review, entities)
            elif rule_number == 7:
                result = self.rule_7(review, entities)
            elif rule_number == 8:
                result = self.rule_8(review, entities)
            elif rule_number == 9:
                result = self.rule_9(review, entities)
            else:
                raise Exception('Rule number invalid, please choose something between 1 and 9')
                
            entities_with_sentiment.append(result)
            
        return entities_with_sentiment
    

# Spacy validation code

In [15]:
# -*- coding: utf-8 -*-
"""spacy_validate.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1KTr0oUxy27VOldpmjxfs-zf5nGwIwmaf
"""
from __future__ import unicode_literals, print_function
import ast 
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import time
import random
from sklearn.model_selection import train_test_split
import pandas as pd

def create_train_data(df):
    train_data = []
    newnlp = spacy.load("en_core_web_sm")

    for i in range(len(df)):
        doc = newnlp(df['text'].iloc[i])
        entity_list = df['entities_clean'].iloc[i]
        for ent in doc.ents:
            entity_list.append((ent.start_char, ent.end_char, ent.label_))
            entity_dict = {"entities": entity_list}
            train_data.append((df['text'].iloc[i], entity_dict))
    return train_data

def create_test_data(df):
    test_data = []
    newnlp = spacy.load("en_core_web_sm")

    for i in range(len(df)):
        doc = newnlp(df['text'].iloc[i])
        entity_list = df['entities_clean'].iloc[i]
        for ent in doc.ents:
            entity_list.append((ent.start_char, ent.end_char, ent.label_))
        entity_dict = {"entities": entity_list}
        test_data.append((df['text'].iloc[i], entity_dict))
    return test_data

def create_masked_train_data(df, masked_entities):
    train_data = []
    newnlp = spacy.load("en_core_web_sm")
  
    for i in range(len(df)):
        doc = newnlp(df['text'].iloc[i])
        entity_list = df['entities_clean'].iloc[i]
        for ent in doc.ents:
            if ent.text not in masked_entities:
                entity_list.append((ent.start_char, ent.end_char, ent.label_))
        entity_dict = {"entities": entity_list}
        train_data.append((df['text'].iloc[i], entity_dict))
    return train_data

def masked_train_test(train, test):
    brand_list = []
    for (index,entity_loc) in enumerate(train['entities_clean']):
        text = train['text'].iloc[index]
        for pair in entity_loc:
            brand_list.append(text[pair[0]:pair[1]])
      
    import numpy as np
    unique_brands = np.unique(brand_list)

    newbrand_list = []
    for (index, entity_loc) in enumerate(test['entities_clean']):
        text = test['text'].iloc[index]
        for pair in entity_loc:
            newbrand_list.append(text[pair[0]:pair[1]])
      
    import numpy as np
    newunique_brands = np.unique(newbrand_list)

    in_common = list(set(unique_brands) & set(newunique_brands))
    print("Total in common:",len(in_common))

    masked_entities, unmasked_entities = train_test_split(in_common, test_size = .5)
    print("Total masked:", len(masked_entities))

    # new entity label
    TRAIN_DATA = create_masked_train_data(train, masked_entities)
    TEST_DATA = create_test_data(test)
    return TRAIN_DATA, TEST_DATA, masked_entities, unique_brands, newunique_brands

def evaluate_novelty(trained_model, masked_train_data, masked_test_data, masked_entities, unmasked_train_data, unmasked_test_data):
    nomask_true = {}
    nomask = {}

    for review in unmasked_test_data:
        test_ents_true = [review[0][start:end] for (start, end, label) in review[1]['entities']]
        doc = trained_model(review[0])
        test_ents = [ent.text for ent in doc.ents]

        for entity in masked_entities:
            if entity in test_ents_true: 
                if (entity in test_ents):
                    if entity in nomask.keys():
                        nomask[entity] += 1
                        nomask_true[entity] +=1
                    else: nomask_true[entity] = 0; nomask[entity]=0
                elif entity in nomask_true.keys(): nomask_true[entity]+=1
                else: nomask_true[entity] = 0

    mask_true = {}
    mask = {}

    for review in masked_test_data:
        test_ents_true = [review[0][start:end] for (start, end, label) in review[1]['entities']]
        doc = trained_model(review[0])
        test_ents = [ent.text for ent in doc.ents]

    for entity in masked_entities:
        if entity in test_ents_true: 
            if (entity in test_ents):
                if entity in mask.keys():
                    mask[entity] += 1
                    mask_true[entity] +=1
                else: mask_true[entity] = 0; mask[entity]=0
            elif entity in mask_true.keys(): mask_true[entity]+=1
            else: mask_true[entity] = 0

    ratios_without_mask = {}
    for key in nomask.keys():
        if nomask_true[key] !=0:
            ratios_without_mask[key] = nomask[key]/nomask_true[key]
    ratios = {}
    for key in mask.keys():
        if mask_true[key] !=0:
            ratios[key] = mask[key]/mask_true[key]

    difference = {}
    for keys in ratios_without_mask:
        difference[keys] =  ratios[keys] - ratios_without_mask[keys]
    return difference, ratios, ratios_without_mask

def evaluate_spacy(trained_model_dir='../workspace/models/er_model', dataset_path="../workspace/data/test.csv", verbose=True):
    df = pd.read_csv(dataset_path)
    df['entities_clean']=[ast.literal_eval(i) for i in df['entities']]
    train_df, test_df = train_test_split(df, test_size = .2)
    trained_model = spacy.load(trained_model_dir)
    LABEL = "PRODUCT"
    masked_TRAIN_DATA, masked_TEST_DATA, masked_entities, unique_brands, newunique_brands = masked_train_test(train_df, test_df)

    TRAIN_DATA = create_train_data(train_df)
    TEST_DATA = create_test_data(test_df)

    difference, ratios, ratios_without_mask = evaluate_novelty(trained_model, masked_TRAIN_DATA,masked_TEST_DATA,masked_entities, TRAIN_DATA,TEST_DATA)
    if verbose == True:
        print('DIFFERENCES')
        print(difference)
        print('RATIOS WITH MASK')
        print(ratios)
        print('RATIOS WITHOUT MASK')
        print(ratios_without_mask)
    d = {'difference': difference, 'ratios with mask':ratios,'ratios without mask': ratios_without_mask}
    df = pd.DataFrame(data=d)
    return df

# Running End-to-End Validation

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from tqdm.notebook import tqdm
from collections import defaultdict

In [17]:
# GET ALL RESTAURANTS 

# import data
df_raw = pd.read_json("../data/restaurant_reviews_1900k.json", lines=True)

# only get restaurants with many reviews
many_reviews = df_raw[['business_id','review_id']].groupby("business_id")['review_id'].nunique()
many_reviews = many_reviews[many_reviews > 1000].index # more than 100 reviews
df = df_raw[df_raw.business_id.isin(set(many_reviews))]
print("Number of businesses in subset: ", len(df.business_id.unique()))

# only grab restaurants with 3-4 stars
business_stars = df[['business_id', 'stars']].groupby('business_id').mean()
business_ids_similar_stars= business_stars[
    (business_stars.stars >= 3.0) 
    & (business_stars.stars <= 4.0)].index

print("Number of businesses with 3.5-4.5 stars: ", len(business_ids_similar_stars.unique()))

Number of businesses in subset:  142
Number of businesses with 3.5-4.5 stars:  57


In [18]:
bus = df[df.business_id.isin(set(business_ids_similar_stars[:50]))]

In [19]:
brandlist = pd.read_csv('../workspace/data/wordnet_food_beverages_list.csv', header=None, names=['word'])

In [20]:
reviews = bus
business_ids_similar_stars = bus.business_id.unique()

In [54]:
preprocess(reviews, brandlist, sample_size=500, verbose=1)

==> GENERATING DATASETS FOR TRAINING YOUR MODEL
   ===> CONVERTING DATA FOR SPACY
LENGTH OF DATASET:  1000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


   ===> SPLITTING INTO TRAIN/VALIDATION/TEST SETS
==> DATASETS GENERATED


(                  review_id  \
 873  2ZpJNXaqYUGoRkkkoQ49ZQ   
 87   r-_e7xOO3Vm8icEP-xtThg   
 97   Usv0AqMza-a2r-YX9LJvVg   
 819  2s1_hTX2lexfHFlKvXEC-g   
 894  0eW0QCLr79rriOUCWTUztg   
 ..                      ...   
 603  fnfShJqwlsej62jYSOPbKA   
 168  axy5zMbLpptJKQPKWkOuww   
 66   O6yqb3xm1CIRXIw5poOnKw   
 477  X_e96eVHnZRz1NWNq7OjwQ   
 932  DUQn774cdjhDJRcrpRuLaw   
 
                                                   text  \
 873  I am pretty happy with the service we received...   
 87   Beer is served perfectly and the band is amazi...   
 97   -Great food\n-Great staff\n-The family photo b...   
 819  Tasty bagels, sandwiches and desserts.  Kind o...   
 894  This is one of my favorite hotel in Las Vegas....   
 ..                                                 ...   
 603  Killer atmosphere, great staff and even better...   
 168  The nachos were HUGE they were pretty good. Th...   
 66   Loved this buffet!  This is perhaps one of the...   
 477  Food was excellent

In [21]:
business_ids_similar_stars = reviews.business_id.unique()

In [56]:
# Train spacy model
main_train()

==> TRAINING YOUR SPACY MODEL!
   ==> CONFIGURING FORMAT FOR SPACY TRAINING
   ==> TRAINING...
Loaded model 'en_core_web_sm'
Total training time: 27.65606999397278
Entities in 'Probably the best ramen in town. The food is amazing and consistent. If you want solid Japanese Ramen then this is your place. The only downside is the dining area is small so hit it during off hours or be prepared to wait.  Well worth it.'
PRODUCT food
NORP Japanese
PRODUCT place
TIME off hours
Entities in 'Great food, great price, and amazing service. Pot roast fries are amazing!! I had the belgium waffle this morning and it was the best waffle I've ever had. The burgers are awesome. I literally love everything on the menu.'
PRODUCT food
PRODUCT price
PRODUCT service
PRODUCT fries
TIME this morning
PRODUCT menu
Entities in 'Our food was on point. The crab legs were  boiling hot and ready to eat...they already come cracked or sliced in half so no work needed. The prime rib lacked a deep flavored profile. Now fo

In [22]:
# Load spacy model
nlp = spacy.load('..\workspace\models\er_model')

In [None]:
from scipy.stats import spearmanr

correlation_scores = []

# Initialize predictor
predictor = Predictor()

for bus_id in tqdm(business_ids_similar_stars):
    print("Running on restaurant ", bus_id, "...")
    subset = bus[bus.business_id == bus_id]
    
    # only get reviews with enough amount of text
    reviews_subset = [review for review in subset.text if len(review) < 400]

    print("Number of Reviews left after subset length: ", len(reviews_subset))
    
    # get set of entities for this particular restaurant,
    # and count how many reviews each entity have
    entities_with_count = defaultdict(int) 
    review_entities = [] # extract entities for each review
    print("Extracting entities from each review...")
    for review in reviews_subset:
        entities = get_entities(nlp, review)

        # add this review as a count to an entity
        for ent in entities:
            entities_with_count[ent.lower()] += 1

        review_entities.append(entities)
        
    # only grab entities that have enough reviews
    print("Filtering entities to have enough reviews...")
    entities_with_enough_reviews = []
    threshold = 30
    for key, value in entities_with_count.items():
        if value >= threshold:
            entities_with_enough_reviews.append(key)
            
    # TRUE RANKINGS CALCULATION
    # for each entity, average ratings
    true_rankings = defaultdict(list)

    print("Calculating Yelp Star Rankings... ")
    for entity in entities_with_enough_reviews:
        true_rankings['entity'] += [entity]
        entity_reviews = subset[subset.text.str.contains(entity, case=False)]
        true_rankings['average_stars'] += [np.mean(entity_reviews.stars)]

    true_rankings = pd.DataFrame(true_rankings)
    
    # PREDICTION RANKING CALCULATION
    print("Calculating Prediction Rankings...")
    # Filter entities of each review to be from the entities_with_enough_review set
    entity_filter = set(entities_with_enough_reviews)

    filtered_entities = []

    for entities in review_entities:
        filtered = []
        for ent in entities:
            ent = ent.lower()
            if ent in entity_filter:
                filtered.append(ent)
        filtered_entities.append(filtered)
    
    # perform sentiment analysis for each review with filtered entities above
    predicted_scores = defaultdict(list)

    print("Performing sentiment analysis for each review... ")
    for i, review in enumerate(reviews_subset):
        scores = predictor.customPredict(review, 9)
        # save results 
        for entity, score in scores[0]:
            predicted_scores[entity] += [score]

    # create rankings from scores
    predicted_rankings = defaultdict(list)
    for entity, scores in predicted_scores.items():
        predicted_rankings['entity'] += [entity]
        predicted_rankings['predicted_score'] += [np.mean(scores)]
    print(predicted_rankings)
    predicted_rankings = pd.DataFrame(predicted_rankings)
    #### may not be necessary to do these castings
    predicted_rankings['entity'] = predicted_rankings['entity'].astype(str)
    true_rankings['entity'] = true_rankings['entity'].astype(str)
    ####
    
    full_rankings = true_rankings.merge(predicted_rankings, how='left').fillna(0)

    # spearman correlation metric
    print("Rankings result: ")
    print(full_rankings)
    
    corr, pvalue = spearmanr(full_rankings.average_stars, full_rankings.predicted_score)
    print("Spearman Correlation Score: ", corr)
    correlation_scores.append(corr)


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Running on restaurant  d_L-rfS1vT3JMzgCUGtiow ...
Number of Reviews left after subset length:  679
Extracting entities from each review...
Filtering entities to have enough reviews...
Calculating Yelp Star Rankings... 
Calculating Prediction Rankings...
Performing sentiment analysis for each review... 
defaultdict(<class 'list'>, {'entity': ['chef', 'planet', 'food', 'service', 'chicken', 'quesadilla', 'bit', 'price', 'dishes', 'salad', 'buffet', 'review', 'Food', 'deal', 'menu', 'fritters', 'skinny', 'sauce', 'meal', 'quesadillas', 'burger', 'bottom', 'nachos', 'guacamole', 'Service', 'Guac', 'Leche', 'restaurant', 'dish', 'taste', 'crab', 'ceviche', 'platter', 'mood', 'coconut', 'plate', 'place', 'downstairs', 'steak', 'bacon', 'soupy', 'Rice', 'slaw', 'sangria', 'salsa', 'fish', 'skirt', 'chips', 'meat', 'shrimp', 'Guacamole', 'spicy', 'breakfast', 'margarita', 'dinner', 'foodie', 'pepper', 'mole', 'brunch', 'sweet', 'chile', 'salsas', 'cream', 'black', 'order', 'suffers', 'staff', 

Filtering entities to have enough reviews...
Calculating Yelp Star Rankings... 
Calculating Prediction Rankings...
Performing sentiment analysis for each review... 
defaultdict(<class 'list'>, {'entity': ['peppers', 'steak', 'cheese', 'Service', 'tuna', 'corn', 'place', 'decor', 'food', 'bread', 'service', 'restaurant', 'price', 'sandwich', 'Brookie', 'fast', 'breakfast', 'boyfriend', 'chicken', 'salmon', 'staff', 'atmosphere', 'salad', 'Backyard', 'meal', 'burger', 'diner', 'menu', 'Food', 'bit', 'cream', 'friend', 'dinner', '$', 'neighborhood', 'freshly', 'pizza', 'prices', 'starters', 'calamari', 'dessert', 'table', 'pumpkin', 'dish', 'busser', 'brunch', 'problem', 'portion', 'run', 'side', 'busboy', 'quinoa', 'lunch', 'Enjoy', 'broccolini', 'Atmosphere', 'sprouts', 'mac', 'filet', 'meatloaf', 'booze', 'brussel', 'crispy', 'seasonal', 'office', 'Young', 'Salad', 'brown', 'potato', 'mash', 'cauliflower', 'box', 'pictures', 'meatball', 'beer', 'chef', 'toast', 'hash', 'plethora', 'pla

Filtering entities to have enough reviews...
Calculating Yelp Star Rankings... 
Calculating Prediction Rankings...
Performing sentiment analysis for each review... 
defaultdict(<class 'list'>, {'entity': ['buffet', 'crab', 'place', 'RIB', 'food', 'Service', 'cocktail', 'concentrate', 'machine', 'juice', 'cheese', 'chicken', 'pizza', 'price', 'sashimi', 'cookies', 'gelato', 'sweet', 'breakfast', 'juices', 'chance', 'bargain', 'dinner', 'meat', 'Food', 'beer', 'entree', 'dessert', 'part', 'plate', 'service', 'coupon', 'cuisine', 'brunch', 'choice', 'bartender', 'seafood', 'staff', 'freezer', 'oatmeal', '$', 'Breakfast', 'meal', 'front', 'restaurant', 'places', 'lunch', 'point', 'smoked', 'salmon', 'line', 'chinese', 'pork', 'inclusive', 'freshness', 'endless', 'pasta', 'donuts', 'mimosas', 'decor', 'Nice', 'weekend', 'tix4tonight', 'sauce', 'desert', 'coctail', 'Buffet', 'spectrum', 'mediocre', 'extra', 'froyo', 'machines', 'bagel', 'sandwich', 'deal', 'weekends', 'salad', 'Price', 'rien

In [None]:
np.mean(correlation_scores)

In [None]:
std_bus = []
for bus_id in tqdm(business_ids_similar_stars):
    print("Running on restaurant ", bus_id, "...")
    subset = bus[bus.business_id == bus_id]
    
    # only get reviews with enough amount of text
    reviews_subset = [review for review in subset.text if len(review) < 400]

    print("Number of Reviews left after subset length: ", len(reviews_subset))
    
    # get set of entities for this particular restaurant,
    # and count how many reviews each entity have
    entities_with_count = defaultdict(int) 
    review_entities = [] # extract entities for each review
    print("Extracting entities from each review...")
    for review in tqdm(reviews_subset):
        entities = get_entities(nlp, review)

        # add this review as a count to an entity
        for ent in entities:
            entities_with_count[ent.lower()] += 1

        review_entities.append(entities)
        
    # only grab entities that have enough reviews
    print("Filtering entities to have enough reviews...")
    entities_with_enough_reviews = []
    threshold = 30
    for key, value in entities_with_count.items():
        if value >= threshold:
            entities_with_enough_reviews.append(key)
            
    # TRUE RANKINGS CALCULATION
    # for each entity, average ratings
    true_rankings = defaultdict(list)

    print("Calculating Yelp Star Rankings... ")
    for entity in entities_with_enough_reviews:
        true_rankings['entity'] += [entity]
        entity_reviews = subset[subset.text.str.contains(entity, case=False)]
        true_rankings['average_stars'] += [np.mean(entity_reviews.stars)]

    true_rankings = pd.DataFrame(true_rankings)
    std_bus.append(np.std(true_rankings.average_stars))

In [None]:
len(std_bus)

In [None]:
plt.hist(std_bus)

In [None]:
boolean =std_bus > np.mean(std_bus) 

In [None]:
from itertools import compress
np.mean(list(compress(correlation_scores, boolean)))

In [None]:
correlation_scores

In [None]:
for stars in full_rankings.average_stars:
    print(np.std(stars))

In [None]:
reviews_subset[1023]

In [None]:
correlation_scores

In [None]:
print("Final Correlation Score: ", np.mean(correlation_scores))