# Code to run spaCy NER training

Before running, change the data path (the `datapath` argument of `main`) so that it points to your training CSV.

In [0]:
# `from __future__` imports must be the first statement of the cell/module.
from __future__ import unicode_literals, print_function

import ast
import random
import time
from pathlib import Path

import numpy as np
import pandas as pd
import plac
import spacy
from sklearn.model_selection import train_test_split
from spacy.util import minibatch, compounding

In [0]:
def create_train_data(df):
  """Build spaCy-style NER training examples from an annotated DataFrame.

  For every row, the spans stored in ``entities_clean`` are combined with
  the entities that the stock ``en_core_web_sm`` pipeline predicts on the
  row's ``text``.

  Args:
    df: DataFrame with a ``text`` column and an ``entities_clean`` column
      holding, per row, a list of ``(start_char, end_char, label)`` spans.

  Returns:
    A list of ``(text, {"entities": [...]})`` tuples, one per row, with the
    row's own spans first and the predicted spans after them.
  """
  train_data = []
  newnlp = spacy.load("en_core_web_sm")

  for text, annotated_spans in zip(df['text'], df['entities_clean']):
    # Work on a copy: appending to the list stored in the DataFrame cell
    # would permanently alter ``df`` and duplicate the predicted entities
    # every time a dataset is built from the same frame.
    entity_list = list(annotated_spans)
    doc = newnlp(text)
    entity_list.extend(
        (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
    train_data.append((text, {"entities": entity_list}))
  return train_data

In [0]:
def create_test_data(df):
  """Build spaCy-style NER evaluation examples from an annotated DataFrame.

  Each row's ``entities_clean`` spans are merged with the entities that the
  stock ``en_core_web_sm`` pipeline predicts on the row's ``text``.

  Args:
    df: DataFrame with a ``text`` column and an ``entities_clean`` column
      holding, per row, a list of ``(start_char, end_char, label)`` spans.

  Returns:
    A list of ``(text, {"entities": [...]})`` tuples, one per row.
  """
  test_data = []
  newnlp = spacy.load("en_core_web_sm")

  for text, annotated_spans in zip(df['text'], df['entities_clean']):
    # Copy before extending so the list held by the DataFrame is untouched;
    # extending it in place would leak predicted entities into any other
    # dataset later built from the same frame.
    entity_list = list(annotated_spans)
    for ent in newnlp(text).ents:
      entity_list.append((ent.start_char, ent.end_char, ent.label_))
    test_data.append((text, {"entities": entity_list}))
  return test_data

In [0]:
# new entity label
def train(train_data=None, test_data=None, model='en_core_web_sm', new_model_name="product", output_dir='/content/drive/My Drive/ermodel', n_iter=1,verbose=True, label=None):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Uses the spaCy v2 training calls (``create_pipe``, ``begin_training`` /
    ``resume_training``, ``nlp.update(texts, annotations, ...)``).

    Args:
        train_data: list of ``(text, {"entities": [...]})`` tuples. When
            omitted, the module-level ``TRAIN_DATA`` is used if it exists.
        test_data: examples in the same format; the texts of the first 10
            are used for sample predictions. When omitted, the module-level
            ``TEST_DATA`` is used if it exists, otherwise no samples are run.
        model: name or path passed to ``spacy.load``; ``None`` starts from a
            blank English pipeline.
        new_model_name: value stored in ``nlp.meta["name"]`` before saving.
        output_dir: directory the model is saved to and reloaded from;
            ``None`` skips saving.
        n_iter: number of passes over the training data.
        verbose: when true, print predictions for up to 10 test texts.
        label: entity label added to the recognizer. When omitted, the
            module-level ``LABEL`` is used if defined, else ``"PRODUCT"``.

    Returns:
        The trained ``nlp`` pipeline.

    Raises:
        ValueError: if no training data is passed and no module-level
            ``TRAIN_DATA`` exists.
    """
    # The fallbacks are resolved at call time. Binding them as default
    # arguments made the ``def`` itself fail with NameError on a fresh kernel.
    module_globals = globals()
    if train_data is None:
        train_data = module_globals.get("TRAIN_DATA")
    if train_data is None:
        raise ValueError("train() needs train_data (or a module-level TRAIN_DATA)")
    if test_data is None:
        test_data = module_globals.get("TEST_DATA")
    if test_data is None:
        test_data = []
    if label is None:
        label = module_globals.get("LABEL", "PRODUCT")

    # Shuffle a private copy so the caller's list keeps its order.
    train_data = list(train_data)

    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline,
    # otherwise fetch it so the new label can be registered on it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(label)  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        start = time.time()
        for itn in range(n_iter):
            random.shuffle(train_data)
            # batch the examples using spaCy's minibatch
            batches = minibatch(train_data, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            # TODO: report per-iteration losses / precision / recall here.
        end = time.time()
    print("Total training time:",end-start)

    # test the trained model (small sample test); slicing tolerates test
    # sets with fewer than 10 examples.
    sample_texts = [example[0] for example in test_data[:10]]
    if verbose:
        print('Sample predictions from 10 reviews in test set:')
        for sample_text in sample_texts:
            doc = nlp(sample_text)
            print("Entities in '%s'" % sample_text)
            for ent in doc.ents:
                print(ent.label_, ent.text)
    # Text used to smoke-test the reloaded model; defined regardless of
    # ``verbose`` so the save branch never hits an unbound name.
    test_text = sample_texts[-1] if sample_texts else None

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        if test_text is not None:
            doc2 = nlp2(test_text)
            for ent in doc2.ents:
                print(ent.label_, ent.text)
    return nlp

Loaded model 'en_core_web_sm'
Total training time: 169.23812222480774
Entities in 'new korean restaurant in north york neighbourhood. they offers topokki (rice cake) in three kinds sauces, red (spicy), white (creamy), and brown (korean bbq sauce). they also have few appetizers, and bingsoo (shaved ice).

we order brown topokki in brown sauce with bulgolgi, with added cheese (for additional $2). the taste is not too spectacular but decent, like other korean restaurants nearby. but i like the innovation of the restaurant, a good combination of traditional korean food, snacks, and bingsoo. it has something for everyone. the decoration is very modern and bright, lots of seats. service is very attentive, and they give you a feedback form to fill out after meal.

i would recommend this place to others.'
PRODUCT rice
PRODUCT cake
CARDINAL three
PRODUCT white
PRODUCT sauce
PRODUCT ice
PRODUCT sauce
PRODUCT bulgolgi
PRODUCT cheese
MONEY additional $2
PRODUCT taste
PRODUCT food
PRODUCT meal
Enti

In [0]:
def main(datapath='/content/drive/My Drive/spacy_train_clean.csv',output_dir='/content/drive/My Drive/ermodel', verbose=True):
  """Load the annotated CSV, split it 80/20 and train the NER model.

  Args:
    datapath: path of a CSV with a ``text`` column and an ``entities_clean``
      column whose cells are Python-literal strings.
    output_dir: directory handed to ``train`` for saving the model.
    verbose: forwarded to ``train``.

  Returns:
    Whatever ``train`` returns (the trained pipeline).
  """
  # train() looks the label up as a module-level name, so it is published
  # globally here; a plain local assignment would never be seen by train().
  global LABEL
  LABEL = "PRODUCT"

  df = pd.read_csv(datapath)
  # The annotations are stored as text in the CSV; literal_eval parses them
  # back into Python objects without executing arbitrary code.
  df['entities_clean'] = [ast.literal_eval(cell) for cell in df['entities_clean']]
  train_df, test_df = train_test_split(df, test_size = .2)
  TRAIN_DATA = create_train_data(train_df)
  TEST_DATA = create_test_data(test_df)
  model = train(TRAIN_DATA,TEST_DATA,output_dir=output_dir,verbose=verbose)
  return model

# Code to validate the spaCy model

In [None]:
# -*- coding: utf-8 -*-
"""spacy_validate.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1KTr0oUxy27VOldpmjxfs-zf5nGwIwmaf
"""
from __future__ import unicode_literals, print_function
import ast 
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import time
import random
from sklearn.model_selection import train_test_split
import pandas as pd

def create_train_data(df):
  """Turn an annotated DataFrame into spaCy-style training examples.

  The spans listed in each row's ``entities_clean`` are followed by the
  entities that ``en_core_web_sm`` predicts on that row's ``text``.

  Args:
    df: DataFrame with ``text`` and ``entities_clean`` columns; each
      ``entities_clean`` cell is a list of ``(start_char, end_char, label)``.

  Returns:
    A list of ``(text, {"entities": [...]})`` tuples, one per row.
  """
  train_data = []
  newnlp = spacy.load("en_core_web_sm")

  for text, annotated_spans in zip(df['text'], df['entities_clean']):
    # Copy first: the original list lives inside the DataFrame, and
    # appending to it would change ``df`` for every later consumer (and
    # double the predicted entities on a second call).
    entity_list = list(annotated_spans)
    doc = newnlp(text)
    entity_list.extend(
        (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
    train_data.append((text, {"entities": entity_list}))
  return train_data

def create_test_data(df):
  """Turn an annotated DataFrame into spaCy-style evaluation examples.

  The spans listed in each row's ``entities_clean`` are followed by the
  entities that ``en_core_web_sm`` predicts on that row's ``text``.

  Args:
    df: DataFrame with ``text`` and ``entities_clean`` columns; each
      ``entities_clean`` cell is a list of ``(start_char, end_char, label)``.

  Returns:
    A list of ``(text, {"entities": [...]})`` tuples, one per row.
  """
  test_data = []
  newnlp = spacy.load("en_core_web_sm")

  for text, annotated_spans in zip(df['text'], df['entities_clean']):
    # Never extend the DataFrame's own list: other datasets built from the
    # same frame would silently inherit the predicted entities.
    entity_list = list(annotated_spans)
    for ent in newnlp(text).ents:
      entity_list.append((ent.start_char, ent.end_char, ent.label_))
    test_data.append((text, {"entities": entity_list}))
  return test_data

def create_masked_train_data(df, masked_entities):
  """Build training examples while hiding selected predicted entities.

  Works like the unmasked builder, except that an entity predicted by
  ``en_core_web_sm`` is dropped when its text is in ``masked_entities``.
  Spans already present in ``entities_clean`` are always kept.

  Args:
    df: DataFrame with ``text`` and ``entities_clean`` columns; each
      ``entities_clean`` cell is a list of ``(start_char, end_char, label)``.
    masked_entities: collection of entity strings to leave out of the
      predicted annotations.

  Returns:
    A list of ``(text, {"entities": [...]})`` tuples, one per row.
  """
  train_data = []
  newnlp = spacy.load("en_core_web_sm")
  hidden = set(masked_entities)  # constant-time membership checks

  for text, annotated_spans in zip(df['text'], df['entities_clean']):
    # Copy so the DataFrame cell is not extended in place; otherwise the
    # "masked" and "unmasked" datasets built from the same frame would end
    # up sharing (and repeatedly growing) one annotation list.
    entity_list = list(annotated_spans)
    for ent in newnlp(text).ents:
      if ent.text not in hidden:
        entity_list.append((ent.start_char, ent.end_char, ent.label_))
    train_data.append((text, {"entities": entity_list}))
  return train_data

def masked_train_test(train, test):
  """Pick entities to mask and build the masked train / plain test sets.

  The entity strings annotated in ``train`` and in ``test`` are collected;
  those occurring in both are split in half at random, and the first half
  is masked out of the training data.

  Args:
    train: DataFrame with ``text`` and ``entities_clean`` columns.
    test: DataFrame with the same columns.

  Returns:
    ``(TRAIN_DATA, TEST_DATA, masked_entities, unique_brands,
    newunique_brands)`` where the last two are the sorted unique entity
    strings (NumPy arrays) of ``train`` and ``test`` respectively.

  Raises:
    ValueError: if fewer than two entities are shared by ``train`` and
      ``test`` (a 50/50 split is impossible).
  """
  import numpy as np  # imported locally, as in the original function

  unique_brands = np.unique(_annotated_entity_texts(train))
  newunique_brands = np.unique(_annotated_entity_texts(test))

  # Sorted so the input to the random split does not depend on set ordering.
  in_common = sorted(set(unique_brands) & set(newunique_brands))
  print("Total in common:",len(in_common))

  if len(in_common) < 2:
    # The splitter rejects such input anyway; fail with a clearer message.
    raise ValueError(
        "Need at least two entities shared by train and test to mask half "
        "of them; found %d" % len(in_common))

  masked_entities, unmasked_entities = train_test_split(in_common, test_size = .5)
  print("Total masked:", len(masked_entities))

  TRAIN_DATA = create_masked_train_data(train, masked_entities)
  TEST_DATA = create_test_data(test)
  return TRAIN_DATA, TEST_DATA, masked_entities, unique_brands, newunique_brands


def _annotated_entity_texts(df):
  """Return the text of every span listed in ``df['entities_clean']``."""
  texts = []
  for text, spans in zip(df['text'], df['entities_clean']):
    for span in spans:
      texts.append(text[span[0]:span[1]])
  return texts

def evaluate_novelty(trained_model, masked_train_data, masked_test_data, masked_entities, unmasked_train_data, unmasked_test_data):
  """Compare per-entity detection rates on masked vs. unmasked test data.

  For each entity in ``masked_entities`` the detection rate is the number
  of reviews where the entity is annotated AND predicted by
  ``trained_model``, divided by the number of reviews where it is annotated.

  Args:
    trained_model: callable returning an object with ``.ents`` (each having
      ``.text``) for a given text.
    masked_train_data: unused; kept for interface compatibility.
    masked_test_data: list of ``(text, {"entities": [(start, end, label)]})``.
    masked_entities: entity strings to score.
    unmasked_train_data: unused; kept for interface compatibility.
    unmasked_test_data: same format as ``masked_test_data``.

  Returns:
    ``(difference, ratios, ratios_without_mask)``: three dicts keyed by
    entity. ``ratios`` / ``ratios_without_mask`` hold the detection rate on
    the masked / unmasked test data for every entity annotated there at
    least once (0.0 when never detected). ``difference`` is
    ``ratios - ratios_without_mask`` for entities present in both.
  """
  ratios_without_mask = _entity_detection_rates(
      trained_model, unmasked_test_data, masked_entities)
  ratios = _entity_detection_rates(
      trained_model, masked_test_data, masked_entities)

  # Only entities scored on both datasets can be compared; indexing
  # ``ratios`` blindly would raise KeyError for the others.
  difference = {
      entity: ratios[entity] - ratios_without_mask[entity]
      for entity in ratios_without_mask
      if entity in ratios
  }
  return difference, ratios, ratios_without_mask


def _entity_detection_rates(trained_model, dataset, entities):
  """Return {entity: detected_reviews / annotated_reviews} over ``dataset``."""
  annotated = {}  # reviews in which the entity is annotated
  detected = {}   # of those, reviews in which the model also predicts it
  for text, annotations in dataset:
    true_texts = {text[start:end] for (start, end, label) in annotations['entities']}
    predicted_texts = {ent.text for ent in trained_model(text).ents}
    for entity in entities:
      if entity not in true_texts:
        continue
      # Counting starts at 1 on the first sighting and an existing total is
      # never reset when the first detection arrives later.
      annotated[entity] = annotated.get(entity, 0) + 1
      if entity in predicted_texts:
        detected[entity] = detected.get(entity, 0) + 1
  return {entity: detected.get(entity, 0) / total
          for entity, total in annotated.items()}

def evaluate_spacy(trained_model_dir='./workspace/models/er_model', dataset_path="./workspace/data/test.csv", verbose=True):
  """Run the masked-entity novelty evaluation for a saved spaCy model.

  Args:
    trained_model_dir: directory of the saved model, passed to ``spacy.load``.
    dataset_path: CSV with a ``text`` column and an ``entities`` column whose
      cells are Python-literal strings.
    verbose: when true, print the three result dictionaries.

  Returns:
    DataFrame indexed by entity with the columns ``difference``,
    ``ratios with mask`` and ``ratios without mask``.
  """
  df = pd.read_csv(dataset_path)
  df['entities_clean']=[ast.literal_eval(i) for i in df['entities']]
  train_df, test_df = train_test_split(df, test_size = .2)
  trained_model = spacy.load(trained_model_dir)

  def with_private_annotations(frame):
    # The dataset builders may extend the annotation lists they read. Each
    # builder therefore gets a frame with its own list copies, so the masked
    # and unmasked datasets cannot contaminate one another.
    frame = frame.copy()
    frame['entities_clean'] = [list(spans) for spans in frame['entities_clean']]
    return frame

  masked_TRAIN_DATA, masked_TEST_DATA, masked_entities, unique_brands, newunique_brands = masked_train_test(
      with_private_annotations(train_df), with_private_annotations(test_df))

  TRAIN_DATA = create_train_data(with_private_annotations(train_df))
  TEST_DATA = create_test_data(with_private_annotations(test_df))

  difference, ratios, ratios_without_mask = evaluate_novelty(trained_model, masked_TRAIN_DATA,masked_TEST_DATA,masked_entities, TRAIN_DATA,TEST_DATA)
  if verbose:
    print('DIFFERENCES')
    print(difference)
    print('RATIOS WITH MASK')
    print(ratios)
    print('RATIOS WITHOUT MASK')
    print(ratios_without_mask)
  summary = {'difference': difference, 'ratios with mask':ratios,'ratios without mask': ratios_without_mask}
  return pd.DataFrame(data=summary)