In [1]:
import os
import json
from urllib.request import urlopen
from fairseq.models.roberta import RobertaModel
from statistics import median
import time
from torch.utils.data import DataLoader

In [2]:
# Directory expected to contain the MNLI checkpoint file `model.pt`.
ROBERTA_DIR = '../roberta.large.mnli'
assert os.path.exists(f'{ROBERTA_DIR}/model.pt'), f'missing checkpoint: {ROBERTA_DIR}/model.pt'
roberta = RobertaModel.from_pretrained(ROBERTA_DIR, checkpoint_file='model.pt')
try:
  roberta.cuda()
except Exception:
  # Fall back to CPU when the model cannot be moved to a GPU. `Exception`
  # (rather than a bare `except:`) lets KeyboardInterrupt and SystemExit
  # propagate instead of being reported as a missing-CUDA problem.
  print('cuda not supported in your platform')
roberta.eval()  # disable dropout (or leave in train mode to finetune)

def similarity(text, class_name):
  """Return the MNLI entailment probability that `text` is about `class_name`.

  The premise is `text`; the hypothesis is
  'this document is about <class_name lower-cased>'.

  Over-long inputs are shortened by trimming the *premise* only. Slicing the
  joint encoding (`tokens[:512]`) would cut the hypothesis off entirely for
  long texts, so every class would receive the same score.

  Args:
    text: document text used as the premise.
    class_name: candidate class label (lower-cased before use).

  Returns:
    float: softmax probability at index 2 of the 'mnli' head output, which
    this notebook treats as the entailment class.
  """
  # Local import: the file only imports `torch.utils.data.DataLoader`.
  import torch

  max_tokens = 512  # same length limit the previous slice enforced
  hypothesis = f'this document is about {class_name.lower()}'
  tokens = roberta.encode(text, hypothesis)

  if len(tokens) > max_tokens:
    # Per fairseq's RoBERTa hub interface a sentence pair is encoded as
    # `<s> premise </s> </s> hypothesis </s>`, while a single sentence is
    # `<s> hypothesis </s>`. The tail to preserve (`</s> </s> hypothesis </s>`)
    # is therefore one token longer than the stand-alone hypothesis encoding.
    tail_length = len(roberta.encode(hypothesis)) + 1
    if tail_length < max_tokens:
      head_length = max_tokens - tail_length
      tokens = torch.cat([tokens[:head_length], tokens[-tail_length:]])
    else:
      # Hypothesis alone does not fit: fall back to plain truncation.
      tokens = tokens[:max_tokens]

  # Inference only: no autograd graph is needed for a score returned as a float.
  with torch.no_grad():
    logits = roberta.predict('mnli', tokens, return_logits=True)
  probabilities = logits.softmax(dim=-1).tolist()[0]
  entailment_probability = probabilities[2]
  return entailment_probability
    

cuda not supported in your platform


In [3]:
# Shared memo of similarity scores, keyed first by document text and then by
# class name: caches[text][class_name] -> score.
caches = {}

class Node:
  """A class in the taxonomy tree, linked to its parent node.

  Child nodes are not stored; they are rebuilt from `dic` on every call to
  `children()`.
  """

  def __init__(self, name, dic, parent, depth):
    """Create a node.

    Args:
      name: label of this class.
      dic: mapping of child name -> child subtree (same structure, recursively).
      parent: parent `Node`, or None for the root.
      depth: distance from the root (the root is created with depth 0).
    """
    self.name = name
    self.dic = dic
    self.parent = parent
    self.depth = depth
    # Not read by any method of this class; kept so the attribute still exists.
    self.cache = dict()

  def full_path(self):
    """Return the names from the root down to this node as a list.

    A real list (not a `reversed` iterator) is returned so the path can be
    iterated more than once and serialised with `json.dumps`.
    """
    names = [self.name]
    cursor = self.parent
    while cursor is not None:
      names.append(cursor.name)
      cursor = cursor.parent
    names.reverse()
    return names

  def children(self):
    """Build a fresh list of child nodes, one per entry of `dic`."""
    return [Node(k, v, self, self.depth + 1) for k, v in self.dic.items()]

  def selected_children(self, doc):
    """Return the `depth + 2` children most similar to `doc`, best first."""
    ranked = sorted(self.children(), key=lambda c: c.similarity(doc), reverse=True)
    return ranked[:(self.depth + 2)]

  def similarity(self, text):
    """Return the similarity of `text` to this node's name, memoised in `caches`.

    The memo is keyed by text and node name only, so nodes sharing a name
    share a score.
    """
    cache = caches.setdefault(text, {})
    if self.name not in cache:
      cache[self.name] = similarity(text, self.name)
    return cache[self.name]

  def path_score(self, doc):
    """Return the product of similarities along the path; the root contributes 1."""
    if self.parent is None:
      return 1

    return self.parent.path_score(doc) * self.similarity(doc)

  def confidence(self, text):
    """Return this node's similarity minus the best competitor's similarity.

    Competitors are the parent and the node's siblings. The node itself is
    excluded: `parent.children()` contains a node with this name, and leaving
    it in would cap the result at 0.

    Raises:
      AttributeError: when called on the root (its parent is None).
    """
    siblings = [n for n in self.parent.children() if n.name != self.name]
    competitors = [self.parent] + siblings
    return self.similarity(text) - max(n.similarity(text) for n in competitors)

  def confidence_threshold(self, all_documents):
    """Return the median confidence over the documents tagged with this node's name.

    Raises:
      statistics.StatisticsError: when no document is tagged with this name.
    """
    return median([self.confidence(doc.text) for doc in all_documents if doc.tagged_with(self.name)])


def flatten(list_of_lists):
  """Concatenate the inner sequences into one new list, preserving order."""
  flat = []
  for inner in list_of_lists:
    flat.extend(inner)
  return flat

def aggregate_children(children_list, doc):
  """Pool per-parent child lists and keep the best-scoring nodes.

  The pooled nodes are ordered by `path_score(doc)`, highest first, and cut
  to `(depth + 1) ** 2` entries, where `depth` is read from the first pooled
  node. An empty pool is returned as is.
  """
  pooled = flatten(children_list)
  if not pooled:
    return pooled

  limit = (pooled[0].depth + 1) ** 2
  ranked = sorted(pooled, key=lambda node: node.path_score(doc), reverse=True)
  return ranked[:limit]

def deeper_nodes(nodes, doc):
  """Return the aggregated selection of children one level below `nodes`."""
  per_parent = []
  for parent in nodes:
    per_parent.append(parent.selected_children(doc))
  return aggregate_children(per_parent, doc)

def get_candidates(doc, tree):
  """Collect candidate nodes for `doc`, level by level, from a root over `tree`.

  Starts from the root's selected children and keeps descending with
  `deeper_nodes` until a level comes back empty. The root itself is never
  part of the result.
  """
  frontier = Node('root', tree, None, 0).selected_children(doc)
  collected = []

  while frontier:
    collected.extend(frontier)
    frontier = deeper_nodes(frontier, doc)

  return collected

class Doc:
  """A review together with the candidate taxonomy nodes selected for its text."""

  def __init__(self, review, tree):
    """Build the document and compute its candidates.

    Args:
      review: mapping that has a 'reviewText' entry; kept as `self.review`.
      tree: taxonomy passed on to `get_candidates`.
    """
    text = review['reviewText']
    self.text = text
    self.review = review
    self.candidates = get_candidates(text, tree)
    # Names of all candidates, for the membership test in tagged_with().
    self.class_names = {n.name for n in self.candidates}

  def tagged_with(self, name):
    """Return True when one of this document's candidates is called `name`."""
    return name in self.class_names

  def core_classes(self, all_documents):
    """Return the full paths of candidates whose confidence meets their threshold.

    Each path is materialised with `list(...)` so the result is a plain list
    of lists: it can be passed to `json.dumps` and read repeatedly, even if
    `full_path()` hands back a one-shot iterator.

    Args:
      all_documents: documents over which each node's threshold is computed.
    """
    return [
      list(n.full_path())
      for n in self.candidates
      if n.name and n.confidence(self.text) >= n.confidence_threshold(all_documents)
    ]


In [4]:
# Directory holding the review JSONL files and taxonomy.json that this
# notebook reads, and where the annotated output file is written.
AMAZON_DATA_DIR = '../data/amazon'

def get_reviews(filename = 'train-1000.jsonl'):
  """Yield the non-empty lines of a review file, stripped of surrounding whitespace.

  Blank lines (such as a trailing newline at the end of the file) are
  skipped; yielding them as '' would make the later `json.loads` call fail.
  The file is read as UTF-8 so the result does not depend on the platform's
  default encoding.

  Args:
    filename: file name inside `AMAZON_DATA_DIR`.
  """
  with open(f'{AMAZON_DATA_DIR}/{filename}', encoding='utf-8') as reviewsFile:
    for line in reviewsFile:
      stripped = line.strip()
      if stripped:
        yield stripped

def get_documents(reviews):
  """Yield one `Doc` per review, in order, printing progress and timing.

  The reviews are iterated directly instead of through a `DataLoader`:
  - the loader only handed the raw strings back, while the expensive
    `Doc(...)` construction ran in this process anyway;
  - `int(len(reviews) / 8)` is 0 for fewer than 8 reviews, which is not a
    valid batch size;
  - the per-batch `enumerate` restarted the progress counter on every batch.

  Args:
    reviews: sized sequence of JSON strings, one review each.

  Yields:
    Doc: the document built from each review.
  """
  # Load the taxonomy up front so the file is closed before generation starts.
  with open(f'{AMAZON_DATA_DIR}/taxonomy.json', encoding='utf-8') as f:
    tree = json.load(f)

  total = len(reviews)
  for index, r in enumerate(reviews):
    start = time.time()
    document = Doc(json.loads(r), tree)
    end = time.time()
    # Report before yielding so the timing covers only the Doc construction,
    # not whatever the consumer does before asking for the next item.
    print(f'{index + 1} out of {total} complete taking {end - start} seconds')
    yield document

if __name__ == '__main__':
  # Work on the first ten reviews only.
  reviews = [line for line in get_reviews()][:10]
  all_documents = [*get_documents(reviews)]

  # Annotate each review with its core classes and write one JSON object per line.
  with open(f'{AMAZON_DATA_DIR}/train-with-core-class-1000.jsonl', 'w') as f:
    for doc in all_documents:
      doc.review['core_classes'] = doc.core_classes(all_documents)
      serialized = json.dumps(doc.review)
      f.write(f'{serialized}\n')


1 out of 10 complete taking 31.082374811172485 seconds
