In [9]:
# Libraries 

import spacy
from nltk import sent_tokenize

import pickle
import pandas as pd
from tqdm.notebook import tqdm

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [13]:
# Functions 

def extract_triples(sentence, nlp1, visualize=False):
    """Parse ``sentence`` with ``nlp1`` and collect the labelled spans it finds.

    Parameters
    ----------
    sentence
        Text handed to the pipeline as ``nlp1(sentence)``.
    nlp1
        Callable whose result exposes ``.ents``; every entity must provide
        ``.text`` and ``.label_``.
    visualize : bool, default False
        When truthy, the parsed document is rendered inline with displaCy
        (entity style) using one fixed colour per label
        ('subj', 'obj', 'rel', 'mod').

    Returns
    -------
    dict
        ``{entity text: entity label}`` in the order the entities appear in
        ``.ents``. The dict is keyed by text, so if the same text occurs more
        than once only the label of its last occurrence is kept.
    """
    parsed = nlp1(sentence)

    if visualize:
        render_options = {
            "ents": ['subj', 'obj', 'rel', 'mod'],
            "colors": {'subj': "#59ba41", 'rel': "#85C1E1", 'obj': '#e98686', 'mod': '#e7e705'},
        }
        spacy.displacy.render(parsed, style="ent", jupyter=True, options=render_options)

    # Later entities overwrite earlier ones that share the same text.
    label_by_text = {}
    for span in parsed.ents:
        label_by_text[span.text] = span.label_

    return label_by_text

def full_triple_parsing(sentence, nlp1):
    """Repeatedly extract (subj, rel, obj) triples, descending into the object text.

    The sentence is parsed with ``extract_triples``. While the parse contains
    all three labels 'subj', 'rel' and 'obj', the ``{label: text}`` mapping is
    recorded and the text labelled 'obj' becomes the next input to parse.

    Parameters
    ----------
    sentence
        Text to parse first.
    nlp1
        Pipeline forwarded unchanged to ``extract_triples``.

    Returns
    -------
    dict
        ``{'sentence': <original input>, 'annotations': [<label -> text>, ...]}``
        with one entry per level, outermost first. If a label occurs several
        times at one level, only the text of its last occurrence is kept.
    """
    required_tags = ('subj', 'rel', 'obj')

    first_sentence = sentence
    annotations = []

    while True:
        triples = extract_triples(sentence, nlp1, False)

        # Three distinct labels among the values already imply at least three
        # entries, so no separate length check is needed.
        found_tags = set(triples.values())
        if not all(tag in found_tags for tag in required_tags):
            break

        reverse_dict = {label: span_text for span_text, label in triples.items()}
        annotations.append(reverse_dict)

        next_sentence = reverse_dict['obj']
        # Stop when the object text is the input itself: parsing it again would
        # yield the same result and the loop would never terminate.
        if next_sentence == sentence:
            break
        sentence = next_sentence

    return {'sentence': first_sentence, 'annotations': annotations}

def tokenize_goldstein(examples):
    """Tokenize the ``"text"`` field of ``examples`` for the Goldstein model.

    Uses the notebook-level ``goldstein_tokenizer``, which must be defined
    before this function is called. The tokenizer is invoked with
    ``padding="max_length"`` and ``truncation=True``, and its result is
    returned unchanged.
    """
    tokenizer_kwargs = {"padding": "max_length", "truncation": True}
    return goldstein_tokenizer(examples["text"], **tokenizer_kwargs)

def predict_goldstein(text, trainer):
    """Return the raw Goldstein-model predictions for one or more texts.

    Parameters
    ----------
    text : str or list-like of str
        Text(s) to score. A single string is treated as a one-element batch.
    trainer
        Object whose ``predict(dataset)`` returns a 3-item result with the
        predictions first (the notebook passes a ``transformers.Trainer``).

    Returns
    -------
    The first item of ``trainer.predict(...)``, one row per input text. In the
    notebook's recorded run this is a float32 array of shape (1, 1) for one text.
    """
    # pandas cannot build a column from a bare scalar ("If using all scalar
    # values, you must pass an index"), so wrap a lone string in a list.
    if isinstance(text, str):
        text = [text]

    df = pd.DataFrame({'text': text})

    # preserve_index=False keeps the DataFrame index out of the dataset columns.
    dataset = Dataset.from_pandas(df, preserve_index=False)

    tokenized_datasets = dataset.map(tokenize_goldstein)

    # Only the predictions are needed; the other two items are discarded.
    raw_pred, _, _ = trainer.predict(tokenized_datasets)

    return raw_pred

In [None]:
# Models

# Uncomment to run the spaCy pipeline on the GPU:
# spacy.require_gpu()

# Load triple extraction model
triple_parser = spacy.load(r"models/triple_parsing_model/model-best")

# Load regression model
# Get tokenizer
goldstein_tokenizer = AutoTokenizer.from_pretrained('tokenizer2/')

# Get model (num_labels = 1 gives a single output value per text)
goldstein_model = AutoModelForSequenceClassification.from_pretrained('model2/', num_labels = 1)

# Trainer is used only as a convenient wrapper for running predictions.
goldstein_trainer = Trainer(model = goldstein_model)
# Misspelled legacy name, kept as an alias because later cells reference it.
golstein_trainer = goldstein_trainer

In [6]:
# Input your sample text
# NOTE(review): the literal begins with `:"` and ends with `"` inside the Python
# quotes, which looks like a copy-paste artifact. Both are part of the string
# passed to the models, so confirm whether they are intended before removing them.
text = ':"The concept and quality of the master plan makes this an exciting new addition, not only to the Eastern Province but also to the Kingdom of Saudi Arabia."'

# Test Triple Extraction Model

In [7]:
# Parse the sample text; visualize=True also renders the labelled spans inline
# with displaCy. `triples` holds the resulting {entity text: label} dict.
triples = extract_triples(text, triple_parser, visualize = True)

# Test Goldstein Regression Model

In [16]:
# Score the sample text. It is wrapped in a list because predict_goldstein builds
# a DataFrame column from its first argument. `golstein_trainer` (sic) is the
# Trainer created in the Models cell.
goldstein_score = predict_goldstein([text], golstein_trainer)
# Bare expression so the notebook displays the raw prediction array.
goldstein_score

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

array([[3.0098212]], dtype=float32)