# RoViST-C Demo

## Setup

In [None]:
# Install the third-party packages this demo needs before anything is imported
# (presumably required by rovist_c's ALBERT model/tokenizer — TODO confirm).
!pip install transformers
!pip install sentencepiece

Download rovist_c.py from here: [download link](https://drive.google.com/file/d/1ya9Ut3R_pXleyHQ9B7iIdkoHYYLCgvZb/view?usp=sharing) 

Download model checkpoint from here: [download link](https://drive.google.com/file/d/1-ATRk6AQyKGNDZHkqrKkpjiY6jfbK9NS/view?usp=sharing) 


In [None]:
import json
import os
import numpy as np
import pandas as pd 
import torch
import torch.nn.functional as F

import nltk
nltk.download('punkt')

from rovist_c import SOPClassifier, model_nm, tokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
We are using GPU.


Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/685 [00:00<?, ?B/s]

In [None]:
### Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

if use_cuda:
    print('We are using GPU.')
else:
    print('We are using CPU.')

We are using GPU.


## Calculate Coherence Score

In [None]:
def coherence_score(selected_stories, cand_stories, model, device):
    """Score each selected story's coherence from its adjacent sentence pairs.

    Each story is split into sentences with ``nltk.sent_tokenize``. Every pair
    of neighbouring sentences (A, B) is encoded as ``[CLS] A [SEP] B [SEP]``
    with the module-level ``tokenizer`` and passed to ``model``; the softmax
    probability at index 1 is used as the pair's score. Two identical
    neighbouring sentences (after stripping whitespace) score 0 without
    calling the model. A story's score is the mean of its pair scores.

    Args:
        selected_stories: iterable of keys to look up in ``cand_stories``.
        cand_stories: mapping from story key to the story text.
        model: callable invoked as ``model(token_ids, attention_ids,
            segment_ids)``; its output is soft-maxed over ``dim=1``.
        device: torch device the input tensors are moved to.

    Returns:
        A list with one score per entry of ``selected_stories``, in the same
        order. A story with fewer than two sentences has no sentence pair to
        score and yields ``nan`` (returned explicitly rather than via
        ``np.mean([])``, which would also emit a RuntimeWarning).

    Raises:
        KeyError: if a selected key is missing from ``cand_stories``.
    """
    coherence_scores = []

    for key in selected_stories:

        print("Evaluating {}".format(key))

        sentences = nltk.sent_tokenize(cand_stories[key])

        scores = []

        # Walk over neighbouring sentence pairs (s0, s1), (s1, s2), ...
        for sentA, sentB in zip(sentences, sentences[1:]):

            # repeated sentence --> automatically assign 0 score
            if sentA.strip() == sentB.strip():
                scores.append(0)
                continue

            sentA_len = len(tokenizer.tokenize(sentA))
            sentB_len = len(tokenizer.tokenize(sentB))

            input_text = ' '.join(['[CLS]', sentA, '[SEP]', sentB, '[SEP]'])
            tokenized_text = tokenizer.tokenize(input_text)

            # token ids
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            token_ids = torch.tensor([indexed_tokens])

            # segment ids: 0 for "[CLS] A [SEP]", 1 for "B [SEP]"
            # NOTE(review): the lengths come from tokenizing A and B on their
            # own and assume '[CLS]'/'[SEP]' each map to a single token in the
            # joined text — confirm against the tokenizer in rovist_c.
            segment_ids = (sentA_len + 2) * [0] + (sentB_len + 1) * [1]
            segment_ids = torch.tensor([segment_ids])

            # attention mask: every position is a real token (no padding)
            attention_ids = (sentA_len + 2) * [1] + (sentB_len + 1) * [1]
            attention_ids = torch.tensor([attention_ids])

            # move the inputs to the requested device (GPU when available)
            token_ids = token_ids.to(device)
            attention_ids = attention_ids.to(device)
            segment_ids = segment_ids.to(device)

            with torch.no_grad():
                sop_logits = model(token_ids, attention_ids, segment_ids)

            sop_probs = F.softmax(sop_logits, dim=1)

            # probability of being coherent is stored at index 1
            scores.append(sop_probs[0][1].item())

        if scores:
            coherence_scores.append(np.mean(scores))
        else:
            # Fewer than two sentences: coherence between sentences is undefined.
            coherence_scores.append(float("nan"))

    return coherence_scores

Set `model_checkpoint_path` to the location of the downloaded model checkpoint.

In [None]:
model_checkpoint_path = "/content/sop_model_epoch4.pth.tar"

# Fail fast: continuing without a checkpoint would leave `model` undefined
# (or silently reuse a stale one from an earlier run) in the cells below.
if not os.path.isfile(model_checkpoint_path):
    raise FileNotFoundError(
        "No checkpoint found at this path: {}".format(model_checkpoint_path)
    )

print("Loading pre-trained ALBERT model.")

# map_location remaps the stored tensors onto the device selected earlier, so
# a checkpoint saved on a GPU can also be loaded on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(model_checkpoint_path, map_location=device)

# The checkpoint stores the training options under "opt" and the weights under "model".
ckpt_opt = checkpoint["opt"]

model = SOPClassifier(ckpt_opt.hidden_dim, ckpt_opt.dropout_prob, model_nm)
model = model.to(device)  # move model to the selected device

model.load_state_dict(checkpoint["model"])
model.eval()  # inference mode

Loading pre-trained ALBERT model.


Downloading:   0%|          | 0.00/68.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-large-v1 were not used when initializing AlbertModel: ['predictions.dense.weight', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Set the `output_stories_path` variable to the path of the JSON file containing the machine-generated stories. The machine-generated stories from the 4 VST models used in the paper can be downloaded [here](https://drive.google.com/drive/folders/1xp-x14jn9UFhaxbQ9hNpzNqpzHp275dQ?usp=sharing).

To evaluate only a subset of stories, set the `selected_stories` variable to a comma-separated string of story IDs; set it to `None` to evaluate every story in the file.

In [None]:

output_stories_path = "/content/glacnet_stories.json"
selected_stories = '49800,45531,49990,48121,45840,45595,50195,49615,45536,46191,50150,45615,47165,50041,47835,47870,45715,46050,46005,47440'

# Read the stories through a context manager so the file handle is closed.
with open(output_stories_path) as stories_file:
    output_stories = json.load(stories_file)

# Normalise `selected_stories` to a list of story IDs:
#   - None / empty      -> evaluate all stories
#   - "id1,id2,..."     -> split on commas, ignoring stray whitespace and empty entries
#   - any other iterable -> taken as the IDs themselves
if not selected_stories:
    selected_stories = list(output_stories.keys())
elif isinstance(selected_stories, str):
    selected_stories = [
        story_id.strip()
        for story_id in selected_stories.split(',')
        if story_id.strip()
    ]
else:
    selected_stories = list(selected_stories)

### compute coherence scores
final_scores = coherence_score(selected_stories, output_stories, model, device)

### save final scores to dataframe
df = pd.DataFrame({
    "story_id": selected_stories,
    "coherence_score": final_scores,
})
df.to_csv("coherence_scores.csv", index = False)

### system level score
print("Coherence Score: {}".format(df["coherence_score"].mean()))

Evaluating 49800
Evaluating 45531
Evaluating 49990
Evaluating 48121
Evaluating 45840
Evaluating 45595
Evaluating 50195
Evaluating 49615
Evaluating 45536
Evaluating 46191
Evaluating 50150
Evaluating 45615
Evaluating 47165
Evaluating 50041
Evaluating 47835
Evaluating 47870
Evaluating 45715
Evaluating 46050
Evaluating 46005
Evaluating 47440
Coherence Score: 0.7278509503463283


# RoViST-NR Demo

In [None]:
def _overlap_ratio(tokens_a, tokens_b):
    """Return (# of equal token pairs across both lists) / (# of distinct tokens in both).

    Returns 0.0 when both lists are empty, since there is nothing to repeat
    and the denominator would otherwise be zero.
    """
    union = len(set(tokens_a + tokens_b))
    if union == 0:
        return 0.0

    matches = sum(1 for token_a in tokens_a for token_b in tokens_b if token_a == token_b)
    return matches / union


def jaccard_sim(selected_stories, cand_stories, ngram_len):
    """
    Calculate the non-redundancy score for each selected story.

    Each story is split into sentences (``nltk.sent_tokenize``) and each
    sentence into word tokens (``nltk.word_tokenize``) with the last token
    dropped (it is assumed to be the closing punctuation mark). Two repetition
    measures are then averaged:

    * inter-sentence: the overlap ratio of every sentence with each sentence
      that precedes it;
    * intra-sentence: the overlap ratio of each consecutive pair of
      ``ngram_len``-token slices within a sentence.

    A measure with nothing to compare (a single-sentence story, or sentences
    shorter than two slices) contributes 0 repetition. The story score is
    ``1 - mean([inter_mean, intra_mean])``.

    NOTE(review): the numerator counts every equal (token, token) pair, so
    repeated words are counted more than once and a ratio can exceed 1; this
    is kept as-is so scores stay comparable with earlier runs.

    Args:
        selected_stories: iterable of keys to look up in ``cand_stories``.
        cand_stories: mapping from story key to the story text.
        ngram_len: slice length used for the intra-sentence comparison.

    Returns:
        A list with one score per entry of ``selected_stories``, in order.

    Raises:
        KeyError: if a selected key is missing from ``cand_stories``.
    """
    story_rep_scores = []

    for key in selected_stories:

        story = cand_stories[key]

        # don't include the punctuation at the end of each sentence.
        word_tokens = [
            nltk.word_tokenize(sent)[:-1] for sent in nltk.sent_tokenize(story)
        ]

        # inter-sentence repetition: each sentence against all earlier ones.
        inter_sentence_scores = [
            _overlap_ratio(word_tokens[i], word_tokens[j])
            for i in range(1, len(word_tokens))
            for j in range(i)
        ]

        # intra-sentence repetition: each slice against the slice that follows it.
        intra_sentence_scores = []

        for tokens in word_tokens:
            for start in range(0, len(tokens), ngram_len):
                prev_slice = tokens[start:start + ngram_len]
                next_slice = tokens[start + ngram_len:start + 2 * ngram_len]

                # the last slice of a sentence has no successor to compare with
                if not next_slice:
                    continue

                intra_sentence_scores.append(_overlap_ratio(prev_slice, next_slice))

        # An empty list means there was nothing that could repeat -> 0.
        inter_mean = np.mean(inter_sentence_scores) if inter_sentence_scores else 0
        intra_mean = np.mean(intra_sentence_scores) if intra_sentence_scores else 0

        story_rep_scores.append(1 - np.mean([inter_mean, intra_mean]))

    return story_rep_scores

Set the `output_stories_path` variable to the path of the JSON file containing the machine-generated stories. The machine-generated stories from the 4 VST models used in the paper can be downloaded [here](https://drive.google.com/drive/folders/1xp-x14jn9UFhaxbQ9hNpzNqpzHp275dQ?usp=sharing).

To evaluate only a subset of stories, set the `selected_stories` variable to a comma-separated string of story IDs; set it to `None` to evaluate every story in the file.

In [None]:
output_stories_path = "/content/ke_stories.json"
selected_stories = '49800,45531,49990,48121,45840,45595,50195,49615,45536,46191,50150,45615,47165,50041,47835,47870,45715,46050,46005,47440'

# Read the stories through a context manager so the file handle is closed.
with open(output_stories_path) as stories_file:
    output_stories = json.load(stories_file)

# Slice length used for the intra-sentence repetition check.
intra_sent_ngram_len = 4

# Normalise `selected_stories` to a list of story IDs:
#   - None / empty      -> evaluate all stories
#   - "id1,id2,..."     -> split on commas, ignoring stray whitespace and empty entries
#   - any other iterable -> taken as the IDs themselves
if not selected_stories:
    selected_stories = list(output_stories.keys())
elif isinstance(selected_stories, str):
    selected_stories = [
        story_id.strip()
        for story_id in selected_stories.split(',')
        if story_id.strip()
    ]
else:
    selected_stories = list(selected_stories)

final_scores = jaccard_sim(selected_stories, output_stories, intra_sent_ngram_len)

### save final scores to dataframe
df = pd.DataFrame({
    "story_id": selected_stories,
    "non_redun_score": final_scores,
})
df.to_csv("non_redun_scores.csv", index = False)

### system level score
print("Non Redundancy Score: {}".format(df["non_redun_score"].mean()))

Non Redundancy Score: 0.9993928571428572
