reference: https://github.com/CornellNLP/ConvoKit/blob/master/examples/converting_movie_corpus.ipynb

In [60]:
from tqdm import tqdm
from convokit import Corpus, Speaker, Utterance
from collections import defaultdict
import convokit 
import json 

In [61]:
corpus_speakers = {
    "p": Speaker(id="speaker1", meta={}), 
    "r": Speaker(id="speaker2", meta={})
}

In [62]:
with open("spolin-train.json") as f: 
    train_data = json.load(f) 
    
with open("spolin-valid.json") as f: 
    valid_data = json.load(f) 

In [72]:
train_data['yesands'].keys()

dict_keys(['spont', 'cornell', 'subtle'])

In [64]:
def get_utterances(spolin_dict, split="train"): 
    utterance_corpus = {}
    ct = 0 
    for label, dict_ in spolin_dict.items(): 
        for source, pairs in dict_.items(): 
            for pair in pairs: 
                for turn, text in pair.items(): 
                    if turn not in ["p", "r"]: 
                        continue 
                    idx = f"{split}_{ct}_{turn}"
                    conversation_id = f"{split}_{ct}"
                    meta = {"split": split, "label": 1 if label=="yesands" else 0, "source": source} 
                    utterance_corpus[idx] = Utterance(
                        id=idx, 
                        conversation_id=conversation_id,
                        speaker=corpus_speakers[turn],
                        text=pair[turn], 
                        meta=meta
                    )
                    if turn == 'p': 
                        utterance_corpus[idx].reply_to=None 
                    elif turn == 'r': 
                        utterance_corpus[idx].reply_to=idx[:-1] + "p" 
                ct += 1 
    return utterance_corpus

In [65]:
all_utterances = get_utterances(train_data)
all_utterances.update(get_utterances(valid_data, split="valid"))

In [66]:
print("Total number of utterances = {}".format(len(all_utterances)))

Total number of utterances = 225194


In [67]:
utterance_list = all_utterances.values()

In [68]:
spolin_corpus = Corpus(utterances=utterance_list)

In [69]:
print("number of conversations in the dataset = {}".format(len(spolin_corpus.get_conversation_ids())))

number of conversations in the dataset = 112597


In [70]:
spolin_corpus.meta.update({
    "name": "spolin", 
    "brief description": "Selected Pairs of Learnable ImprovisatioN (SPOLIN) is a collection of more than 68,000 \"Yes, and\" type dialogue pairs extracted from the Spontaneanation podcast by Paul F. Tompkins, the Cornell Movie-Dialogs Corpus, and the SubTle corpus.",
    "authors": "Hyundong Justin Cho, Jonathan May", 
    "poc_email": "jcho@isi.edu", 
    "github_url": "https://github.com/wise-east/spolin",
    "publication_title": "Grounding Conversations with Improvised Dialogues", 
    "publication_venue": "ACL2020", 
    "publication_url": "https://aclanthology.org/2020.acl-main.218/",
    "license": "Creative Commons Attribution-NonCommercial 4.0 International License", 
})

In [77]:
spolin_corpus.dump("spolin_corpus")

In [76]:
corpus=Corpus(convokit.download("spolin_corpus"))

Downloading spolin_corpus to /Users/jcho/.convokit/downloads/spolin_corpus


KeyError: 'spolin_corpus'