## Notebook Overview

Explore dataset from [RELiC](https://relic.cs.umass.edu/), a novel information retrieval task.

### Extract Descriptive Passages

In [11]:
import json
import pandas as pd

# POS
import spacy

# nltk for wordnet and tokenization
import nltk
from nltk.corpus import wordnet as wn
from nltk import sent_tokenize
from nltk import word_tokenize

In [129]:
pd.set_option('display.max_colwidth', None)

In [3]:
'''
Read in .json of RELiC data as a nested dict.
Data structured as follows:

{
    book_n: {
        quotes: {
            quote_id: [
                [quote_n_left] # 4 sentences, left 'context' of critical claim
                sentence_id # index of the corresponding sentence in `sentences`
                sentence_window_size # number from 1-5 =to how many original sentences inhere the claim
                [quote_n_right] # 4 sentences, right 'context' of critical claim
            ], ...
        },
        sentences: [sentence_1, sentence_2], # all the sentences in the work
        candidates: { # tracks which sentences are eligible to be 'expanded' e.g. if sentence #7 is used in a claim that's 3 sentences long, we should retrieve sentences 7,8,9.
            1_sentence: [all sentence_id's],
            ...,
            5-sentence: [all but last 4 sentence_ids (prevent OOB error)], 
        }
    },
}
Return a dict containing requisite fields
'''
def read_data(filename):
    
    f = open(filename)
    data = json.load(f)
    
    return data
    

In [5]:
train_data = read_data('data/relic-train.json')

In [4]:
val_data = read_data('data/relic-val.json')

In [8]:
relic_data = z = {**train_data, **val_data}

In [9]:
# what books are in the dataset:
for i in enumerate(relic_data):
    print(i)

(0, 'brothers_karamazov')
(1, 'to_the_lighthouse')
(2, 'the_pickwick_papers')
(3, 'david_copperfield')
(4, 'animal_farm')
(5, 'the_scarlet_letter')
(6, 'a_portrait_of_the_artist_as_a_young_man')
(7, 'the_turn_of_the_screw')
(8, 'the_souls_of_black_folk')
(9, 'adam_bede')
(10, 'sense_and_sensibility')
(11, 'martin_chuzzlewit')
(12, 'swanns_way')
(13, 'sister_carrie')
(14, 'daisy_miller')
(15, 'o_pioneers')
(16, 'the_red_badge_of_courage')
(17, 'little_dorrit')
(18, 'great_expectations')
(19, 'the_call_of_the_wild')
(20, 'mrs_dalloway')
(21, 'the_sport_of_the_gods')
(22, 'middlemarch')
(23, 'alices_adventures_in_wonderland')
(24, 'jacobs_room')
(25, '1984')
(26, 'house_of_mirth')
(27, 'nicholas_nickleby')
(28, 'moby_dick')
(29, 'oliver_twist')
(30, 'jane_eyre')
(31, 'this_side_of_paradise')
(32, 'madame_bovary')
(33, 'iola_leroy')
(34, 'frankenstein')
(35, 'the_age_of_innocence')
(36, 'lady_chatterlys_lover')
(37, 'maggie_a_girl_of_the_streets')
(38, 'wuthering_heights')
(39, 'the_ambass

In [66]:
'''
Helper for determining if a list of sentences contains dialogue offsets (quote marks, -'s for Joyce, single-word sentences, etc.')
'''
def dialogue_helper(sentences):
    return [d for d in sentences 
            if '"' in d or '\"' in d 
            or '“' in d or '”' in d 
            or d.startswith('-') 
            or d.strip().startswith('-') 
            or len(d.strip().split()) == 1] != []

In [82]:
'''
for a given work in the dataset, extract passages deemed by critics to be "descriptive"
based on a 'search_list' containing substrings to look for in a given passage
this method excludes passages that contain dialogue
this pessimistic sanitization should result in more readily 'descriptive' passages
return: a cleaned version of the sentence
'''
def extract_passages(data, book_title, search_list, truncate_claims, exclude_dialogue):
    # get associated data for a single title
    book_data = data[book_title]
    
    quotes = book_data["quotes"]
    sentences = book_data["sentences"]
    
    descriptive_sentences = {}
    sentence_ids = set()
    descriptive_count = 0
    
    for quote_id in quotes:
        sentence_id = quotes[quote_id][1]
        quote_size = quotes[quote_id][2]
        
        # ensure the quoted passage has not already been recovered
        # if part of it has already been recovered, that's okay, it will just be attached 'principally' to only 1 claim
        # this just helps us track how many fragments total are looked at by critics
        if sentence_id in sentence_ids:
            continue
        else:
            sentence_ids.add(sentence_id)
            if quote_size == 1:
                sentence_ids.add(sentence_id + 1)
            elif quote_size == 2:
                sentence_ids.add(sentence_id + 1)
                sentence_ids.add(sentence_id + 2)
            elif quote_size == 3:
                sentence_ids.add(sentence_id + 1)
                sentence_ids.add(sentence_id + 2)
                sentence_ids.add(sentence_id + 3)
            elif quote_size == 4:
                sentence_ids.add(sentence_id + 1)
                sentence_ids.add(sentence_id + 2)
                sentence_ids.add(sentence_id + 3)
                sentence_ids.add(sentence_id + 4)
            else:
                sentence_ids.add(sentence_id + 1)
                sentence_ids.add(sentence_id + 2)
                sentence_ids.add(sentence_id + 3)
                sentence_ids.add(sentence_id + 4)
                sentence_ids.add(sentence_id + 5)
        
        # like the paper, use windows of either full claim
        left_claim = ' '.join(quotes[quote_id][0]).lower()
        right_claim = ' '.join(quotes[quote_id][3]).lower()
        
        if left_claim is None or right_claim is None or left_claim == "" or right_claim == "":
            continue
        
        # or, if desired, 1 sentence on either side of the quote
        if truncate_claims:
            left_claim = sent_tokenize(left_claim)[-1]
            right_claim = sent_tokenize(right_claim)[0]
                
        # look for match on our phrases of interest
        descriptive = any(substring in left_claim for substring in search_list) or any(substring in right_claim for substring in search_list)
        
        if descriptive:
            # fix encoding issues with json conversion
            descriptive_sentence_raw = sentences[sentence_id: sentence_id+quote_size]
            descriptive_sentence = []
            for d in descriptive_sentence_raw:
                bytes_string = bytes(d, encoding="raw_unicode_escape")
                descriptive_sentence.append(bytes_string.decode("ISO-8859-1", "strict"))
            
            # ensure the quote does not contain any dialogue, and is at least longer than 1 word
            remove = False
            if exclude_dialogue:
                # check descriptive sentence
                if dialogue_helper(descriptive_sentence):
                    remove = True
            if remove:
                continue

            # strip extra whitespace and re-join multi-"sentence" passages that were split on ";", ":", "..."
            descriptive_sentence = ' '.join([x.strip() for x in descriptive_sentence])

            # track the critical claim connected to a given quote
            descriptive_sentences[str(quote_id)] = descriptive_sentence
            
            descriptive_count +=1
                
    print(f'Extracted {descriptive_count} out of {len(sentence_ids)} analyzed fragments in {book_title}.')
    return descriptive_sentences

In [84]:
descriptive_sentences = extract_passages(relic_data,
                                         'mansfield_park',
                                         ['descri', 'detail', 'zoom', 'poet', 'lyric', 'vivid', 'imag'],
                                        truncate_claims = True,
                                        exclude_dialogue = True)


Extracted 34 out of 1509 analyzed fragments in mansfield_park.


In [86]:
# fill up dict with passages, grouped by book
descriptive_passages = {}
for i in enumerate(relic_data):
    print(i[1])
    descriptive_passages[i[1]] = extract_passages(relic_data, i[1], ['descri', 'detail', 'zoom', 'poet', 'lyric', 'vivid', 'imag'], truncate_claims = False, exclude_dialogue = True)

brothers_karamazov
Extracted 136 out of 3082 analyzed fragments in brothers_karamazov.
to_the_lighthouse
Extracted 220 out of 2182 analyzed fragments in to_the_lighthouse.
the_pickwick_papers
Extracted 127 out of 1750 analyzed fragments in the_pickwick_papers.
david_copperfield
Extracted 335 out of 3291 analyzed fragments in david_copperfield.
animal_farm
Extracted 32 out of 481 analyzed fragments in animal_farm.
the_scarlet_letter
Extracted 192 out of 1801 analyzed fragments in the_scarlet_letter.
a_portrait_of_the_artist_as_a_young_man
Extracted 421 out of 2881 analyzed fragments in a_portrait_of_the_artist_as_a_young_man.
the_turn_of_the_screw
Extracted 82 out of 1383 analyzed fragments in the_turn_of_the_screw.
the_souls_of_black_folk
Extracted 51 out of 702 analyzed fragments in the_souls_of_black_folk.
adam_bede
Extracted 86 out of 1052 analyzed fragments in adam_bede.
sense_and_sensibility
Extracted 64 out of 1411 analyzed fragments in sense_and_sensibility.
martin_chuzzlewit
Ex

In [72]:
# what are the descriptive passages in the scarlet letter?
descriptive_passages['the_scarlet_letter']

{'006282': 'My imagination was a tarnished mirror. It would not reflect, or only with miserable dimness, the figures with which I did my best to people it. The characters of the narrative would not be warmed and rendered malleable by any heat that I could kindle at my intellectual forge.',
 '006288': "All this enmity and passion had Pearl inherited, by inalienable right, out of Hester's heart.",
 '006293': 'and neither of them was of one whit more avail than the twinkle of a tallow-candle. An entire class of susceptibilities, and a gift connected with them,-of no great richness or value, but the best I had,-was gone from me.',
 '006295': 'white-headed and wrinkled images, which my fancy used to sport with, and has now flung aside forever.',
 '006300': 'Certainly, there was some deep meaning in it, most worthy of interpretation, and which, as it were, streamed forth from the mystic symbol, subtly communicating itself to my sensibilities, but evading the analysis of my mind.',
 '006301':

In [120]:
'''
create an intermediate results dataframe containing the relevant, extracted sentences along with supporting metadata
'''
def make_results_df(descriptive_passages, full_data, search_list, truncate_claims):
    
    books = []
    claim_ids = []
    left_claims = []
    left_claim_keywords = []
    right_claims = []
    right_claim_keywords = []
    passage_ids = []
    passages = []
    passage_sizes = []
    
    for book, data in descriptive_passages.items():
        for k, v in data.items():
            block = full_data[book]['quotes'][k]
            # title of book
            books.append(book)
            # claim id, linking critical claim to quoted passage
            claim_ids.append(k)
            # critical claim
            left_claim = ' '.join(block[0]).lower()
            right_claim = ' '.join(block[3]).lower()
            if truncate_claims:
                left_claim = sent_tokenize(left_claim)[-1]
                right_claim = sent_tokenize(right_claim)[0]
            
            # track claim
            left_claims.append(left_claim)
            right_claims.append(right_claim)
            
            # track which claim contains which keywords
            left_claim_keywords_temp = []
            right_claim_keywords_temp = []
                    
            for w in word_tokenize(left_claim):
                for term in search_list:
                    if term in w:
                        left_claim_keywords_temp.append(w)
                        
            for w in word_tokenize(right_claim):
                for term in search_list:
                    if term in w:
                        right_claim_keywords_temp.append(w)
            
            # trtoreack keywords
            left_claim_keywords.append(left_claim_keywords_temp)
            right_claim_keywords.append(right_claim_keywords_temp)
            
            # quoted passage
            passages.append(v)
            
            # 'starting' sentence id of passage
            passage_ids.append(block[1])
            
            # 'window size' of passage (based on authors' tokenization scheme)
            passage_sizes.append(block[2])
    
    return pd.DataFrame({'passage': passages, 'book': books, 'left_claim': left_claims, 'left_claim_keywords': left_claim_keywords, 'right_claim': right_claims, 'right_claim_keywords': right_claim_keywords, 'claim_id': claim_ids, 'passage_id': passage_ids, 'passage_size': passage_sizes})


In [121]:
df = make_results_df(descriptive_passages, relic_data, ['descri', 'detail', 'zoom', 'poet', 'lyric', 'vivid', 'imag'], truncate_claims=False)

In [122]:
df.shape

(8838, 9)

In [131]:
df[df['book'] == 'emma'].head()

Unnamed: 0,passage,book,left_claim,left_claim_keywords,right_claim,right_claim_keywords,claim_id,passage_id,passage_size
7835,"Her views of improving her little friend's mind, by a great deal of useful reading and conversation, had never yet led to more than a few first chapters, and the intention of going on to-morrow. It was much easier to chat than to study; much pleasanter to let her imagination range and work at Harriet's fortune, than to be labouring to enlarge her comprehension or exercise it on sober facts; and the only literary pursuit which engaged Harriet at present, the only mental provision she was making for the evening of life, was the collecting and transcribing all the riddles of every sort that she could meet with, into a thin quarto of hot-pressed paper, made up by her friend, and ornamented with ciphers and trophies.",emma,"the list she drew up when only fourteen i remember thinking it did her judgement so much credit, that i preserved it some time; and i dare say she may have made out a very good list now. but i have done with expecting any course of steady reading from emma.' (p.37) knightley's assertions are given a substantial confirmation a little later, when we see more of emma and harriet.",[],"once again, this contradicts the notion of a strong and sustained literary influence on emma, and it directs us rather to the people and events of the novel, the difficulties they pose in the matter of imagining and understanding. for an austen heroine, emma also makes remarkably few literary allusions. apart from a reference to elegant extracts, as the source for a riddle, there are only three made by her. what is more, two of them tend to confirm her habit of not getting beyond first chapters.",[imagining],62956,1383,4
7836,"Let no name ever pass our lips. We were very wrong before; we will be cautious now.-He is your superior, no doubt, and there do seem objections and obstacles of a very serious nature;",emma,"how could you imagine such conduct practicable? paradoxically, although austen is sensitive to the idea that it is not always 'practicable' to be forthright, especially where there is an imbalance of power, she also exploits the tragi-comic possibilities of social decorum that proscribes circumlocution. emma's final and most painful misunderstanding occurs precisely because of social equivocations, which lead her to believe that harriet is in love with frank churchill rather than mr knightley. emma, resolving to herself that 'plain dealing was always best', encourages harriet to confess her new love, but adds an important codicil:",[imagine],"the misunderstandings persist as the women, with due propriety, agree upon the superior merits of the 'gentleman' in question for rendering harriet an elusive 'service': i am not at all surprised at you, harriet. the service he rendered you was enough to warm your heart.' '\nservice!",[],62959,7029,3
7837,She could now look forward to giving him that full and perfect confidence which her disposition was most ready to welcome as a duty.,emma,"on the one hand, mr. knightley is acting as a true friend, doing what is good for emma for her own sake, even at the cost of unpleasantness for both her and himself. at the same time, he wants to avoid the remembrance of neglect; he is concerned about what she will feel for him in the future. that marriage imposes a sense of duty on friendship-and that, by making clear what is owed by each person, it acknowledges that each person desires something from the other-is not shown to be a burden on love but rather a support.",[],"it is not onerous to her to feel that she owes him something (we have seen elsewhere her ability to be grateful). aristotle may help us understand austen's suggestion that issues of justice (such as the duties and legal agreement of marriage) are not an affront to love, as for example when he argues that when people have nothing in common ''there is no friendship, as there is nothing just. ""'26\nthe love austen describes can be fulfilled in marriage in part because it is based on real commonality or suitability. marital vows simply will make explicit the aspect of love that austen has always stressed-not the feeling of affection per se, but the act of choosing a friend for life.",[describes],62963,9631,1
7838,a man does not imagine any such thing.,emma,thus: emma: a man always imagines a woman to be ready for anybody who asks her. mr knightley:,[imagines],"or again, of emma's friend harriet smith. emma: the sphere in which she moves is much above his. it would be a degradation.",[],62967,1202,1
7839,"of her being taught only what very limited means could command, and growing up with no advantages of connexion or improvement, to be engrafted on what nature had given her in a pleasing person, good understanding, and warm-hearted, well-meaning relations.",emma,"both are rich, spoiled, supremely imaginative, charming, good-looking, entirely unrestrained in their games, and potentially harmful to people with less power than themselves. both know intuitively that, whatever they do, they will not come out losers. the only lives that frank and emma can ruin are other people's. they themselves are blessed with every advantage, including a happy disposition and a never-failing ability to be pleased with themselves.",[imaginative],"like elizabeth bennet, jane must endure the constant company of well-meaning but socially despised and ludicrous relatives. like fanny price, she must endure silently while a flirt of high social standing plays with the man she loves. she also displays the diffidence necessary to her social position, a reserve that makes her unattractive to so many readers. however, all of austen's heroines display this diffidence, reserve, and iron self-control when they find themselves in a position of dependence or powerlessness, as well as when they are with people with whom they have nothing in common.",[],62972,3139,1


In [132]:
# grab specific cols for Emma
df[df['book'] == 'emma'][['passage', 'left_claim', 'left_claim_keywords', 'right_claim', 'right_claim_keywords']]['passage'][:10]

7835    Her views of improving her little friend's mind, by a great deal of useful reading and conversation, had never yet led to more than a few first chapters, and the intention of going on to-morrow. It was much easier to chat than to study; much pleasanter to let her imagination range and work at Harriet's fortune, than to be labouring to enlarge her comprehension or exercise it on sober facts; and the only literary pursuit which engaged Harriet at present, the only mental provision she was making for the evening of life, was the collecting and transcribing all the riddles of every sort that she could meet with, into a thin quarto of hot-pressed paper, made up by her friend, and ornamented with ciphers and trophies.
7836                                                                                                                                                                                                                                                                          

### Analyze Critical Claims

* what keywords
* what types of passages
* what are they saying
* which passages are selected repeatedly
* where in works are passages
* do claims require 'context' per se

### Analyze Descriptive Passages

* parts of speech based
* time series
* topic modeling
* neural

#### Parts of Speech Based Analysis

In [148]:
nlp = spacy.load('en_core_web_sm', disable=['ner,parser'])
nlp.remove_pipe('ner')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f8ee153a0a0>)

In [149]:
#Function from http://www.nltk.org/howto/wordnet.html to get *all* of a synset's hyponym/hypernyms
hyper = lambda s: s.hypernyms()

In [202]:
#Define function to count the number of hypernyms for each noun and verb
def specificity_nelson(x):
    x = x.replace('[\x00-\x1f]'," ")
    text = word_tokenize(x)
    total_list = []
    for w in text:
        if not wn.synsets(w):
            pass
        else:
            synset = wn.synsets(w)
            #limit to nouns and verbs, as other words are not arranged hierarchically
            if ((synset[0].pos() == (wn.NOUN)) or (synset[0].pos() == (wn.VERB))):
                #I assume the most popular definition of each word.
                paths = synset[0].hypernym_paths()
                a_path = []
                for num in range(0,len(paths)):
                    a_path.append(len([synset.name for synset in paths[num]]))
                    #I am taking the path with the minimum number of hypernyms, but this could be calculated some other way.
                    path_num = min(a_path)
                total_list.append( (w, path_num) )
    return total_list

In [228]:
'''
Consult wordnet for the situation of a noun and verb with respect to its station in the hypernym hierarchy. 
Based on current SOA, it is acceptable to simply grab the top-level (.01) synset.

Args:
    tagged_sample: a spacy doc

Return:
    specificity: a value conveying the "specificity" of the input, via Nelson (2020)
'''

def specificity(sample):
    tagged_sample=nlp(sample)
    hyper_sum = 0
    noun_and_verb_count = 0
    for word in tagged_sample:
        if not wn.synsets(word.lemma_):
            continue
        else:
            if word.pos_ == "NOUN" or word.pos_ == "VERB":
                noun_and_verb_count +=1
                # if it's a verb, get the most common verb hypernym chain
                # else, get the most common noun hypernym chain
                pos = word.pos_
                tag = "n" if pos.startswith("N") else "v"
                synset = word.lemma_ + "." + tag + ".01"
                hyper_sum += len(list(wn.synset(synset).closure(hyper)))
    
    return hyper_sum / noun_and_verb_count

In [229]:
specificity_nelson('The quick red fox jumped over the brown lazy dog.')

[('quick', 7),
 ('red', 8),
 ('fox', 14),
 ('jumped', 2),
 ('over', 9),
 ('brown', 8),
 ('dog', 9)]

In [230]:
(7 + 8 + 14 + 2 + 9 + 8 + 9) / 7

8.142857142857142

In [235]:
specificity('The quick red fox jumped over the brown lazy dog.')

9.333333333333334

In [None]:
# parts of speech work
# spaCy on each description
# column view a la Bal, Tenen
# specificity (Nelson 2020)
# descriptive words / total words (quite pessimistic)
# words per unique thing (Tenen) -- in just these descriptive passages; aka Unique Clutter Distance
# words per thing (Tenen) -- in just these descriptive passages (self-selecting sample); aka Clutter Distance

In [None]:
# topic model

In [None]:
# time series
# would need:
# number of fragments total
# number of descriptive fragments
# publish years for each work

In [None]:
# embeddings.. 
# universal sentence encoder, across each description, and then cluster together?
# looking for different authors creating similar descriptions ...