## Notebook Overview

Explore dataset from [RELiC](https://relic.cs.umass.edu/), a novel information retrieval task.

### Extract Descriptive Passages

In [37]:
import json
import pandas as pd

# bad encoding
from ftfy import fix_encoding

# POS
import spacy

# nltk for wordnet and tokenization
import nltk
from nltk.corpus import wordnet as wn
from nltk import sent_tokenize
from nltk import word_tokenize

In [2]:
pd.set_option('display.max_colwidth', None)

In [28]:
'''
Read in .json of RELiC data as a nested dict.
Data structured as follows:

{
    book_n: {
        quotes: {
            quote_id: [
                [quote_n_left] # 4 sentences, left 'context' of critical claim
                sentence_id # index of the corresponding sentence in `sentences`
                sentence_window_size # number from 1-5 =to how many original sentences inhere the claim
                [quote_n_right] # 4 sentences, right 'context' of critical claim
            ], ...
        },
        sentences: [sentence_1, sentence_2], # all the sentences in the work
        candidates: { # tracks which sentences are eligible to be 'expanded' e.g. if sentence #7 is used in a claim that's 3 sentences long, we should retrieve sentences 7,8,9.
            1_sentence: [all sentence_id's],
            ...,
            5-sentence: [all but last 4 sentence_ids (prevent OOB error)], 
        }
    },
}
Return a dict containing requisite fields
'''
def read_data(filename):
    
    f = open(filename, encoding="UTF-8")
    data = json.load(f)
    
    return data
    

In [29]:
train_data = read_data('data/relic-train.json')

In [30]:
val_data = read_data('data/relic-val.json')

In [31]:
relic_data = z = {**train_data, **val_data}

In [32]:
# what books are in the dataset:
for i in enumerate(relic_data):
    print(i)

(0, 'brothers_karamazov')
(1, 'to_the_lighthouse')
(2, 'the_pickwick_papers')
(3, 'david_copperfield')
(4, 'animal_farm')
(5, 'the_scarlet_letter')
(6, 'a_portrait_of_the_artist_as_a_young_man')
(7, 'the_turn_of_the_screw')
(8, 'the_souls_of_black_folk')
(9, 'adam_bede')
(10, 'sense_and_sensibility')
(11, 'martin_chuzzlewit')
(12, 'swanns_way')
(13, 'sister_carrie')
(14, 'daisy_miller')
(15, 'o_pioneers')
(16, 'the_red_badge_of_courage')
(17, 'little_dorrit')
(18, 'great_expectations')
(19, 'the_call_of_the_wild')
(20, 'mrs_dalloway')
(21, 'the_sport_of_the_gods')
(22, 'middlemarch')
(23, 'alices_adventures_in_wonderland')
(24, 'jacobs_room')
(25, '1984')
(26, 'house_of_mirth')
(27, 'nicholas_nickleby')
(28, 'moby_dick')
(29, 'oliver_twist')
(30, 'jane_eyre')
(31, 'this_side_of_paradise')
(32, 'madame_bovary')
(33, 'iola_leroy')
(34, 'frankenstein')
(35, 'the_age_of_innocence')
(36, 'lady_chatterlys_lover')
(37, 'maggie_a_girl_of_the_streets')
(38, 'wuthering_heights')
(39, 'the_ambass

In [33]:
'''
Helper for determining if a list of sentences contains dialogue offsets (quote marks, -'s for Joyce, single-word sentences, etc.')
'''
def dialogue_helper(sentences):
    return [d for d in sentences 
            if '"' in d or '\"' in d 
            or '“' in d or '”' in d 
            or d.startswith('-') 
            or d.strip().startswith('-') 
            or len(d.strip().split()) == 1] != []

In [40]:
'''
for a given work in the dataset, extract passages deemed by critics to be "descriptive"
based on a 'search_list' containing substrings to look for in a given passage
this method excludes passages that contain dialogue
this pessimistic sanitization should result in more readily 'descriptive' passages
return: a cleaned version of the sentence
'''
def extract_passages(data, book_title, search_list, truncate_claims, exclude_dialogue):
    # get associated data for a single title
    book_data = data[book_title]
    
    quotes = book_data["quotes"]
    sentences = book_data["sentences"]
    
    descriptive_sentences = {}
    sentence_ids = set()
    descriptive_count = 0
    
    for quote_id in quotes:
        sentence_id = quotes[quote_id][1]
        quote_size = quotes[quote_id][2]
        
        # ensure the quoted passage has not already been recovered
        # if part of it has already been recovered, that's okay, it will just be attached 'principally' to only 1 claim
        # this just helps us track how many fragments total are looked at by critics
        if sentence_id in sentence_ids:
            continue
        else:
            sentence_ids.add(sentence_id)
            if quote_size == 1:
                sentence_ids.add(sentence_id + 1)
            elif quote_size == 2:
                sentence_ids.add(sentence_id + 1)
                sentence_ids.add(sentence_id + 2)
            elif quote_size == 3:
                sentence_ids.add(sentence_id + 1)
                sentence_ids.add(sentence_id + 2)
                sentence_ids.add(sentence_id + 3)
            elif quote_size == 4:
                sentence_ids.add(sentence_id + 1)
                sentence_ids.add(sentence_id + 2)
                sentence_ids.add(sentence_id + 3)
                sentence_ids.add(sentence_id + 4)
            else:
                sentence_ids.add(sentence_id + 1)
                sentence_ids.add(sentence_id + 2)
                sentence_ids.add(sentence_id + 3)
                sentence_ids.add(sentence_id + 4)
                sentence_ids.add(sentence_id + 5)
        
        # like the paper, use windows of either full claim
        left_claim = ' '.join(quotes[quote_id][0]).lower()
        right_claim = ' '.join(quotes[quote_id][3]).lower()
        
        if left_claim is None or right_claim is None or left_claim == "" or right_claim == "":
            continue
        
        # or, if desired, 1 sentence on either side of the quote
        if truncate_claims:
            left_claim = sent_tokenize(left_claim)[-1]
            right_claim = sent_tokenize(right_claim)[0]
                
        # look for match on our phrases of interest
        descriptive = any(substring in left_claim for substring in search_list) or any(substring in right_claim for substring in search_list)
        
        if descriptive:
            # fix encoding issues with json conversion
            descriptive_sentence_raw = sentences[sentence_id: sentence_id+quote_size]
            
            descriptive_sentence = []
            for d in descriptive_sentence_raw:
                descriptive_sentence.append(fix_encoding(d))
            
            # ensure the quote does not contain any dialogue, and is at least longer than 1 word
            remove = False
            if exclude_dialogue:
                # check descriptive sentence
                if dialogue_helper(descriptive_sentence):
                    remove = True
            if remove:
                continue

            # strip extra whitespace and re-join multi-"sentence" passages that were split on ";", ":", "..."
            descriptive_sentence = ' '.join([x.strip() for x in descriptive_sentence])

            # track the critical claim connected to a given quote
            descriptive_sentences[str(quote_id)] = descriptive_sentence
            
            descriptive_count +=1
                
    print(f'Extracted {descriptive_count} out of {len(sentence_ids)} analyzed fragments in {book_title}.')
    return descriptive_sentences

In [41]:
# fill up dict with passages, grouped by book
descriptive_passages = {}
for i in enumerate(relic_data):
    print(i[1])
    descriptive_passages[i[1]] = extract_passages(relic_data, i[1], ['descri', 'detail', 'zoom', 'poet', 'lyric', 'vivid', 'imag'], truncate_claims = False, exclude_dialogue = True)

brothers_karamazov
Extracted 136 out of 3082 analyzed fragments in brothers_karamazov.
to_the_lighthouse
Extracted 220 out of 2182 analyzed fragments in to_the_lighthouse.
the_pickwick_papers
Extracted 127 out of 1750 analyzed fragments in the_pickwick_papers.
david_copperfield
Extracted 335 out of 3291 analyzed fragments in david_copperfield.
animal_farm
Extracted 32 out of 481 analyzed fragments in animal_farm.
the_scarlet_letter
Extracted 192 out of 1801 analyzed fragments in the_scarlet_letter.
a_portrait_of_the_artist_as_a_young_man
Extracted 421 out of 2881 analyzed fragments in a_portrait_of_the_artist_as_a_young_man.
the_turn_of_the_screw
Extracted 82 out of 1383 analyzed fragments in the_turn_of_the_screw.
the_souls_of_black_folk
Extracted 51 out of 702 analyzed fragments in the_souls_of_black_folk.
adam_bede
Extracted 86 out of 1052 analyzed fragments in adam_bede.
sense_and_sensibility
Extracted 64 out of 1411 analyzed fragments in sense_and_sensibility.
martin_chuzzlewit
Ex

In [42]:
# what are the descriptive passages in MD?
descriptive_passages['moby_dick']

{'030170': 'And Ahab chanced so to stand, that the Parsee occupied his shadow; while, if the Parsee’s shadow was there at all it seemed only to blend with, and lengthen Ahab’s.',
 '030172': 'Life folded Death; Death trellised Life; the grim god wived with youthful Life, and begat him curly-headed glories.',
 '030174': 'But as I was crowded for space, and wished the other parts of my body to remain a blank page for a poem I was then composing—at least, what untattooed parts might remain—I did not trouble myself with the odd inches; nor, indeed, should inches at all enter into a congenial admeasurement of the whale.',
 '030178': 'how cheerfully we consign ourselves to perdition!',
 '030188': 'But calm, snow-white, and unvarying; still directing its fountain of feathers to the sky; still beckoning us on from before, the solitary jet would at times be descried.',
 '030189': 'The unharming sharks, they glided by as if with padlocks on their mouths; the savage sea-hawks sailed with sheathed 

In [43]:
'''
create an intermediate results dataframe containing the relevant, extracted sentences along with supporting metadata
'''
def make_results_df(descriptive_passages, full_data, search_list, truncate_claims):
    
    books = []
    claim_ids = []
    left_claims = []
    left_claim_keywords = []
    right_claims = []
    right_claim_keywords = []
    passage_ids = []
    passages = []
    passage_sizes = []
    
    for book, data in descriptive_passages.items():
        for k, v in data.items():
            block = full_data[book]['quotes'][k]
            # title of book
            books.append(book)
            # claim id, linking critical claim to quoted passage
            claim_ids.append(k)
            # critical claim
            left_claim = ' '.join(block[0]).lower()
            right_claim = ' '.join(block[3]).lower()
            if truncate_claims:
                left_claim = sent_tokenize(left_claim)[-1]
                right_claim = sent_tokenize(right_claim)[0]
            
            # track claim
            left_claims.append(left_claim)
            right_claims.append(right_claim)
            
            # track which claim contains which keywords
            left_claim_keywords_temp = []
            right_claim_keywords_temp = []
                    
            for w in word_tokenize(left_claim):
                for term in search_list:
                    if term in w:
                        left_claim_keywords_temp.append(w)
                        
            for w in word_tokenize(right_claim):
                for term in search_list:
                    if term in w:
                        right_claim_keywords_temp.append(w)
            
            # trtoreack keywords
            left_claim_keywords.append(left_claim_keywords_temp)
            right_claim_keywords.append(right_claim_keywords_temp)
            
            # quoted passage
            passages.append(v)
            
            # 'starting' sentence id of passage
            passage_ids.append(block[1])
            
            # 'window size' of passage (based on authors' tokenization scheme)
            passage_sizes.append(block[2])
    
    return pd.DataFrame({'passage': passages, 'book': books, 'left_claim': left_claims, 'left_claim_keywords': left_claim_keywords, 'right_claim': right_claims, 'right_claim_keywords': right_claim_keywords, 'claim_id': claim_ids, 'passage_id': passage_ids, 'passage_size': passage_sizes})


In [44]:
df = make_results_df(descriptive_passages, relic_data, ['descri', 'detail', 'zoom', 'poet', 'lyric', 'vivid', 'imag'], truncate_claims=False)

In [45]:
df.shape

(8765, 9)

In [54]:
df[df['book'] == 'moby_dick'].head()

Unnamed: 0,passage,book,left_claim,left_claim_keywords,right_claim,right_claim_keywords,claim_id,passage_id,passage_size
3916,"And Ahab chanced so to stand, that the Parsee occupied his shadow; while, if the Parsee’s shadow was there at all it seemed only to blend with, and lengthen Ahab’s.",moby_dick,"he repeats those words which ahab narcissistically self-deluded does not realize are his own. finally, fedallah foreshadows ahab's death just as echo does narcissus'. related to the ""phantom"" imagery are moby-dick's numerous descriptions of fedallah as ahab's ""shadow,"" descriptions which establish an even firmer tie between ahab and the parsee and the narcissus story. while fedallah is comparing the whale's wrinkles to the lines in his palm, ""","[imagery, descriptions, descriptions]","starbuck defines fedallah as ahab's ""evil shadow"", and the crew speculates whether the parsee is a mortal being or ""a tremulous shadow cast upon the deck by some unseen being's body"". ishmael too senses that ahab and fedallah gaze at each other ""as if in the parsee ahab saw his forethrown shadow, in ahab the parsee his abandoned substance"". (the sentence itself balances like a reflection.) more importantly, ""the symphony"" portrays two episodes, whose ""shadow"" imagery plays heavily upon the narcissus motif.",[imagery],30170,7386,2
3917,"Life folded Death; Death trellised Life; the grim god wived with youthful Life, and begat him curly-headed glories.",moby_dick,"the space here is additionally described as a ""chapel"" and is the site of an altar maintained by the island's priestly caste. it is then appropriate that the skeleton itself is carefully located by ishmael in a shady glen, ""where a grand temple of lordly palms now sheltered it"". given the contested and chaotic state of affairs both in ishmael's narrative and aboard the decks of the pequod, the whale chapel is indeed ""a wondrous sight"". and the attractiveness of the stable, hierarchically ordered world represented by tranquo's whale skeleton is evident in the generative imagery of vines and flowers that clothe the bones and inspire ishmael's acclamation:","[described, imagery]","but it is important to note the reciprocal image in ishmael's paean. this wondrous whale is not the triumph of life over death, but rather their admixture. a similar ambivalence is on display in the image of ishmael's famous weaver god, who weaves ""and by that weaving is he deafened, that he hears no mortal voice; and by that humming, we, too, who look on the loom are deafened"".","[image, image]",30172,10031,3
3918,"But as I was crowded for space, and wished the other parts of my body to remain a blank page for a poem I was then composing—at least, what untattooed parts might remain—I did not trouble myself with the odd inches; nor, indeed, should inches at all enter into a congenial admeasurement of the whale.",moby_dick,"can this sort of romance be called a novel? obviously nobydick is not a novel in terms of most definitions. but it is also obvious that there are no absolute rules about novels in general-or in melville's words, about ""the art of telling the truth. ""\n10 therefore, we may classify those chapters, which in essence deal with actions like navigating, wat weaving, whale hunting, cuttingin, trying-out, and chasing as ""narratives without bothering much with the restrictions supposedly imposed upon the first-person narration-""",[],"taking advantage of this principle, i classify as ""nº even such a chapter as ""the chart"", which opens, ""had you followed captain ahab down into his cabin . ..\n* because, despite its opening, it describes the situations in the cabin that bring about the change of the ship's, as well as story's, course. if we anatomize moby dick; the thale in this way, about half of its chapters can be cataloged as 'narrative,"" although there is a definite narratological difference between the land-narrative sequence and the sea-narrative sequence interspersed with ""drama,"" ""cetology,"" and ""gam"" chapters.",[describes],30174,10065,2
3919,how cheerfully we consign ourselves to perdition!,moby_dick,"""\nbut being paid,what will compare with it? ""\neven here, however, ishmael turns the thought against those above him: momentarily it is he, a poor sailor, not they, enjoying the felicity of being paid. but-so his thought runs on-to receive much pay leads to wealth, and wealth leads to destruction, for money is ""the root of all earthly ills,"" and ""on no account can a monied man enter heaven.",[],"in this last line of thought two motifs are twined: that of money (coupled with injustice) and that of self-destruction, in the image of consigning ourselves to perdition. the first paired motif, that of money-injustice, comes in for fuller development in the following chapter in ishmael's lack of money for a night's lodging and in the passage about lazarus and dives. the second, that of self-destruction in the image of damnation or going-to-hell, first appears as such here but will often recur.","[image, image]",30178,97,1
3920,"But calm, snow-white, and unvarying; still directing its fountain of feathers to the sky; still beckoning us on from before, the solitary jet would at times be descried.",moby_dick,"the narrator ... the air. ""-ibid., pp. the ""spirit-spout"" in chapter li-""",[],"""-is related to ishmael's initial vision of the whale. in contrast, see the description of the painting in the entry of the spouter-inn in chapter iii, which is a kind of alternative vision. it is significant... realize themselves.-ishmael's character is adapted to his role as narrator;",[description],30188,5423,3


In [52]:
df[df['passage_id'] == '6525']

Unnamed: 0,passage,book,left_claim,left_claim_keywords,right_claim,right_claim_keywords,claim_id,passage_id,passage_size


### Analyze Critical Claims

* what keywords
    * categories / grounded coding
* what are they saying
* do claims require 'context' per se

#### Keywords -> Grounded Coding


In [37]:
def get_substr_match(l, key_sub):
    
    return key_sub in " ".join(l)

In [51]:
def get_which_contains_match(left_claim, right_claim, key_sub):
        
    if key_sub in " ".join(left_claim) and key_sub in " ".join(right_claim):
        return "Both"
        
    elif key_sub in " ".join(right_claim):
        return "Right"
    
    return "Left"

In [58]:
# filter down to left claim or right claim sublist contains 'descrip'
descriptive_df = df[df.apply(lambda x: get_substr_match(x.left_claim_keywords, 'descr'), axis=1) 
                | df.apply(lambda x: get_substr_match(x.right_claim_keywords, 'descr'), axis=1)]

# filter down to left claim or right claim sublist contains 'detail'
detail_df = df[df.apply(lambda x: get_substr_match(x.left_claim_keywords, 'detail'), axis=1) 
                | df.apply(lambda x: get_substr_match(x.right_claim_keywords, 'detail'), axis=1)]

In [59]:
descriptive_df.shape

(3973, 9)

In [60]:
detail_df.shape

(940, 9)

In [64]:
# add a column that tells me where I should look--for instance if both the left and right claim contain a word that's helpful

descr_match_output = descriptive_df.apply(lambda x: get_which_contains_match(x.left_claim_keywords, x.right_claim_keywords, 'descrip'), axis=1)

# add a column that tells me where I should look--for instance if both the left and right claim contain a word that's helpful

detail_match_output = detail_df.apply(lambda x: get_which_contains_match(x.left_claim_keywords, x.right_claim_keywords, 'detail'), axis=1)


In [65]:
descriptive_df['match_output'] = descr_match_output
detail_df['match_output'] = detail_match_output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  descriptive_df['match_output'] = descr_match_output
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  detail_df['match_output'] = detail_match_output


In [66]:
# shuffle detail df
detail_df = detail_df.sample(frac=1).reset_index(drop=True)
# shuffle descriptive df
descriptive_df = descriptive_df.sample(frac=1).reset_index(drop=True)

In [73]:
t = detail_df[detail_df['match_output'] == 'Both']

In [78]:
t[['claim_id','left_claim', 'passage', 'right_claim']].to_csv('detail_claims_subset.csv')

In [79]:
# write to .csv

detail_df.to_csv('detail_claims_subset.csv')
descriptive_df.to_csv('descriptive_claims_subset.csv')

### Analyze Descriptive Passages

* content
    * what types of passages
    * which passages are selected repeatedly
* parts of speech based
    * specificity [done]
* time series
    * where in works are passages
    * how does critical attention change over time
* topic modeling
* neural

#### Parts of Speech Based Analysis

In [17]:
nlp = spacy.load('en_core_web_sm', disable=['ner,parser'])
nlp.remove_pipe('ner')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7ff3a80af820>)

In [18]:
#Function from http://www.nltk.org/howto/wordnet.html to get *all* of a synset's hyponym/hypernyms
hyper = lambda s: s.hypernyms()

In [19]:
'''
Consult wordnet for the situation of a noun and verb with respect to its station in the hypernym hierarchy. 
Based on current SOA, it is acceptable to simply grab the top-level (.01) synset.

Args:
    tagged_sample: a spacy doc

Return:
    specificity: a value conveying the "specificity" of the input, via Nelson (2020)
'''

def specificity(sample):
    tagged_sample=nlp(sample)
    hyper_sum = 0
    noun_and_verb_count = 0
    for word in tagged_sample:
        if not wn.synsets(word.lemma_):
            continue
        else:
            if word.pos_ == "NOUN" or word.pos_ == "VERB":
                noun_and_verb_count +=1
                # if it's a verb, get the most common verb hypernym chain
                # else, get the most common noun hypernym chain
                pos = word.pos_
                tag = "n" if pos.startswith("N") else "v"
                synset = word.lemma_ + "." + tag + ".01"
                hyper_sum += len(list(wn.synset(synset).closure(hyper)))
    
    return hyper_sum / noun_and_verb_count

In [None]:
# parts of speech work
# spaCy on each description
# column view a la Bal, Tenen
# specificity (Nelson 2020)
# descriptive words / total words (quite pessimistic)
# words per unique thing (Tenen) -- in just these descriptive passages; aka Unique Clutter Distance
# words per thing (Tenen) -- in just these descriptive passages (self-selecting sample); aka Clutter Distance

In [None]:
# topic model

In [None]:
# time series
# would need:
# number of fragments total
# number of descriptive fragments
# publish years for each work

In [None]:
# embeddings.. 
# universal sentence encoder, across each description, and then cluster together?
# looking for different authors creating similar descriptions ...