## Notebook Overview

Explore dataset from [RELiC](https://relic.cs.umass.edu/), a novel information retrieval task.

In [1]:
import json

In [2]:
'''
Read in .json of RELiC data as a nested dict.
Data structured as follows:

{
    book_n: {
        quotes: {
            quote_id: [
                [quote_n_left] # 4 sentences, left 'context' of critical claim
                sentence_id # index of the corresponding sentence in `sentences`
                sentence_window_size # number from 1-5 =to how many original sentences inhere the claim
                [quote_n_right] # 4 sentences, right 'context' of critical claim
            ], ...
        },
        sentences: [sentence_1, sentence_2], # all the sentences in the work
        candidates: { # tracks which sentences are eligible to be 'expanded' e.g. if sentence #7 is used in a claim that's 3 sentences long, we should retrieve sentences 7,8,9.
            1_sentence: [all sentence_id's],
            ...,
            5-sentence: [all but last 4 sentence_ids (prevent OOB error)], 
        }
    },
}
'''
def read_data(filename):
    
    f = open(filename)
    data = json.load(f)
    
    return data
    

In [3]:
data = read_data('data/relic-train.json')

In [4]:
# what books are in the dataset:
for i in enumerate(data):
    print(i)

(0, 'brothers_karamazov')
(1, 'to_the_lighthouse')
(2, 'the_pickwick_papers')
(3, 'david_copperfield')
(4, 'animal_farm')
(5, 'the_scarlet_letter')
(6, 'a_portrait_of_the_artist_as_a_young_man')
(7, 'the_turn_of_the_screw')
(8, 'the_souls_of_black_folk')
(9, 'adam_bede')
(10, 'sense_and_sensibility')
(11, 'martin_chuzzlewit')
(12, 'swanns_way')
(13, 'sister_carrie')
(14, 'daisy_miller')
(15, 'o_pioneers')
(16, 'the_red_badge_of_courage')
(17, 'little_dorrit')
(18, 'great_expectations')
(19, 'the_call_of_the_wild')
(20, 'mrs_dalloway')
(21, 'the_sport_of_the_gods')
(22, 'middlemarch')
(23, 'alices_adventures_in_wonderland')
(24, 'jacobs_room')
(25, '1984')
(26, 'house_of_mirth')
(27, 'nicholas_nickleby')
(28, 'moby_dick')
(29, 'oliver_twist')
(30, 'jane_eyre')
(31, 'this_side_of_paradise')
(32, 'madame_bovary')
(33, 'iola_leroy')
(34, 'frankenstein')
(35, 'the_age_of_innocence')
(36, 'lady_chatterlys_lover')
(37, 'maggie_a_girl_of_the_streets')
(38, 'wuthering_heights')
(39, 'the_ambass

In [180]:
'''
for a given work in the dataset, extract passages deemed by critics to be "descriptive"
'''
def extract_passages(data, book_title, search_list):
    # get associated data for a single title
    book_data = data[book_title]
    
    quotes = book_data["quotes"]
    sentences = book_data["sentences"]
    
    descriptive_sentences = {}
    sentence_ids = set()
    descriptive_count = 0
    claim_count = 0
    
    for quote_id in quotes:
        left_claim = ' '.join(quotes[quote_id][0]).lower()
        sentence_id = quotes[quote_id][1]
        quote_size = quotes[quote_id][2]
        right_claim = ' '.join(quotes[quote_id][3]).lower()
        
        # look for match on our phrases of interest
        descriptive = any(substring in left_claim for substring in search_list) or any(substring in right_claim for substring in search_list)
        
        if descriptive:
        # ensure the quoted passage has not already been tracked
            if sentence_id in sentence_ids or sentence_id + quote_size in sentence_ids:
                continue
            else:
                sentence_ids.add(sentence_id)
                sentence_ids.add(sentence_id + quote_size)

            descriptive_sentence_raw = sentences[sentence_id: sentence_id+quote_size]
            descriptive_sentence = []
            for d in descriptive_sentence_raw:
                bytes_string = bytes(d, encoding="raw_unicode_escape")
                descriptive_sentence.append(bytes_string.decode("utf-8", "strict"))
            
            # ensure the quote does not contain any dialogue, and is at least longer than 1 word
            if [d for d in descriptive_sentence if '"' in d or '\"' in d or '“' in d or '”' in d or d.startswith('-') or d.strip().startswith('-') or len(d.strip().split()) == 1] != []:
                continue

            # strip extra whitespace and re-join multi-"sentence" passages that were split on ";", ":", "..."
            descriptive_sentence = ' '.join([x.strip() for x in descriptive_sentence])

            # track the critical claim connected to a given quote
            descriptive_sentences[str(quote_id)] = descriptive_sentence

            descriptive_count +=1
                
        claim_count +=1

    print(f'Extracted {descriptive_count} out of {claim_count} analyzed passages in {book_title}.')
    return descriptive_sentences

In [181]:
descriptive_sentences = extract_passages(data, 'mansfield_park', ['descri', 'detail', 'zoom'])

Extracted 63 out of 756 analyzed passages in mansfield_park.


In [183]:
descriptive_sentences

{'050188': 'It was the abode of noise, disorder, and impropriety. Nobody was in their right place, nothing was done as it ought to be. She could not respect her parents as she had hoped.',
 '050212': 'that you could tolerate nothing that you were not used to; and a great deal more to the same purpose, to give them a knowledge of your character.',
 '050217': 'The day was uncommonly lovely. It was really March; but it was April in its mild air, brisk soft wind, and bright sun, occasionally clouded for a minute; and everything looked so beautiful under the influence of such a sky, the effects of the shadows pursuing each other on the ships at Spithead and the island beyond, with the ever-varying hues of the sea, now at high water, dancing in its glee and dashing against the ramparts with so fine a sound, produced altogether such a combination of charms for Fanny, as made her gradually almost careless of the circumstances under which she felt them.',
 '050218': 'she had never played the ga

In [None]:
'''
Stich together each descriptive sentence to its original paragraph
Return something like {passage:original paragraph}
Or perhaps, a dataframe, with columns: book title, claim ID /claim, passage, paragraph
'''
def paragraph_resolution(descriptive_sentences, book_title):
    
    