## Notebook Overview

Explore the dataset from [RELiC](https://relic.cs.umass.edu/), a novel information-retrieval task.

In [4]:
import json

In [5]:
def read_data(filename):
    '''
    Read a RELiC .json file and return its contents as a nested dict.

    The data is structured as follows:

    {
        book_n: {
            quotes: {
                quote_id: [
                    [quote_n_left],       # 4 sentences, left 'context' of the critical claim
                    sentence_id,          # index of the corresponding sentence in `sentences`
                    sentence_window_size, # number from 1-5: how many consecutive sentences
                                          # of the work the quoted passage spans
                    [quote_n_right],      # 4 sentences, right 'context' of the critical claim
                ], ...
            },
            sentences: [sentence_1, sentence_2, ...],  # all the sentences in the work
            candidates: {
                # Tracks which sentences are eligible to be 'expanded', e.g. if
                # sentence #7 is used in a claim that's 3 sentences long, we
                # should retrieve sentences 7, 8, 9.
                # One list per window size, 1 through 5 (exact key spelling
                # should be confirmed against the file).
                1_sentence: [all sentence_ids],
                ...,
                5_sentence: [all but the last 4 sentence_ids (prevents an OOB error)],
            }
        },
    }

    Parameters
    ----------
    filename : str or path-like
        Location of the .json file to load.

    Returns
    -------
    dict
        The parsed JSON document.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    '''
    # The context manager closes the handle even if parsing fails; JSON text is
    # UTF-8, so do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        return json.load(f)
    

In [6]:
# Load the RELiC training split once; the cells below all read from `data`.
data = read_data(filename='data/relic-train.json')

In [16]:
# List every book title in the dataset, paired with its position.
for position, title in enumerate(data):
    print((position, title))

(0, 'brothers_karamazov')
(1, 'to_the_lighthouse')
(2, 'the_pickwick_papers')
(3, 'david_copperfield')
(4, 'animal_farm')
(5, 'the_scarlet_letter')
(6, 'a_portrait_of_the_artist_as_a_young_man')
(7, 'the_turn_of_the_screw')
(8, 'the_souls_of_black_folk')
(9, 'adam_bede')
(10, 'sense_and_sensibility')
(11, 'martin_chuzzlewit')
(12, 'swanns_way')
(13, 'sister_carrie')
(14, 'daisy_miller')
(15, 'o_pioneers')
(16, 'the_red_badge_of_courage')
(17, 'little_dorrit')
(18, 'great_expectations')
(19, 'the_call_of_the_wild')
(20, 'mrs_dalloway')
(21, 'the_sport_of_the_gods')
(22, 'middlemarch')
(23, 'alices_adventures_in_wonderland')
(24, 'jacobs_room')
(25, '1984')
(26, 'house_of_mirth')
(27, 'nicholas_nickleby')
(28, 'moby_dick')
(29, 'oliver_twist')
(30, 'jane_eyre')
(31, 'this_side_of_paradise')
(32, 'madame_bovary')
(33, 'iola_leroy')
(34, 'frankenstein')
(35, 'the_age_of_innocence')
(36, 'lady_chatterlys_lover')
(37, 'maggie_a_girl_of_the_streets')
(38, 'wuthering_heights')
(39, 'the_ambass

In [38]:
def extract_passages(data, book_title, search_list, return_passages=False):
    '''
    For a given work in the dataset, find the passages deemed by critics to be
    "descriptive".

    A quoted passage counts as descriptive when the critic's text on either
    side of it (the left or right context) contains at least one of the search
    terms. Matching is by case-insensitive substring, so 'descri' matches
    "describes", "Description", etc.

    Each entry of data[book_title]["quotes"] is laid out as:

        quote_id: [
            [quote_n_left],       # 4 sentences, left 'context' of the critical claim
            sentence_id,          # index of the corresponding sentence in `sentences`
            sentence_window_size, # number from 1-5: how many consecutive sentences
                                  # of the work the quoted passage spans
            [quote_n_right],      # 4 sentences, right 'context' of the critical claim
        ]

    and data[book_title]["sentences"] is the list of all sentences in the work.

    Parameters
    ----------
    data : dict
        Nested RELiC dict keyed by book title.
    book_title : str
        Key of the work to search.
    search_list : iterable of str (or a single str)
        Substrings to look for in the critics' context. Empty strings are
        ignored, because an empty substring would match every claim.
    return_passages : bool, optional
        When True, return the matched passages instead of their count.

    Returns
    -------
    int
        Number of descriptive passages (default).
    dict
        If `return_passages` is True: maps quote_id to the list of sentences
        from the work that make up the quoted passage.

    Raises
    ------
    KeyError
        If `book_title` is not in `data`.

    Also prints a one-line summary of how many passages matched.
    '''
    book_data = data[book_title]
    quotes = book_data["quotes"]
    sentences = book_data["sentences"]

    # A bare string would otherwise be iterated character by character.
    if isinstance(search_list, str):
        search_list = [search_list]
    # The context is lower-cased before matching, so the terms must be too;
    # otherwise a term such as 'Detail' could never match anything.
    search_terms = [term.lower() for term in search_list if term]

    def mentions_any_term(context_sentences):
        # True if any search term occurs in the joined, lower-cased context.
        context = ' '.join(context_sentences).lower()
        return any(term in context for term in search_terms)

    descriptive_sentences = {}
    for quote_id, quote in quotes.items():
        left_context, sentence_id, quote_size, right_context = (
            quote[0], quote[1], quote[2], quote[3]
        )
        if mentions_any_term(left_context) or mentions_any_term(right_context):
            # The quoted passage is `quote_size` consecutive sentences
            # starting at `sentence_id`.
            descriptive_sentences[quote_id] = sentences[sentence_id: sentence_id + quote_size]

    descriptive_count = len(descriptive_sentences)
    claim_count = len(quotes)

    print(f'Found {descriptive_count} out of {claim_count} passages in {book_title}')
    return descriptive_sentences if return_passages else descriptive_count

In [39]:
# Count the passages in "To the Lighthouse" whose surrounding criticism
# mentions description, detail, or zooming in.
extract_passages(
    data,
    book_title='to_the_lighthouse',
    search_list=['descri', 'detail', 'zoom in'],
)

Found 215 out of 1831 passages in to_the_lighthouse


215