# Mining testimonial fragments of the Holocaust

**Experience domain:**

### Load the necessary libraries

In [1]:
import sys; sys.path.insert(0, '..')
import itertools

In [2]:
import get_topic_model_concordance as topic_concordancer
from utils import blacklab, db, text
mongo = db.get_db()

In [3]:
%config Completer.use_jedi = False
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import random

### Helper functions

In [4]:
def create_contextual_query(lemmas,context_length=50):
    """Build a query matching all *lemmas* in any order.

    Every ordering of ``lemmas`` becomes one parenthesised alternative in
    which consecutive ``[lemma="..."]`` terms are separated by a
    ``[]{0,context_length}`` gap; the alternatives are joined with ``|``.

    Args:
        lemmas: sequence of lemma strings.
        context_length: upper bound used in the gap between two lemmas.

    Returns:
        The query as a single string.
    """
    gap = '[]{0,' + str(context_length) + '}'
    alternatives = (
        '(' + gap.join('[lemma="' + lemma + '"]' for lemma in ordering) + ')'
        for ordering in itertools.permutations(lemmas, len(lemmas))
    )
    return '|'.join(alternatives)
        
        
            

In [5]:
from utils import blacklab, db, text
import requests
import json
def find_sentence_id(label):
    """Locate a quoted passage in the corpus and return its sentence span.

    The passage is tokenized by the annotation server listening on
    ``localhost:9000`` (NOTE(review): presumably Stanford CoreNLP -- confirm),
    converted token by token into a query, and searched with
    ``blacklab.search_blacklab``. Only the first hit is used.

    Query construction per token:
      * a token containing ``...`` is dropped at either end of the passage
        and becomes ``[]{0,50}`` elsewhere;
      * a token containing ``-`` becomes ``[]{0,3}``;
      * a token containing ``n't``, ``'re``, ``?``, ``.``, ``'s`` or ``,``
        becomes the single-token wildcard ``[]``;
      * any other token is matched literally.

    Args:
        label: the passage text to look up.

    Returns:
        ``(start_sentence_index, end_sentence_index, testimony_id)`` for the
        first hit, or ``None`` when the search or the token lookup fails (the
        query is printed in that case).
    """
    props = {'annotators': 'tokenize'}

    r = requests.post('http://localhost:9000/',
                      params={'properties': json.dumps(props)},
                      data=label.encode('utf-8'))
    # The request body is sent as UTF-8, so decode the reply the same way.
    # (The previous ``requests.encoding = 'utf-8'`` only set an unused
    # attribute on the ``requests`` module.)
    r.encoding = 'utf-8'
    # ``json.loads`` lost its ``encoding`` keyword in Python 3.9; passing it
    # raises TypeError there, and it was ignored before that.
    result = json.loads(r.text)

    parsed_tokens = result['tokens']
    last_index = len(parsed_tokens) - 1
    single_wildcard_markers = ("n't", "'re", "?", ".", "'s", ",")

    query = []
    for i, token in enumerate(parsed_tokens):
        word = token['word']
        if '...' in word:
            # A leading or trailing ellipsis contributes nothing to the query.
            if i == 0 or i == last_index:
                continue
            query.append('[]{0,50}')
        elif '-' in word:
            query.append('[]{0,3}')
        elif any(marker in word for marker in single_wildcard_markers):
            query.append('[]')
        else:
            query.append('["' + word + '"]')

    query = ' '.join(query)
    try:
        sentence = blacklab.search_blacklab(query, window=0,
                                            lemma=False,
                                            include_match=True)
        hit = sentence[0]
        token_end = hit['token_end']
        token_start = hit['token_start']
        print (hit)
        mongo = db.get_db()
        results = mongo.tokens.find({'testimony_id': hit['testimony_id']},
                                    {'_id': 0})
        testimony_tokens = list(results)[0]['tokens']
        sentenceStart = testimony_tokens[token_start]['sentence_index']
        sentenceEnd = testimony_tokens[token_end]['sentence_index']
        return (sentenceStart, sentenceEnd, hit['testimony_id'])
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit still propagate; lookup failures stay best-effort.
        print("The following query returned a null result")
        print(query)
        return None
        
            


In [6]:
def create_parent_node(label):
    """Build the top-level fragment document for *label*.

    The document carries the label, a random ``essay_id`` between 1 and 20,
    and a ``tree`` root produced by ``get_node(..., is_parent=True)`` whose
    label is set to *label*.
    """
    placeholder_testimony_id = random.randint(1, 20)
    essay_id = random.randint(1, 20)
    root = get_node(placeholder_testimony_id, {'label': label}, is_parent=True)
    root['label'] = label
    return {'label': label, 'essay_id': essay_id, 'tree': root}

In [7]:
def get_node(testimony_id, node, is_parent=False):
    """Return a tree node dict built from *node*.

    With ``is_parent=True`` only ``node['label']`` is read and the remaining
    fields are filled with random integers between 1 and 20. Otherwise the
    testimony id is copied from *node* and the media / sentence fields are
    converted to ``float``. Both variants start with an empty ``children``
    list. The ``testimony_id`` argument is not used.
    """
    positional_fields = ('media_index', 'media_offset',
                         'start_sentence_index', 'end_sentence_index')

    if not is_parent:
        leaf = {'label': node['label'],
                'testimony_id': node['testimony_id']}
        for field in positional_fields:
            leaf[field] = float(node[field])
        leaf['children'] = []
        return leaf

    parent = {'label': node['label']}
    for field in ('testimony_id',) + positional_fields:
        parent[field] = random.randint(1, 20)
    parent['children'] = []
    return parent

In [8]:
def check_if_main_node_exist(node):
    """Return True when a fragment document labelled *node* is stored.

    Args:
        node: label of the main node to look for in ``mongo.fragments``.

    Returns:
        ``True`` if a matching document exists, ``False`` otherwise.
    """
    # ``find_one`` yields None for "no match". The previous ``find(...)[0]``
    # raised IndexError on an empty cursor, so False was never returned.
    return mongo.fragments.find_one({'label': node}, {'_id': 0}) is not None

In [9]:
def add_main_node(label):
    """Create and store a new main-node document for *label*."""
    # ``Collection.insert`` is deprecated since PyMongo 3 and removed in
    # PyMongo 4; ``insert_one`` matches the ``delete_one`` / ``replace_one``
    # calls used elsewhere in this notebook.
    mongo.fragments.insert_one(create_parent_node(label))

In [10]:
def delete_main_node(label):
    """Remove one fragment document whose ``label`` equals *label*."""
    selector = {'label': label}
    mongo.fragments.delete_one(selector)

In [11]:
def add_testimonial_fragments(fragments):
    """Attach a mid node with its leaf fragments to a stored main node.

    Args:
        fragments: dict with the keys
            ``'main_node'`` -- label of the stored main node,
            ``'mid_node'``  -- label of the mid node to create,
            ``'fragments'`` -- list of fragment dicts; each is turned into a
            leaf by ``get_node`` and so needs the keys that function reads.

    Nothing is written, and a message is printed, when the main node does not
    exist or already has a child labelled ``fragments['mid_node']``.
    """
    main_label = fragments['main_node']

    # One lookup both checks existence and fetches the document; previously a
    # missing main node was never reported to the user.
    document = mongo.fragments.find_one({'label': main_label}, {'_id': 0})
    if document is None:
        print ("main node does not exist cannot be added")
        return

    siblings = document['tree']['children']
    if any(child['label'] == fragments['mid_node'] for child in siblings):
        print ("mid node exists cannot be added")
        return

    # The first argument of get_node is not used by it; 'r' is kept as-is.
    mid_node = get_node('r', {'label': fragments['mid_node']}, is_parent=True)
    mid_node['children'].extend(
        get_node(fragment['testimony_id'], fragment)
        for fragment in fragments['fragments']
    )
    siblings.append(mid_node)
    mongo.fragments.replace_one({'label': main_label}, document)

### Add the main node

In [12]:
# Label of the main node for this notebook. Any stored document with this
# label is deleted first, then a fresh one is inserted.
main_node = "leave"
delete_main_node(main_node)
add_main_node(main_node)

  


### Set up the query

# Pattern matching every token whose lemma is "leave".
query = '[lemma="leave"]'

# NOTE(review): window / topicn presumably set the context width and the
# number of topics -- confirm against get_topic_model_concordance.main.
result = topic_concordancer.main(query,window=20,topicn=25)

### Print the key topics

# Show each topic's index followed by the second entry of its 'topic_words'.
for i, element in enumerate(result['topic_documents']):
    topic_words = element['topic_words'][1]
    print(i, topic_words, '\n', sep='\n')

### Analyze documents

# Index of the topic to inspect; the first 25 of its texts are printed.
i = 0
# The loop variable is deliberately not called ``text``: that name is the
# ``text`` module imported from ``utils`` and would be overwritten.
for document in result['topic_documents'][i]['texts'][0:25]:
    print (document['matched_text_words'])
    print ('\n')

## Testimonial fragments

### 1.  

In [13]:
# Lemmas that must co-occur in a passage; input to create_contextual_query.
lemmas = ["leave","family"]

In [14]:
# Both lemmas in either order, separated by a `[]{0,10}` gap.
query = create_contextual_query(lemmas,context_length=10)
print (query)

([lemma="leave"][]{0,10}[lemma="family"])|([lemma="family"][]{0,10}[lemma="leave"])


In [15]:
# Used below as the label of the mid node that groups this set of fragments.
domain_term = "family"

In [16]:
# Container handed to add_testimonial_fragments once the leaves are collected.
fragments = {
    'main_node': main_node,
    'mid_node': domain_term,
    'fragments': [],
}

In [17]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_1 = {
    'original_sentence': "And I left really-- I was very close to my family",
    'label': "And I left really-- I was very close to my family (..).",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_1['original_sentence'])
fragment_1.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_1)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22And%22%5D+%5B%22I%22%5D+%5B%22left%22%5D+%5B%22really%22%5D+%5B%5D%7B0%2C3%7D+%5B%22I%22%5D+%5B%22was%22%5D+%5B%22very%22%5D+%5B%22close%22%5D+%5B%22to%22%5D+%5B%22my%22%5D+%5B%22family%22%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'And I left really -- I was very close to my family ', 'right': '', 'complete_match': 'And I left really -- I was very close to my family ', 'testimony_id': 'HVT-29', 'shelfmark': ['Fortunoff Archive HVT-29'], 'token_start': 7221, 'token_end': 7233}


In [18]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_2 = {
    'original_sentence': "They have left their – their families to – to save themselves, and they joined the resistance.",
    'label': "They have left their – their families to – to save themselves, and they joined the resistance.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_2['original_sentence'])
fragment_2.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_2)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22They%22%5D+%5B%22have%22%5D+%5B%22left%22%5D+%5B%22their%22%5D+%5B%5D%7B0%2C3%7D+%5B%22their%22%5D+%5B%22families%22%5D+%5B%22to%22%5D+%5B%5D%7B0%2C3%7D+%5B%22to%22%5D+%5B%22save%22%5D+%5B%22themselves%22%5D+%5B%5D+%5B%22and%22%5D+%5B%22they%22%5D+%5B%22joined%22%5D+%5B%22the%22%5D+%5B%22resistance%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'They have left their – their families to – to save themselves , and they joined the resistance . ', 'right': '', 'complete_match': 'They have left their – their families to – to save themselves , and they joined the resistance . ', 'testimony_id': 'irn42018', 'shelfmark': ['USHMM RG-50.030*0586'], 'token_start': 23289, 'token_end': 23308}


In [19]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_3 = {
    'original_sentence': "the constant worry about the family left behind was always there.",
    'label': "(..)the constant worry about the family left behind was always there.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_3['original_sentence'])
fragment_3.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_3)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22the%22%5D+%5B%22constant%22%5D+%5B%22worry%22%5D+%5B%22about%22%5D+%5B%22the%22%5D+%5B%22family%22%5D+%5B%22left%22%5D+%5B%22behind%22%5D+%5B%22was%22%5D+%5B%22always%22%5D+%5B%22there%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'the constant worry about the family left behind was always there . ', 'right': '', 'complete_match': 'the constant worry about the family left behind was always there . ', 'testimony_id': 'irn504627', 'shelfmark': ['USHMM RG-50.030*0133'], 'token_start': 4955, 'token_end': 4967}


In [20]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_4 = {
    'original_sentence': "they were crying like little babies, that they left their family.",
    'label': "(..)they were crying like little babies, that they left their family.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_4['original_sentence'])
fragment_4.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_4)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22they%22%5D+%5B%22were%22%5D+%5B%22crying%22%5D+%5B%22like%22%5D+%5B%22little%22%5D+%5B%22babies%22%5D+%5B%5D+%5B%22that%22%5D+%5B%22they%22%5D+%5B%22left%22%5D+%5B%22their%22%5D+%5B%22family%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'they were crying like little babies , that they left their family . ', 'right': '', 'complete_match': 'they were crying like little babies , that they left their family . ', 'testimony_id': 'irn504663', 'shelfmark': ['USHMM RG-50.030*0174'], 'token_start': 6003, 'token_end': 6016}


In [21]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_5 = {
    'original_sentence': "a lot f Jews could be saved, they could save themselves to run away, or do something, but they wouldn’t do it because they didn’t want to leave their families, their fathers, their mothers, their sisters and brothers.",
    'label': "(..)to run away, or do something, but they wouldn’t do it because they didn’t want to leave their families (..).",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_5['original_sentence'])
fragment_5.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_5)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22a%22%5D+%5B%22lot%22%5D+%5B%22f%22%5D+%5B%22Jews%22%5D+%5B%22could%22%5D+%5B%22be%22%5D+%5B%22saved%22%5D+%5B%5D+%5B%22they%22%5D+%5B%22could%22%5D+%5B%22save%22%5D+%5B%22themselves%22%5D+%5B%22to%22%5D+%5B%22run%22%5D+%5B%22away%22%5D+%5B%5D+%5B%22or%22%5D+%5B%22do%22%5D+%5B%22something%22%5D+%5B%5D+%5B%22but%22%5D+%5B%22they%22%5D+%5B%22would%22%5D+%5B%5D+%5B%22do%22%5D+%5B%22it%22%5D+%5B%22because%22%5D+%5B%22they%22%5D+%5B%22did%22%5D+%5B%5D+%5B%22want%22%5D+%5B%22to%22%5D+%5B%22leave%22%5D+%5B%22their%22%5D+%5B%22families%22%5D+%5B%5D+%5B%22their%22%5D+%5B%22fathers%22%5D+%5B%5D+%5B%22their%22%5D+%5B%22mothers%22%5D+%5B%5D+%5B%22their%22%5D+%5B%22sisters%22%5D+%5B%22and%22%5D+%5B%22brothers%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'a lot f Jews could be saved , they could save themselves to run away , or do something , but they would n’t do it because 

In [22]:
# Store the collected fragments as a new mid node under the main node.
add_testimonial_fragments(fragments)

### 2.  

In [23]:
# Lemmas that must co-occur in a passage; input to create_contextual_query.
lemmas = ["leave","cry"]

In [24]:
# Both lemmas in either order, separated by a `[]{0,10}` gap.
query = create_contextual_query(lemmas,context_length=10)
print (query)

([lemma="leave"][]{0,10}[lemma="cry"])|([lemma="cry"][]{0,10}[lemma="leave"])


In [25]:
# Used below as the label of the mid node that groups this set of fragments.
domain_term = "cry"

In [26]:
# Container handed to add_testimonial_fragments once the leaves are collected.
fragments = {
    'main_node': main_node,
    'mid_node': domain_term,
    'fragments': [],
}

In [27]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_1 = {
    'original_sentence': "I remember crying after them leaving, and they were crying and – and I ran after them.",
    'label': "I remember crying after them leaving, and they were crying and – and I ran after them.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_1['original_sentence'])
fragment_1.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_1)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22I%22%5D+%5B%22remember%22%5D+%5B%22crying%22%5D+%5B%22after%22%5D+%5B%22them%22%5D+%5B%22leaving%22%5D+%5B%5D+%5B%22and%22%5D+%5B%22they%22%5D+%5B%22were%22%5D+%5B%22crying%22%5D+%5B%22and%22%5D+%5B%5D%7B0%2C3%7D+%5B%22and%22%5D+%5B%22I%22%5D+%5B%22ran%22%5D+%5B%22after%22%5D+%5B%22them%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'I remember crying after them leaving , and they were crying and – and I ran after them . ', 'right': '', 'complete_match': 'I remember crying after them leaving , and they were crying and – and I ran after them . ', 'testimony_id': 'irn41496', 'shelfmark': ['USHMM RG-50.030*0547'], 'token_start': 2698, 'token_end': 2717}


In [28]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_2 = {
    'original_sentence': "the only time I have ever seen my father cry except for when I left ",
    'label': "(..)the only time I have ever seen my father cry except for when I left (..).",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_2['original_sentence'])
fragment_2.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_2)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22the%22%5D+%5B%22only%22%5D+%5B%22time%22%5D+%5B%22I%22%5D+%5B%22have%22%5D+%5B%22ever%22%5D+%5B%22seen%22%5D+%5B%22my%22%5D+%5B%22father%22%5D+%5B%22cry%22%5D+%5B%22except%22%5D+%5B%22for%22%5D+%5B%22when%22%5D+%5B%22I%22%5D+%5B%22left%22%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'the only time I have ever seen my father cry except for when I left ', 'right': '', 'complete_match': 'the only time I have ever seen my father cry except for when I left ', 'testimony_id': 'irn507583', 'shelfmark': ['USHMM RG-50.106*0086'], 'token_start': 5397, 'token_end': 5412}


In [29]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_3 = {
    'original_sentence': " started crying tremendously, somebody from the family, that she's leaving, and nobody will see her",
    'label': "(..)started crying tremendously, somebody from the family, that she's leaving, and nobody will see her (..)",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_3['original_sentence'])
fragment_3.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_3)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22started%22%5D+%5B%22crying%22%5D+%5B%22tremendously%22%5D+%5B%5D+%5B%22somebody%22%5D+%5B%22from%22%5D+%5B%22the%22%5D+%5B%22family%22%5D+%5B%5D+%5B%22that%22%5D+%5B%22she%22%5D+%5B%5D+%5B%22leaving%22%5D+%5B%5D+%5B%22and%22%5D+%5B%22nobody%22%5D+%5B%22will%22%5D+%5B%22see%22%5D+%5B%22her%22%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': "started crying tremendously , somebody from the family , that she 's leaving , and nobody will see her ", 'right': '', 'complete_match': "started crying tremendously , somebody from the family , that she 's leaving , and nobody will see her ", 'testimony_id': 'usc_shoah_11144', 'shelfmark': ['USC Shoah Foundation 11144'], 'token_start': 2934, 'token_end': 2953}


In [30]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_4 = {
    'original_sentence': "And Mother and Father bravely smiled at us until the time came for the train to leave. And I cried impulsively,",
    'label': "And Mother and Father bravely smiled at us until the time came for the train to leave. And I cried impulsively (..).",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_4['original_sentence'])
fragment_4.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_4)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22And%22%5D+%5B%22Mother%22%5D+%5B%22and%22%5D+%5B%22Father%22%5D+%5B%22bravely%22%5D+%5B%22smiled%22%5D+%5B%22at%22%5D+%5B%22us%22%5D+%5B%22until%22%5D+%5B%22the%22%5D+%5B%22time%22%5D+%5B%22came%22%5D+%5B%22for%22%5D+%5B%22the%22%5D+%5B%22train%22%5D+%5B%22to%22%5D+%5B%22leave%22%5D+%5B%5D+%5B%22And%22%5D+%5B%22I%22%5D+%5B%22cried%22%5D+%5B%22impulsively%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'And Mother and Father bravely smiled at us until the time came for the train to leave . And I cried impulsively , ', 'right': '', 'complete_match': 'And Mother and Father bravely smiled at us until the time came for the train to leave . And I cried impulsively , ', 'testimony_id': 'usc_shoah_21626', 'shelfmark': ['USC Shoah Foundation 21626'], 'token_start': 6881, 'token_end': 6904}


In [31]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_5 = {
    'original_sentence': "And we were leaving, and then Laura turned around and, crying, ran again into his arms, saying-- kissing him.",
    'label': "And we were leaving, and then Laura turned around and, crying, ran again into his arms, saying-- kissing him.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_5['original_sentence'])
fragment_5.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_5)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22And%22%5D+%5B%22we%22%5D+%5B%22were%22%5D+%5B%22leaving%22%5D+%5B%5D+%5B%22and%22%5D+%5B%22then%22%5D+%5B%22Laura%22%5D+%5B%22turned%22%5D+%5B%22around%22%5D+%5B%22and%22%5D+%5B%5D+%5B%22crying%22%5D+%5B%5D+%5B%22ran%22%5D+%5B%22again%22%5D+%5B%22into%22%5D+%5B%22his%22%5D+%5B%22arms%22%5D+%5B%5D+%5B%22saying%22%5D+%5B%5D%7B0%2C3%7D+%5B%22kissing%22%5D+%5B%22him%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'And we were leaving , and then Laura turned around and , crying , ran again into his arms , saying -- kissing him . ', 'right': '', 'complete_match': 'And we were leaving , and then Laura turned around and , crying , ran again into his arms , saying -- kissing him . ', 'testimony_id': 'usc_shoah_7455', 'shelfmark': ['USC Shoah Foundation 7455'], 'token_start': 20875, 'token_end': 20900}


In [32]:
# Store the collected fragments as a new mid node under the main node.
add_testimonial_fragments(fragments)

### 3.  

In [33]:
# Lemmas that must co-occur in a passage; input to create_contextual_query.
lemmas = ["leave","kill"]

In [34]:
# Both lemmas in either order, separated by a `[]{0,15}` gap.
query = create_contextual_query(lemmas,context_length=15)
print (query)

([lemma="leave"][]{0,15}[lemma="kill"])|([lemma="kill"][]{0,15}[lemma="leave"])


In [35]:
# Used below as the label of the mid node that groups this set of fragments.
domain_term = "kill"

In [36]:
# Container handed to add_testimonial_fragments once the leaves are collected.
fragments = {
    'main_node': main_node,
    'mid_node': domain_term,
    'fragments': [],
}

In [37]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_1 = {
    'original_sentence': "All the Jews got killed. The Jews in Norden, in Holland, had almost nobody left.",
    'label': "All the Jews got killed. The Jews in Norden, in Holland, had almost nobody left.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_1['original_sentence'])
fragment_1.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_1)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22All%22%5D+%5B%22the%22%5D+%5B%22Jews%22%5D+%5B%22got%22%5D+%5B%22killed%22%5D+%5B%5D+%5B%22The%22%5D+%5B%22Jews%22%5D+%5B%22in%22%5D+%5B%22Norden%22%5D+%5B%5D+%5B%22in%22%5D+%5B%22Holland%22%5D+%5B%5D+%5B%22had%22%5D+%5B%22almost%22%5D+%5B%22nobody%22%5D+%5B%22left%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'All the Jews got killed . The Jews in Norden , in Holland , had almost nobody left . ', 'right': '', 'complete_match': 'All the Jews got killed . The Jews in Norden , in Holland , had almost nobody left . ', 'testimony_id': 'HVT-42', 'shelfmark': ['Fortunoff Archive HVT-42'], 'token_start': 10557, 'token_end': 10576}


In [38]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_2 = {
    'original_sentence': "The one who had in Poland, they all been killed. There was just nobody left but me.",
    'label': "(..)they all been killed. There was just nobody left but me.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_2['original_sentence'])
fragment_2.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_2)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22The%22%5D+%5B%22one%22%5D+%5B%22who%22%5D+%5B%22had%22%5D+%5B%22in%22%5D+%5B%22Poland%22%5D+%5B%5D+%5B%22they%22%5D+%5B%22all%22%5D+%5B%22been%22%5D+%5B%22killed%22%5D+%5B%5D+%5B%22There%22%5D+%5B%22was%22%5D+%5B%22just%22%5D+%5B%22nobody%22%5D+%5B%22left%22%5D+%5B%22but%22%5D+%5B%22me%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'The one who had in Poland , they all been killed . There was just nobody left but me . ', 'right': '', 'complete_match': 'The one who had in Poland , they all been killed . There was just nobody left but me . ', 'testimony_id': 'irn504565', 'shelfmark': ['USHMM RG-50.030*0069'], 'token_start': 448, 'token_end': 468}


In [39]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_3 = {
    'original_sentence': "And they were killed. So from all our group, there were four left.",
    'label': "And they were killed. So from all our group, there were four left.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_3['original_sentence'])
fragment_3.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_3)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22And%22%5D+%5B%22they%22%5D+%5B%22were%22%5D+%5B%22killed%22%5D+%5B%5D+%5B%22So%22%5D+%5B%22from%22%5D+%5B%22all%22%5D+%5B%22our%22%5D+%5B%22group%22%5D+%5B%5D+%5B%22there%22%5D+%5B%22were%22%5D+%5B%22four%22%5D+%5B%22left%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'And they were killed . So from all our group , there were four left . ', 'right': '', 'complete_match': 'And they were killed . So from all our group , there were four left . ', 'testimony_id': 'irn504693', 'shelfmark': ['USHMM RG-50.030*0199'], 'token_start': 38370, 'token_end': 38386}


In [40]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_4 = {
    'original_sentence': "And there was nobody left. Everybody was killed that night.",
    'label': "And there was nobody left. Everybody was killed that night.",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_4['original_sentence'])
fragment_4.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_4)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22And%22%5D+%5B%22there%22%5D+%5B%22was%22%5D+%5B%22nobody%22%5D+%5B%22left%22%5D+%5B%5D+%5B%22Everybody%22%5D+%5B%22was%22%5D+%5B%22killed%22%5D+%5B%22that%22%5D+%5B%22night%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'And there was nobody left . Everybody was killed that night . ', 'right': '', 'complete_match': 'And there was nobody left . Everybody was killed that night . ', 'testimony_id': 'usc_shoah_2316', 'shelfmark': ['USC Shoah Foundation 2316'], 'token_start': 15159, 'token_end': 15171}


In [41]:
# Passage to look up in the corpus and the label stored with the fragment.
fragment_5 = {
    'original_sentence': " It was horrible. Because your whole family was killed and you have nobody left",
    'label': "It was horrible. Because your whole family was killed and you have nobody left (..).",
}
# indices is (start sentence index, end sentence index, testimony id).
indices = find_sentence_id(fragment_5['original_sentence'])
fragment_5.update({
    'start_sentence_index': indices[0],
    'end_sentence_index': indices[1],
    'media_offset': 0,
    'media_index': 0,
    'testimony_id': indices[2],
})
fragments['fragments'].append(fragment_5)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22It%22%5D+%5B%22was%22%5D+%5B%22horrible%22%5D+%5B%5D+%5B%22Because%22%5D+%5B%22your%22%5D+%5B%22whole%22%5D+%5B%22family%22%5D+%5B%22was%22%5D+%5B%22killed%22%5D+%5B%22and%22%5D+%5B%22you%22%5D+%5B%22have%22%5D+%5B%22nobody%22%5D+%5B%22left%22%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'It was horrible . Because your whole family was killed and you have nobody left ', 'right': '', 'complete_match': 'It was horrible . Because your whole family was killed and you have nobody left ', 'testimony_id': 'irn506671', 'shelfmark': ['USHMM RG-50.549.02*0014'], 'token_start': 22428, 'token_end': 22443}


In [42]:
# Store the collected fragments as a new mid node under the main node.
add_testimonial_fragments(fragments)