# Mining testimonial fragments of the Holocaust

**Experience domain:**

### Load the necessary libraries

In [1]:
import sys; sys.path.insert(0, '..')
import itertools

In [2]:
import get_topic_model_concordance as topic_concordancer
from utils import blacklab, db, text
mongo = db.get_db()

In [3]:
%config Completer.use_jedi = False
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import random

### Helper functions

In [4]:
def create_contextual_query(lemmas, context_length=50):
    """Build a token query matching all ``lemmas`` in any order.

    Each ordering of ``lemmas`` becomes one parenthesised alternative in
    which consecutive ``[lemma="..."]`` tokens are separated by a gap of
    zero to ``context_length`` arbitrary tokens. The alternatives are
    joined with ``|`` and returned as a single string.
    """
    # The gap between two consecutive lemmas is the same in every ordering.
    gap = '[]{0,' + str(context_length) + '}'
    alternatives = []
    for ordering in itertools.permutations(lemmas, len(lemmas)):
        lemma_tokens = ['[lemma="' + lemma + '"]' for lemma in ordering]
        alternatives.append('(' + gap.join(lemma_tokens) + ')')
    return '|'.join(alternatives)
        
        
            

In [5]:
from utils import blacklab, db, text
import requests
import json
def find_sentence_id(label):
    """Find a quoted passage in the corpus and return the sentences it spans.

    The passage is tokenized by the annotation server on localhost:9000,
    converted into a token-level query, and searched with
    ``blacklab.search_blacklab``. The first hit is mapped to sentence
    indices through the ``tokens`` collection of the database.

    Args:
        label: Text of the passage to look up.

    Returns:
        A ``(start_sentence_index, end_sentence_index, testimony_id)`` tuple
        for the first hit, or ``None`` when the lookup fails (the query and
        the error are printed in that case).
    """
    props = {'annotators': 'tokenize'}
    r = requests.post('http://localhost:9000/',
                      params={'properties': json.dumps(props)},
                      data=label.encode('utf-8'))
    # The encoding must be set on the response object; setting it on the
    # requests module has no effect on how r.text is decoded.
    r.encoding = 'utf-8'
    # json.loads() does not accept an ``encoding`` argument on Python 3.9+.
    result = json.loads(r.text)

    annotated = result['tokens']
    last = len(annotated) - 1
    # Tokens containing one of these are matched by a one-token wildcard
    # instead of their literal form.
    single_wildcards = ("n't", "'re", "?", ".", "'s", ",")
    query = []
    for i, token in enumerate(annotated):
        word = token['word']
        if '...' in word:
            # An ellipsis at either end of the passage is dropped; one in
            # the middle stands for a gap of up to 50 tokens.
            if i not in (0, last):
                query.append('[]{0,50}')
        elif '-' in word:
            query.append('[]{0,3}')
        elif any(marker in word for marker in single_wildcards):
            query.append('[]')
        else:
            query.append('["' + word + '"]')

    query = ' '.join(query)
    try:
        sentence = blacklab.search_blacklab(query, window=0,
                                            lemma=False,
                                            include_match=True)
        hit = sentence[0]
        token_end = hit['token_end']
        token_start = hit['token_start']
        print (hit)
        database = db.get_db()
        results = database.tokens.find({'testimony_id': hit['testimony_id']},
                                       {'_id': 0})
        testimony_tokens = list(results)[0]['tokens']
        # token_end is exclusive (token_end - token_start equals the number
        # of matched tokens), so the last matched token sits at
        # token_end - 1. Indexing token_end itself would read the token
        # after the match, i.e. the next sentence whenever the match ends a
        # sentence.
        last_token = max(token_start, token_end - 1)
        sentence_start = testimony_tokens[token_start]['sentence_index']
        sentence_end = testimony_tokens[last_token]['sentence_index']
        return (sentence_start, sentence_end, hit['testimony_id'])
    except Exception as exc:
        # Best effort: report the failing query and its cause, then signal
        # the failure to the caller with None.
        print("The following query returned a null result")
        print(query)
        print(repr(exc))
        return None
        
            


In [6]:
def create_parent_node(label):
    """Assemble a new top-level fragment document labelled ``label``.

    The returned dict holds the label, a random ``essay_id`` between 1 and
    20, and under ``tree`` a parent node without children produced by
    ``get_node``.
    """
    testimony_id = random.randint(1, 20)
    essay_id = random.randint(1, 20)
    tree = get_node(testimony_id, {'label': label}, is_parent=True)
    tree['label'] = label
    return {'label': label, 'essay_id': essay_id, 'tree': tree}

In [7]:
def get_node(testimony_id, node, is_parent=False):
    """Return a tree node dict built from ``node``.

    A leaf node (the default) copies ``label`` and ``testimony_id`` from
    ``node`` and converts its media and sentence fields to floats. A parent
    node only takes ``label`` from ``node``; its remaining fields are random
    integers between 1 and 20. Both kinds start with an empty ``children``
    list. ``testimony_id`` is accepted but not used.
    """
    numeric_fields = ('media_index', 'media_offset',
                      'start_sentence_index', 'end_sentence_index')

    if not is_parent:
        leaf = {'label': node['label'],
                'testimony_id': node['testimony_id']}
        for field in numeric_fields:
            leaf[field] = float(node[field])
        leaf['children'] = []
        return leaf

    parent = {'label': node['label']}
    for field in ('testimony_id',) + numeric_fields:
        parent[field] = random.randint(1, 20)
    parent['children'] = []
    return parent

In [8]:
def check_if_main_node_exist(node):
    """Return True when a fragment document labelled ``node`` is stored.

    Args:
        node: Label of the main node to look for in ``mongo.fragments``.

    Returns:
        ``True`` if a document with that label exists, ``False`` otherwise.
    """
    # Indexing the cursor returned by find() raises IndexError when nothing
    # matches, so a missing node could never be reported as False.
    # find_one() returns None on a miss instead.
    return mongo.fragments.find_one({'label': node}, {'_id': 0}) is not None

In [9]:
def add_main_node(label):
    """Insert a new main node document labelled ``label`` into the store.

    The document is produced by ``create_parent_node`` and written to
    ``mongo.fragments``.
    """
    # insert_one() replaces the deprecated Collection.insert() and matches
    # the delete_one()/replace_one() calls used elsewhere in this notebook.
    mongo.fragments.insert_one(create_parent_node(label))

In [10]:
def delete_main_node(label):
    """Remove one document labelled ``label`` from ``mongo.fragments``."""
    selector = {'label': label}
    mongo.fragments.delete_one(selector)

In [11]:
def add_testimonial_fragments(fragments):
    """Attach a mid node and its leaf fragments to a stored main node.

    ``fragments`` provides the main node label (``main_node``), the label of
    the mid node to create (``mid_node``) and the list of leaf descriptions
    (``fragments``). Nothing is written when the main node is not found or
    when it already has a child carrying the mid node label.
    """
    if not check_if_main_node_exist(fragments['main_node']):
        return

    selector = {'label': fragments['main_node']}
    document = mongo.fragments.find(selector, {'_id': 0})[0]
    siblings = document['tree']['children']
    taken_labels = [child['label'] for child in siblings]
    if fragments['mid_node'] in taken_labels:
        print ("mid node exists cannot be added")
        return

    mid_node = get_node('r', {'label': fragments['mid_node']}, is_parent=True)
    mid_node['children'].extend(
        get_node(entry['testimony_id'], entry)
        for entry in fragments['fragments']
    )
    siblings.append(mid_node)
    mongo.fragments.replace_one({'label': fragments['main_node']}, document)

### Add the main node

In [12]:
# Reset the main node: delete a stored document with this label, then insert
# a fresh parent node without children so the fragments below start clean.
main_node = "laugh"
delete_main_node(main_node)
add_main_node(main_node)

  


### Set up the query

# Token query matching every token whose lemma is "laugh".
query = '[lemma="laugh"]'

# Run the topic-model concordancer on the query.
# NOTE(review): window and topicn presumably set the context size and the
# number of topics — confirm in get_topic_model_concordance.
result = topic_concordancer.main(query,window=25,topicn=25)

### Print the key topics

# For every topic, print its position in result['topic_documents'] followed by
# the entry at index 1 of its 'topic_words' and a blank separator.
for i,element in enumerate(result['topic_documents']):
    print (i)
    topic_words =  element['topic_words'][1]
    print (topic_words)
    print ('\n')

### Analyze documents

# Index of the topic whose documents are inspected.
i=10
# Print the matched words of the first 25 texts of that topic. The loop
# variable is called `document` so that it does not rebind `text`, the module
# imported from utils at the top of the notebook.
for document in result['topic_documents'][i]['texts'][0:25]:
    print (document['matched_text_words'])
    print ('\n')

## Testimonial fragments

### 1.  

In [13]:
# Lemmas that must co-occur, in either order, in the contextual query.
lemmas = ["laugh","beat"]

In [14]:
# Build the co-occurrence query, allowing at most 25 tokens between the lemmas.
query = create_contextual_query(lemmas,context_length=25)
print (query)

([lemma="laugh"][]{0,25}[lemma="beat"])|([lemma="beat"][]{0,25}[lemma="laugh"])


In [15]:
# Label of the mid node under which the fragments collected below are grouped.
domain_term = "beat"

In [16]:
# Container handed to add_testimonial_fragments: the main node label, the mid
# node label and the (initially empty) list of leaf fragments.
fragments = {
    'main_node': main_node,
    'mid_node': domain_term,
    'fragments': [],
}

In [17]:
# Fragment 1: quoted passage plus the shortened label shown for it.
fragment_1 = {
    'original_sentence': "There were some brown shirts and they were beating up an old Jewish man with a long beard and they were beating him up, and there was people standing around laughing and applauding and, as I say, it was sort of like a Roman circus sort of atmosphere that night.",
    'label': "(..) they were beating up an old Jewish man with a long beard and they were beating him up, and there was people standing around laughing and applauding (..)",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_1['original_sentence'])
fragment_1.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_1)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22There%22%5D+%5B%22were%22%5D+%5B%22some%22%5D+%5B%22brown%22%5D+%5B%22shirts%22%5D+%5B%22and%22%5D+%5B%22they%22%5D+%5B%22were%22%5D+%5B%22beating%22%5D+%5B%22up%22%5D+%5B%22an%22%5D+%5B%22old%22%5D+%5B%22Jewish%22%5D+%5B%22man%22%5D+%5B%22with%22%5D+%5B%22a%22%5D+%5B%22long%22%5D+%5B%22beard%22%5D+%5B%22and%22%5D+%5B%22they%22%5D+%5B%22were%22%5D+%5B%22beating%22%5D+%5B%22him%22%5D+%5B%22up%22%5D+%5B%5D+%5B%22and%22%5D+%5B%22there%22%5D+%5B%22was%22%5D+%5B%22people%22%5D+%5B%22standing%22%5D+%5B%22around%22%5D+%5B%22laughing%22%5D+%5B%22and%22%5D+%5B%22applauding%22%5D+%5B%22and%22%5D+%5B%5D+%5B%22as%22%5D+%5B%22I%22%5D+%5B%22say%22%5D+%5B%5D+%5B%22it%22%5D+%5B%22was%22%5D+%5B%22sort%22%5D+%5B%22of%22%5D+%5B%22like%22%5D+%5B%22a%22%5D+%5B%22Roman%22%5D+%5B%22circus%22%5D+%5B%22sort%22%5D+%5B%22of%22%5D+%5B%22atmosphere%22%5D+%5B%22that%22%5D+%5B%22night%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&words

In [18]:
# Fragment 2: quoted passage plus the label shown for it.
fragment_2 = {
    'original_sentence': "And they were beating us and laughing about us.",
    'label': " And they were beating us and laughing about us.",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_2['original_sentence'])
fragment_2.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_2)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22And%22%5D+%5B%22they%22%5D+%5B%22were%22%5D+%5B%22beating%22%5D+%5B%22us%22%5D+%5B%22and%22%5D+%5B%22laughing%22%5D+%5B%22about%22%5D+%5B%22us%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'And they were beating us and laughing about us . ', 'right': '', 'complete_match': 'And they were beating us and laughing about us . ', 'testimony_id': 'usc_shoah_1537', 'shelfmark': ['USC Shoah Foundation 1537'], 'token_start': 4480, 'token_end': 4490}


In [19]:
# Fragment 3: quoted passage plus the label shown for it.
fragment_3 = {
    'original_sentence': "And where would they beat them mostly? On their sex organs, where it hurts the most. And they would stand there and laugh.",
    'label': "And where would they beat them mostly? On their sex organs, where it hurts the most. And they would stand there and laugh.",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_3['original_sentence'])
fragment_3.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_3)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22And%22%5D+%5B%22where%22%5D+%5B%22would%22%5D+%5B%22they%22%5D+%5B%22beat%22%5D+%5B%22them%22%5D+%5B%22mostly%22%5D+%5B%5D+%5B%22On%22%5D+%5B%22their%22%5D+%5B%22sex%22%5D+%5B%22organs%22%5D+%5B%5D+%5B%22where%22%5D+%5B%22it%22%5D+%5B%22hurts%22%5D+%5B%22the%22%5D+%5B%22most%22%5D+%5B%5D+%5B%22And%22%5D+%5B%22they%22%5D+%5B%22would%22%5D+%5B%22stand%22%5D+%5B%22there%22%5D+%5B%22and%22%5D+%5B%22laugh%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'And where would they beat them mostly ? On their sex organs , where it hurts the most . And they would stand there and laugh . ', 'right': '', 'complete_match': 'And where would they beat them mostly ? On their sex organs , where it hurts the most . And they would stand there and laugh . ', 'testimony_id': 'irn505558', 'shelfmark': ['USHMM RG-50.042*0004'], 'token_start': 11655, 'token_end': 11682}


In [20]:
# Fragment 4: quoted passage plus the shortened label shown for it.
fragment_4 = {
    'original_sentence': "as more they beat you with the back of a rifle into the small of your back, the more they were laughing",
    'label': "(..) as more they beat you with the back of a rifle into the small of your back, the more they were laughing",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_4['original_sentence'])
fragment_4.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_4)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22as%22%5D+%5B%22more%22%5D+%5B%22they%22%5D+%5B%22beat%22%5D+%5B%22you%22%5D+%5B%22with%22%5D+%5B%22the%22%5D+%5B%22back%22%5D+%5B%22of%22%5D+%5B%22a%22%5D+%5B%22rifle%22%5D+%5B%22into%22%5D+%5B%22the%22%5D+%5B%22small%22%5D+%5B%22of%22%5D+%5B%22your%22%5D+%5B%22back%22%5D+%5B%5D+%5B%22the%22%5D+%5B%22more%22%5D+%5B%22they%22%5D+%5B%22were%22%5D+%5B%22laughing%22%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'as more they beat you with the back of a rifle into the small of your back , the more they were laughing ', 'right': '', 'complete_match': 'as more they beat you with the back of a rifle into the small of your back , the more they were laughing ', 'testimony_id': 'irn505558', 'shelfmark': ['USHMM RG-50.042*0004'], 'token_start': 15494, 'token_end': 15517}


In [21]:
# Store the collected fragments as a new mid node under the main node.
add_testimonial_fragments(fragments)

### 2.  

In [22]:
# Lemmas that must co-occur, in either order, in the contextual query.
lemmas = ["not","laugh"]

In [23]:
# Build the co-occurrence query, allowing at most one token between the lemmas.
query = create_contextual_query(lemmas,context_length=1)
print (query)

([lemma="not"][]{0,1}[lemma="laugh"])|([lemma="laugh"][]{0,1}[lemma="not"])


In [24]:
# Label of the mid node under which the fragments collected below are grouped.
domain_term = "cannot"

In [25]:
# Container handed to add_testimonial_fragments: the main node label, the mid
# node label and the (initially empty) list of leaf fragments.
fragments = {
    'main_node': main_node,
    'mid_node': domain_term,
    'fragments': [],
}

In [26]:
# Fragment 1: quoted passage plus the label shown for it.
fragment_1 = {
    'original_sentence': "So he killed my father right in front of me. And I froze. I, I couldn't cry. I couldn't laugh. I couldn't do anything.",
    'label': "So he killed my father right in front of me. And I froze. I, I couldn't cry. I couldn't laugh. I couldn't do anything.",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_1['original_sentence'])
fragment_1.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_1)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22So%22%5D+%5B%22he%22%5D+%5B%22killed%22%5D+%5B%22my%22%5D+%5B%22father%22%5D+%5B%22right%22%5D+%5B%22in%22%5D+%5B%22front%22%5D+%5B%22of%22%5D+%5B%22me%22%5D+%5B%5D+%5B%22And%22%5D+%5B%22I%22%5D+%5B%22froze%22%5D+%5B%5D+%5B%22I%22%5D+%5B%5D+%5B%22I%22%5D+%5B%22could%22%5D+%5B%5D+%5B%22cry%22%5D+%5B%5D+%5B%22I%22%5D+%5B%22could%22%5D+%5B%5D+%5B%22laugh%22%5D+%5B%5D+%5B%22I%22%5D+%5B%22could%22%5D+%5B%5D+%5B%22do%22%5D+%5B%22anything%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': "So he killed my father right in front of me . And I froze . I , I could n't cry . I could n't laugh . I could n't do anything . ", 'right': '', 'complete_match': "So he killed my father right in front of me . And I froze . I , I could n't cry . I could n't laugh . I could n't do anything . ", 'testimony_id': 'usc_shoah_8002', 'shelfmark': ['USC Shoah Foundation 8002'], 'token_start': 2599

In [27]:
# Fragment 2: quoted passage plus the label shown for it.
fragment_2 = {
    'original_sentence': "Like I say, now I can laugh, because at that time I couldn’t laugh. Because it hurt us.",
    'label': "Like I say, now I can laugh, because at that time I couldn’t laugh. Because it hurt us.",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_2['original_sentence'])
fragment_2.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_2)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22Like%22%5D+%5B%22I%22%5D+%5B%22say%22%5D+%5B%5D+%5B%22now%22%5D+%5B%22I%22%5D+%5B%22can%22%5D+%5B%22laugh%22%5D+%5B%5D+%5B%22because%22%5D+%5B%22at%22%5D+%5B%22that%22%5D+%5B%22time%22%5D+%5B%22I%22%5D+%5B%22could%22%5D+%5B%5D+%5B%22laugh%22%5D+%5B%5D+%5B%22Because%22%5D+%5B%22it%22%5D+%5B%22hurt%22%5D+%5B%22us%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'Like I say , now I can laugh , because at that time I could n’t laugh . Because it hurt us . ', 'right': '', 'complete_match': 'Like I say , now I can laugh , because at that time I could n’t laugh . Because it hurt us . ', 'testimony_id': 'irn510703', 'shelfmark': ['USHMM RG-50.156*0049'], 'token_start': 21122, 'token_end': 21145}


In [28]:
# Fragment 3: quoted passage plus the shortened label shown for it.
fragment_3 = {
    'original_sentence': "I cannot laugh as wholeheartedly as anyone else can laugh",
    'label': "I cannot laugh as wholeheartedly as anyone else can laugh (..)",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_3['original_sentence'])
fragment_3.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_3)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22I%22%5D+%5B%22can%22%5D+%5B%22not%22%5D+%5B%22laugh%22%5D+%5B%22as%22%5D+%5B%22wholeheartedly%22%5D+%5B%22as%22%5D+%5B%22anyone%22%5D+%5B%22else%22%5D+%5B%22can%22%5D+%5B%22laugh%22%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': 'I can not laugh as wholeheartedly as anyone else can laugh ', 'right': '', 'complete_match': 'I can not laugh as wholeheartedly as anyone else can laugh ', 'testimony_id': 'HVT-43', 'shelfmark': ['Fortunoff Archive HVT-43'], 'token_start': 28946, 'token_end': 28957}


In [29]:
# Fragment 4: quoted passage plus the label shown for it.
fragment_4 = {
    'original_sentence': "A cheerful movie might bring a little grin on my face, but I haven't heard myself heartily laughing.",
    'label': " A cheerful movie might bring a little grin on my face, but I haven't heard myself heartily laughing.",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_4['original_sentence'])
fragment_4.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_4)


http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22A%22%5D+%5B%22cheerful%22%5D+%5B%22movie%22%5D+%5B%22might%22%5D+%5B%22bring%22%5D+%5B%22a%22%5D+%5B%22little%22%5D+%5B%22grin%22%5D+%5B%22on%22%5D+%5B%22my%22%5D+%5B%22face%22%5D+%5B%5D+%5B%22but%22%5D+%5B%22I%22%5D+%5B%22have%22%5D+%5B%5D+%5B%22heard%22%5D+%5B%22myself%22%5D+%5B%22heartily%22%5D+%5B%22laughing%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': "A cheerful movie might bring a little grin on my face , but I have n't heard myself heartily laughing . ", 'right': '', 'complete_match': "A cheerful movie might bring a little grin on my face , but I have n't heard myself heartily laughing . ", 'testimony_id': 'HVT-44', 'shelfmark': ['Fortunoff Archive HVT-44'], 'token_start': 11064, 'token_end': 11085}


In [30]:
# Fragment 5: quoted passage plus the shortened label shown for it.
fragment_5 = {
    'original_sentence': "there were these moments that there is some humor to it in a way, but I didn't laugh at the time.",
    'label': "(..) there were these moments that there is some humor to it in a way, but I didn't laugh at the time.",
}
# Resolve the passage to (start sentence, end sentence, testimony id).
indices = find_sentence_id(fragment_5['original_sentence'])
fragment_5.update(
    start_sentence_index=indices[0],
    end_sentence_index=indices[1],
    media_offset=0,
    media_index=0,
    testimony_id=indices[2],
)
fragments['fragments'].append(fragment_5)

http://localhost:8080/blacklab-server-2.1.0/lts/hits?patt=%5B%22there%22%5D+%5B%22were%22%5D+%5B%22these%22%5D+%5B%22moments%22%5D+%5B%22that%22%5D+%5B%22there%22%5D+%5B%22is%22%5D+%5B%22some%22%5D+%5B%22humor%22%5D+%5B%22to%22%5D+%5B%22it%22%5D+%5B%22in%22%5D+%5B%22a%22%5D+%5B%22way%22%5D+%5B%5D+%5B%22but%22%5D+%5B%22I%22%5D+%5B%22did%22%5D+%5B%5D+%5B%22laugh%22%5D+%5B%22at%22%5D+%5B%22the%22%5D+%5B%22time%22%5D+%5B%5D&waitfortotal=true&outputformat=json&prettyprint=no&wordsaroundhit=0
{'left': '', 'match_word': "there were these moments that there is some humor to it in a way , but I did n't laugh at the time . ", 'right': '', 'complete_match': "there were these moments that there is some humor to it in a way , but I did n't laugh at the time . ", 'testimony_id': 'irn504849', 'shelfmark': ['USHMM RG-50.030*0356'], 'token_start': 5226, 'token_end': 5250}


In [31]:
# Store the collected fragments as a new mid node under the main node.
add_testimonial_fragments(fragments)