# Analyse ORACC corpus

The functions in this notebook build a reference database that can then be used for intertextual searches. The notebook partially reuses a script created by Niek Veldhuis.

In [35]:
import os
import pandas as pd
import json
from gensim import corpora
import joblib
from collections import defaultdict
import editdistance
# from Levenshtein import distance
# NOTE: editdistance is slightly faster than Levenshtein --> applied here

In [3]:
# Base directory for all data paths: the current working directory of the kernel.
ROOT_PATH = os.getcwd()

# Directory with the per-project data; read by extract_jsons_from_project.
PROJECTS_DATA_PATH = os.path.join(ROOT_PATH, 'projectsdata')
# Default directory for the joblib files written and read by the save/load helpers.
CORPUS_PATH = os.path.join(ROOT_PATH, 'CORPUS')

## TODO list:

- create functions that extract data from JSON files
- create functions that provide data suitable for different types of intertextuality detection (all with a specified match length in "words"):
    - precise intertextuality in cuneiform
    - precise intertextuality in normalised form
    - intertextuality by lemma
    - intertextuality by lemma with Levenshtein distance
    - intertextuality by lemma used within a text

- A vectorized corpus will need to be created

In [4]:
def parse_dict(input_dict:dict):
    """
    Debugging helper: walk the nested 'cdl' containers of a JSON object and print every leaf.

    A child that is a dict with its own 'cdl' key is descended into; any other child is
    printed as ``finally <child>``.

    :param input_dict: JSON object with a 'cdl' key holding its children, either as a list
        (the shape parsejson relies on when it indexes ``text['cdl'][0]``) or as a dict.
    :raises KeyError: if ``input_dict`` has no 'cdl' key.
    """
    children = input_dict['cdl']
    # A dict of children is still accepted (its values are the child nodes); a list is used as is.
    if isinstance(children, dict):
        children = children.values()

    for JSONobject in children:
        # The isinstance guard keeps strings/numbers from being probed with `in` (substring test).
        if isinstance(JSONobject, dict) and 'cdl' in JSONobject:
            parse_dict(JSONobject)
        else:
            print('finally', JSONobject)

In [26]:
# def parsejson(text, parameters):
#     for JSONobject in text["cdl"]:
#         for JSONobject in JSONobject["cdl"]:
#             if "cdl" in JSONobject:
#                 parsejson(JSONobject, parameters)
#             if "label" in JSONobject:
#                 parameters["label"] = JSONobject['label']
#             if "f" in JSONobject:
#                 lemma = JSONobject["f"]
#                 lemma["id_word"] = JSONobject["ref"]
#                 lemma['label'] = parameters["label"]
#                 lemma["id_text"] = parameters["id_text"]
                
#                 try:
#                     text_forms.append(lemma['form'])
#                 except KeyError:
#                     text_forms.append('UNKNOWN')
#                     lemma['form'] = 'UNKNOWN'
#                 try:
#                     text_lemma.append(lemma['cf'])
#                 except KeyError:
#                     text_lemma.append('UNKNOWN')
#                     lemma['cf'] = 'UNKNOWN'
#                 try:
#                     text_normalised.append(lemma['norm'])
#                 except KeyError:
#                     text_normalised.append('UNKNOWN')
#                     lemma['norm'] = 'UNKNOWN'
                    
#                 lemm_l.append(lemma)
                
#             if "strict" in JSONobject and JSONobject["strict"] == "1":
#                 lemma = {key: JSONobject[key] for key in parameters["dollar_keys"]}
#                 lemma["id_word"] = JSONobject["ref"] + ".0"
#                 lemma["id_text"] = parameters["id_text"]
#                 #lemm_l.append(lemma)
#     return

In [5]:
def _collect_cdl_tokens(nodes, parameters, collected):
    """Walk a 'cdl' container depth-first in document order and append every word found to ``collected``."""
    if isinstance(nodes, dict):
        # A single node given instead of a list of nodes.
        nodes = [nodes]

    for node in nodes:
        if not isinstance(node, dict):
            continue

        if 'cdl' in node:
            # Descend into every branch, not only the last one, so that no token is dropped.
            _collect_cdl_tokens(node['cdl'], parameters, collected)

        if 'label' in node:
            # Remember the most recent label; it is attached to the words that follow.
            parameters['label'] = node['label']

        if 'f' in node:
            # Work on a copy so that the caller's JSON data is left unmodified.
            lemma = dict(node['f'])
            lemma['id_word'] = node['ref']
            lemma['label'] = parameters['label']
            lemma['id_text'] = parameters['id_text']

            # Missing fields are recorded as 'UNKNOWN' so that the three token lists
            # always stay aligned (one entry per word in each of them).
            for field, target in (('form', 'text_forms'), ('cf', 'text_lemma'), ('norm', 'text_normalised')):
                collected[target].append(lemma.setdefault(field, 'UNKNOWN'))

            collected['lemm_l'].append(lemma)


def parsejson(text, parameters):
    """
    Extract the words of one text from its nested 'cdl' JSON structure.

    The whole tree below ``text['cdl']`` is traversed in document order. Every node with
    an 'f' key is treated as a word; its 'form', 'cf' and 'norm' values are collected
    ('UNKNOWN' where a key is missing).

    :param text: JSON object of a text; must contain a 'cdl' key.
    :param parameters: dict with the keys 'label' and 'id_text'. 'label' is updated in place
        whenever a node with a 'label' key is met. 'dollar_keys' is accepted but currently
        unused: the record built from it for 'strict' nodes was never stored, so that
        computation (which could only raise KeyError) is no longer performed.
    :return: dict with 'text_forms', 'text_lemma', 'text_normalised' (parallel lists of str)
        and 'lemm_l' (list of per-word dicts extended with 'id_word', 'label', 'id_text').
    :raises KeyError: if ``text`` has no 'cdl' key or a word node has no 'ref' key.
    """
    collected = {'text_forms': [], 'text_lemma': [], 'text_normalised': [], 'lemm_l': []}
    _collect_cdl_tokens(text['cdl'], parameters, collected)
    return collected

In [6]:
def _list_json_files(directory):
    """Return the names of the '*.json' files in ``directory``, sorted for a reproducible order."""
    return sorted(name for name in os.listdir(directory) if name.endswith('.json'))


def _load_json_texts(directory, json_file_names, id_prefix, project_jsons, texts_with_errors):
    """Parse the given JSON files; store successes in ``project_jsons`` and failed ids in ``texts_with_errors``."""
    for json_file_name in json_file_names:
        # Strip the '.json' extension to obtain the text id.
        text_id = f'{id_prefix}/{json_file_name[:-5]}'
        with open(os.path.join(directory, json_file_name), 'r', encoding='utf-8') as json_file:
            try:
                json_data = json.load(json_file)
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
                # A broken file is only reported; nothing is stored under its id.
                texts_with_errors.append(text_id)
                continue
        project_jsons[text_id] = json_data


def extract_jsons_from_project(project_name:str):
    """
    Load all JSON texts of a project and of its direct subprojects.

    Texts are read from ``<PROJECTS_DATA_PATH>/<project_name>/corpusjson`` and from
    ``<PROJECTS_DATA_PATH>/<project_name>/<subproject>/corpusjson``. Entries of the project
    directory that have no 'corpusjson' directory are skipped, as are files that do not
    end in '.json'.

    :param project_name: name of the project directory inside PROJECTS_DATA_PATH.
    :return: tuple ``(project_jsons, texts_with_errors)``: a dict mapping text ids
        ('<project>/<file>' or '<project>/<subproject>/<file>', without '.json') to the
        parsed JSON, and a list with the ids of files that could not be parsed.
    :raises FileNotFoundError: if the project directory does not exist.
    """
    project_dir = os.path.join(PROJECTS_DATA_PATH, project_name)
    project_jsons = {}
    texts_with_errors = []

    # Texts stored directly in the project.
    own_corpus_dir = os.path.join(project_dir, 'corpusjson')
    if os.path.isdir(own_corpus_dir):
        own_json_files = _list_json_files(own_corpus_dir)
        if own_json_files:
            print(f'There are {len(own_json_files)} texts in {project_name} project')
            _load_json_texts(own_corpus_dir, own_json_files, project_name, project_jsons, texts_with_errors)

    # Texts stored in subprojects (one directory level below the project).
    for subproject in sorted(os.listdir(project_dir)):
        if subproject == 'corpusjson':
            continue
        sub_corpus_dir = os.path.join(project_dir, subproject, 'corpusjson')
        if not os.path.isdir(sub_corpus_dir):
            # A plain file, or a directory that holds no 'corpusjson' of its own.
            continue
        sub_json_files = _list_json_files(sub_corpus_dir)
        print(f'There are {len(sub_json_files)} texts in {project_name}-{subproject} subproject')
        _load_json_texts(sub_corpus_dir, sub_json_files, f'{project_name}/{subproject}', project_jsons, texts_with_errors)

    # TODO: solve problem with Idrimi and similar: subprojects of subprojects
    return project_jsons, texts_with_errors

In [7]:
# nere_jsons, texts_with_errors = extract_jsons_from_project('nere')

# print(texts_with_errors)

There are 1 texts in nere project
[]


In [8]:
# parameters = {"label": None, "id_text": None, "dollar_keys" : ["extent", "scope", "state"]}

# for text_id in nere_jsons:
#     parameters["id_text"] = text_id
    
#     text_analysed = parsejson(nere_jsons[text_id], parameters=parameters) 
    
#     # try:
#     #     text_analysed = parsejson(nere_jsons[text_id], parameters=parameters) 
#     # except:
#     #     # TODO: find out the problems with these texts!
#     #     print('ERROR with a text:', text_id)
        
#     print(text_analysed)

{'text_forms': ['zi-im-ri-lim', 'ri-im', 'tu-qu₂-um-tim', 'lu-na-i-id', 'qu₂-ra-da-am', 'a-na', 'di-ri', 'šu-ma-aš-šu', 'lu-uš-ta-aš-ni', 'zi-im-ri-li-im', 'a-pil', '{m}ia-ah-du-li-im', 'a-ša-re-ed', 'ha-na', 'mu-ʾa₄-ab-bi-it', 'du-ur', 'na-ak-ri-im', 'x', 'x', 'x', 'x', 'x', 'x', 'me-er', 'lu-ul-li', 'še₂₀-me-e', 'uṣ-ṣi₂-ra', 'a-wa-ti', 'mu-ur-ta-di-id', 'za-e-ra-am', 'mu-ka-an-ni-iš', 'na-ak-ri-šu', 'eṭ-lum', 'pe₂-ti-iu₂', '{giš}ŠUKUR', 'mu-ut-x-x-ki', 'a-hi-iz', 'ma-tim', 'e-li', 'ma-a-tim', 'zi-im-ri-li-im', 'pe₂-ti-iu₂', '{giš}ŠUKUR', 'mu-ut-x-x-ki', 'a-hi-iz', 'ma-tim', 'e-li', 'ma-a-tim', 'i-ŠA₃', 'i-ŠA₃', 'šu-qu₂-ri', 'ib-bu-u₂', 'DINGIR{meš}', 'šum-šu', 'zi-ik-ru', '{d}a-nim', 'li-te₉-li-il', 'ri-im', 'ma-ti-šu', 'zi-im-ri-li-im', 'ib-bu-u₂', 'DINGIR{meš}', 'šum-šu', 'zi-ik-ru', '{d}a-nim', 'li-te₉-li-il', 'ri-im', 'ma-ti-šu', 'LUGAL', 'ša-pe₂-e-em', 'lu-ša-ar-bi-ma', 'na-ak-ri', '{d}EN.LIL₂', 'iš-ta-ka-an', 'za-ri-šu', 'ib-bi-ri-it', 'ib-bi-ri-it', 'ha-bu-ur', 'u₃', 'pu-ra-an

In [109]:
' Transforming corpus for intertextuality queries. '

# TODO: words without annotation are all 'UNKNOWN', so unprocessed texts would match each other 100%. Do not solve this with stop words (removing tokens would change the window length); instead replace 'UNKNOWN' in the query document with 'UNKNOWN_QUERY' so that it never matches the corpus.

class OraccProjectCorpus:
    """
    Token lists of all texts of one project, extracted with parsejson.

    Attributes set by the constructor:
        corpus, texts, texts_data, size - the input mapping, its text ids, its values and its length
        Lemma, Forms, Normalised - one token list per successfully analysed text
        FullCorpus - {text_id: result of parsejson} for the successfully analysed texts
        TextsWithErrors - ids of the texts for which parsejson raised an exception
    """
    def __init__(self, json_corpus):
        """
        :param json_corpus: mapping {text_id: parsed JSON of the text}.
        """
        self.corpus = json_corpus
        self.texts = list(json_corpus)
        self.texts_data = [json_corpus[text_id] for text_id in json_corpus]
        self.size = len(json_corpus)

        analysed_corpus = self.AnalyseCorpus()

        self.Lemma = analysed_corpus['lemma']
        self.Forms = analysed_corpus['forms']
        self.Normalised = analysed_corpus['normalised']

        self.FullCorpus = analysed_corpus['corpus_data']
        self.TextsWithErrors = analysed_corpus['errors']

    def AnalyseCorpus(self) -> dict:
        """
        Run parsejson on every text of the project.

        A text that cannot be analysed is reported, listed under 'errors' and left out of
        all other results, so no data of another text is ever stored under its id.

        :return: dict with 'corpus_data' ({text_id: parsejson result}), 'forms', 'lemma' and
            'normalised' (lists with one token list per analysed text, in the order of
            self.texts) and 'errors' (ids of the texts that failed).
        """
        corpus_data = {}
        texts_with_errors = []

        full_corpus_forms = []
        full_corpus_lemma = []
        full_corpus_normalised = []

        for text_id in self.texts:
            # Fresh parameters per text, so the last label of one text cannot leak into the next.
            parameters = {'label': None, 'id_text': text_id, 'dollar_keys': ['extent', 'scope', 'state']}

            try:
                text_analysed = parsejson(self.corpus[text_id], parameters=parameters)
            except Exception as error:
                # TODO: find out the problems with these texts!
                print('ERROR with a text:', text_id, f'({error!r})')
                texts_with_errors.append(text_id)
                continue

            corpus_data[text_id] = text_analysed

            # These appends belong inside the loop: every text contributes its token lists.
            full_corpus_forms.append(text_analysed['text_forms'])
            full_corpus_lemma.append(text_analysed['text_lemma'])
            full_corpus_normalised.append(text_analysed['text_normalised'])

        return {'corpus_data': corpus_data,
                'forms': full_corpus_forms,
                'lemma': full_corpus_lemma,
                'normalised': full_corpus_normalised,
                'errors': texts_with_errors}
    
        
class OraccCorpus():
    """
    Searchable corpus built from one or more projects.

    Attributes set by the constructor:
        projects - the input mapping {project_name: {text_id: parsed JSON}}
        LemmaDict, FormsDict, NormalisedDict - gensim dictionaries (token <-> integer id)
        VectLemma, VectForms, VectNormalised - {text_id: bag of words (doc2bow)}
        VectLemmaStream, VectFormsStream, VectNormalisedStream - {text_id: token ids in text order (doc2idx)}
        TextsAssociatedToLemma, TextsAssociatedToForms, TextsAssociatedToNormalised -
            {token_id: [ids of the texts that contain the token]}
    """
    def __init__(self, input_projects:dict) -> None:
        """
        :param input_projects: mapping {project_name: {text_id: parsed JSON of the text}}.
        """
        self.projects = input_projects

        lemma_corpus = []
        form_corpus = []
        normalised_corpus = []

        # Analysed once here and handed on to VectorizeOracc, so no project is parsed twice.
        analysed_projects = {}

        for project_name, project_data in input_projects.items():
            print(project_name, 'is being processed for dictionary.')
            OPC_project = OraccProjectCorpus(json_corpus=project_data)
            analysed_projects[project_name] = OPC_project.FullCorpus

            # The dictionaries are built from exactly the token lists that are vectorised
            # later, so every token of every vectorised text receives an id.
            for text_data in OPC_project.FullCorpus.values():
                lemma_corpus.append(text_data['text_lemma'])
                form_corpus.append(text_data['text_forms'])
                normalised_corpus.append(text_data['text_normalised'])

        self.LemmaDict = corpora.Dictionary(lemma_corpus)
        self.FormsDict = corpora.Dictionary(form_corpus)
        self.NormalisedDict = corpora.Dictionary(normalised_corpus)

        vectors = self.VectorizeOracc(analysed_projects)

        self.VectLemma = vectors['vect_lemma']
        self.VectForms = vectors['vect_forms']
        self.VectNormalised = vectors['vect_norm']

        self.VectLemmaStream = vectors['vect_lemma_stream']
        self.VectFormsStream = vectors['vect_forms_stream']
        self.VectNormalisedStream = vectors['vect_norm_stream']

        self.TextsAssociatedToLemma = vectors['lemma_to_texts']
        self.TextsAssociatedToForms = vectors['forms_to_texts']
        self.TextsAssociatedToNormalised = vectors['norms_to_texts']


    def VectorizeOracc(self, analysed_projects:dict=None) -> dict:
        """
        Turn the token lists of all texts into integer vectors using the three dictionaries.

        :param analysed_projects: optional mapping {project_name: {text_id: parsejson result}};
            projects missing from it (or all projects when it is None) are analysed again
            from self.projects.
        :return: dict with the bag-of-words vectors ('vect_lemma', 'vect_forms', 'vect_norm'),
            the ordered id sequences ('vect_lemma_stream', 'vect_forms_stream',
            'vect_norm_stream') and the token-to-texts indexes ('lemma_to_texts',
            'forms_to_texts', 'norms_to_texts').
        """
        vectorized_texts_lemma = {}
        vectorized_texts_forms = {}
        vectorized_texts_normalised = {}

        vectorized_texts_lemma_stream = {}
        vectorized_texts_forms_stream = {}
        vectorized_texts_normalised_stream = {}

        for project_name, project_data in self.projects.items():
            print(project_name, 'is being vectorized.')
            if analysed_projects is not None and project_name in analysed_projects:
                analysed_project = analysed_projects[project_name]
            else:
                analysed_project = OraccProjectCorpus(json_corpus=project_data).FullCorpus

            for text_id, text_data in analysed_project.items():
                # The three mappings are always filled together, so one check covers them all.
                if text_id in vectorized_texts_lemma:
                    print('ERROR', text_id, 'is duplicit')
                    continue

                vectorized_texts_lemma[text_id] = self.LemmaDict.doc2bow(text_data['text_lemma'])
                vectorized_texts_lemma_stream[text_id] = self.LemmaDict.doc2idx(text_data['text_lemma'])

                vectorized_texts_forms[text_id] = self.FormsDict.doc2bow(text_data['text_forms'])
                vectorized_texts_forms_stream[text_id] = self.FormsDict.doc2idx(text_data['text_forms'])

                vectorized_texts_normalised[text_id] = self.NormalisedDict.doc2bow(text_data['text_normalised'])
                vectorized_texts_normalised_stream[text_id] = self.NormalisedDict.doc2idx(text_data['text_normalised'])

        # NOTE: we also make dictionary of tokens and list in which texts they appear. This helps to narrow down texts for possible intertextualities, and speeds up the process.
        lemma_to_texts = defaultdict(list)
        forms_to_texts = defaultdict(list)
        norms_to_texts = defaultdict(list)

        for text_id in vectorized_texts_lemma:
            # Each bag-of-words entry is a (token_id, count) pair; only the id is needed here.
            for token in vectorized_texts_lemma[text_id]:
                lemma_to_texts[token[0]].append(text_id)

            for token in vectorized_texts_forms[text_id]:
                forms_to_texts[token[0]].append(text_id)

            for token in vectorized_texts_normalised[text_id]:
                norms_to_texts[token[0]].append(text_id)

        return {'vect_lemma': vectorized_texts_lemma,
                'vect_forms': vectorized_texts_forms,
                'vect_norm': vectorized_texts_normalised,
                'vect_lemma_stream': vectorized_texts_lemma_stream,
                'vect_forms_stream': vectorized_texts_forms_stream,
                'vect_norm_stream': vectorized_texts_normalised_stream,
                'lemma_to_texts': lemma_to_texts,
                'forms_to_texts': forms_to_texts,
                'norms_to_texts': norms_to_texts}


    def save_full(self, save_name:str, save_path=CORPUS_PATH):
        """
        Dump the complete object (including self.projects) to '<save_path>/<save_name>.joblib'.

        :param save_name: file name without the '.joblib' extension.
        :param save_path: target directory; created if it does not exist.
        """
        os.makedirs(save_path, exist_ok=True)
        joblib.dump(self, os.path.join(save_path, f'{save_name}.joblib'))


    def save_corpus(self, corpus_name:str, save_path=CORPUS_PATH):
        """
        Dump the vectorised texts to '<save_path>/<corpus_name>.joblib'.

        The file holds a dict with the keys 'lemma', 'forms', 'norms', 'lemmaStream',
        'formStream' and 'normsStream'.

        :param corpus_name: file name without the '.joblib' extension.
        :param save_path: target directory; created if it does not exist.
        """
        corpus = {'lemma': self.VectLemma,
                  'forms': self.VectForms,
                  'norms': self.VectNormalised,
                  'lemmaStream': self.VectLemmaStream,
                  'formStream': self.VectFormsStream,
                  'normsStream': self.VectNormalisedStream}

        os.makedirs(save_path, exist_ok=True)
        joblib.dump(corpus, os.path.join(save_path, f'{corpus_name}.joblib'))


    def save_dictionaries(self, dictionary_name:str, save_path=CORPUS_PATH):
        """
        Dump the three dictionaries to '<save_path>/<dictionary_name>.joblib'.

        The file holds a dict with the keys 'lemma', 'forms' and 'norms'.

        :param dictionary_name: file name without the '.joblib' extension.
        :param save_path: target directory; created if it does not exist.
        """
        dictionaries = {'lemma': self.LemmaDict,
                        'forms': self.FormsDict,
                        'norms': self.NormalisedDict}

        os.makedirs(save_path, exist_ok=True)
        joblib.dump(dictionaries, os.path.join(save_path, f'{dictionary_name}.joblib'))
        
        
def load_corpus(corpus_name:str, load_path=CORPUS_PATH):
    """
    Load a corpus file written by OraccCorpus.save_corpus.

    :param corpus_name: file name, with or without the '.joblib' extension.
    :param load_path: directory that contains the file.
    :return: the object stored in the file.
    """
    corpus_file = os.path.join(load_path, corpus_name)
    # save_corpus appends '.joblib' to the name it is given, so the same bare name is accepted here.
    if not os.path.exists(corpus_file) and not corpus_name.endswith('.joblib'):
        corpus_file += '.joblib'
    # NOTE: joblib.load unpickles the file; only load files from a trusted source.
    return joblib.load(corpus_file)


def load_dictionary(dictionary_name:str, load_path=CORPUS_PATH):
    """
    Load a dictionaries file written by OraccCorpus.save_dictionaries.

    :param dictionary_name: file name, with or without the '.joblib' extension.
    :param load_path: directory that contains the file.
    :return: the object stored in the file.
    """
    dictionary_file = os.path.join(load_path, dictionary_name)
    # save_dictionaries appends '.joblib' to the name it is given, so the same bare name is accepted here.
    if not os.path.exists(dictionary_file) and not dictionary_name.endswith('.joblib'):
        dictionary_file += '.joblib'
    # NOTE: joblib.load unpickles the file; only load files from a trusted source.
    return joblib.load(dictionary_file)


def load_OraccCorpus(OraccCorpus_name:str, load_path=CORPUS_PATH) -> OraccCorpus:
    """
    Load a complete OraccCorpus object written by OraccCorpus.save_full.

    :param OraccCorpus_name: file name, with or without the '.joblib' extension.
    :param load_path: directory that contains the file.
    :return: the object stored in the file.
    """
    corpus_file = os.path.join(load_path, OraccCorpus_name)
    # save_full appends '.joblib' to the name it is given, so the same bare name is accepted here.
    if not os.path.exists(corpus_file) and not OraccCorpus_name.endswith('.joblib'):
        corpus_file += '.joblib'
    # NOTE: joblib.load unpickles the file; only load files from a trusted source.
    return joblib.load(corpus_file)

In [113]:
""" Transforming query text for analysis. """

def change_unknowns(input_list:list):
    """
    Return a copy of ``input_list`` in which every 'UNKNOWN' or 'x' token is replaced by a
    query-only placeholder; all other tokens are kept unchanged and in the same order.
    """
    # NOTE(review): the placeholder is spelled 'UKNOWN_QUERY' (sic); it is kept exactly as it is.
    masked_tokens = []
    for token in input_list:
        if token == 'UNKNOWN' or token == 'x':
            masked_tokens.append('UKNOWN_QUERY')
        else:
            masked_tokens.append(token)
    return masked_tokens


class ORACCQueryDocument():
    """
    A single text prepared as a query against an OraccCorpus.

    Attributes set by the constructor:
        dict, textID, JSON - the constructor arguments
        Lemma, Forms, Normalised - token lists of the text after change_unknowns
        boLemma, boForms, boNorms - bag-of-words vectors (doc2bow)
        LemmaStream, FormsStream, NormsStream - token ids in text order (doc2idx)
    """
    def __init__(self, input_text_json:dict, text_id:str, dictionary:dict) -> None:
        """
        :param input_text_json: parsed JSON of the query text.
        :param text_id: id of the query text.
        :param dictionary: dict with the keys 'lemma', 'forms' and 'norms', each holding a
            dictionary object that provides doc2bow and doc2idx (the structure written by
            OraccCorpus.save_dictionaries).
        """
        self.dict = dictionary
        self.textID = text_id
        self.JSON = input_text_json

        analysed_corpus = self.AnalyseText()

        self.Lemma = analysed_corpus['lemma']
        self.Forms = analysed_corpus['forms']
        self.Normalised = analysed_corpus['normalised']

        vectors = self.VectorizeQuery()

        self.boLemma = vectors['QboLemma']
        self.boForms = vectors['QboForms']
        self.boNorms = vectors['QboNorms']

        self.LemmaStream = vectors['QLemmaStream']
        self.FormsStream = vectors['QFormsStream']
        self.NormsStream = vectors['QNormsStream']

    def AnalyseText(self) -> dict:
        """
        Extract the token lists of the query text and replace its unknown tokens.

        :return: dict with the keys 'forms', 'lemma' and 'normalised'.
        :raises Exception: whatever parsejson raises; a query document cannot be built
            without its tokens, so the error is reported and passed on.
        """
        parameters = {'label': None, 'id_text': self.textID, 'dollar_keys': ['extent', 'scope', 'state']}

        try:
            text_analysed = parsejson(self.JSON, parameters=parameters)
        except Exception:
            # TODO: find out the problems with these texts!
            print('ERROR with a text:', self.textID)
            raise

        text_forms = change_unknowns(text_analysed['text_forms'])
        text_lemma = change_unknowns(text_analysed['text_lemma'])
        text_normalised = change_unknowns(text_analysed['text_normalised'])

        return {'forms': text_forms, 'lemma': text_lemma, 'normalised': text_normalised}

    def VectorizeQuery(self) -> dict:
        """
        Vectorise the three token lists with the matching dictionaries.

        :return: dict with 'QboLemma', 'QboForms', 'QboNorms' (doc2bow results) and
            'QLemmaStream', 'QFormsStream', 'QNormsStream' (doc2idx results).
        """
        lemma_dictionary = self.dict['lemma']
        forms_dictionary = self.dict['forms']
        norms_dictionary = self.dict['norms']

        return {'QboLemma': lemma_dictionary.doc2bow(self.Lemma),
                'QboForms': forms_dictionary.doc2bow(self.Forms),
                'QboNorms': norms_dictionary.doc2bow(self.Normalised),
                'QLemmaStream': lemma_dictionary.doc2idx(self.Lemma),
                'QFormsStream': forms_dictionary.doc2idx(self.Forms),
                'QNormsStream': norms_dictionary.doc2idx(self.Normalised)}
            
            
class TEXTQueryDocument:
    """Placeholder for a query document built from plain text input; not implemented yet."""
    # TODO: prepare this for the processed data as produced by the "Cuneiform Text Analyser" and add references.
    def __init__(self, input_text:list, mode='forms') -> None:
        """
        Accept the arguments without storing or processing them.

        :param input_text: list with the query text.
        :param mode: select from 'lemma', 'forms', or 'normalised'. By default, this is set to 'forms' as this is probably the most common input to receive.
        """
        return None


def select_texts_for_intertextualities(tokens_in_query, tokens_in_query_corpus, score_threshold=5):
    """
    Select the texts of the corpus that contain enough of the tokens of the query.

    :param tokens_in_query: iterable of token ids; each entry adds one point to every text listed for it.
    :param tokens_in_query_corpus: mapping {token_id: [ids of the texts that contain the token]}.
    :param score_threshold: minimum number of points a text needs to be selected.
    :return: list of text ids with a score of at least ``score_threshold``, in the order in which they were first met.
    """
    texts_score = defaultdict(int)

    for token_id in tokens_in_query:
        # .get() instead of [...]: a token that is absent from the index (e.g. the id -1 of an
        # unknown token) neither raises KeyError on a plain dict nor inserts an empty entry
        # into a defaultdict index.
        for text_id in tokens_in_query_corpus.get(token_id, ()):
            texts_score[text_id] += 1

    return [text_id for text_id, score in texts_score.items() if score >= score_threshold]


def parse_query_text(full_query: list, window_token_len=5):
    """
    Cut the query into all consecutive windows of ``window_token_len`` tokens.

    A query shorter than the window is returned as one single window.
    """
    total_tokens = len(full_query)
    if total_tokens < window_token_len:
        return [full_query]

    windows = []
    last_start = total_tokens - window_token_len
    start = 0
    while start <= last_start:
        windows.append(full_query[start:start + window_token_len])
        start += 1
    return windows


def transform_vector_to_text(vector, dictionary: corpora.Dictionary):
    """
    Translate a sequence of token ids back into a space-separated string.

    Ids that are not contained in ``dictionary`` are rendered as 'UNKNOWN'.
    """
    words = []
    for token_id in vector:
        if token_id in dictionary:
            words.append(dictionary[token_id])
        else:
            words.append('UNKNOWN')

    return ' '.join(words).strip()


def process_query_text_ORACC(query_document:ORACCQueryDocument, query_corpus:OraccCorpus, mode='lemma', window_token_len=5, tolerance=0):
    """
    Search the corpus for passages that match the windows of the query text and print every match.

    The query is cut into windows of ``window_token_len`` tokens. For every window the candidate
    texts are preselected through the token-to-texts index of the corpus and then compared
    window by window using the edit distance between the token id sequences.

    :param query_document: the query text.
    :param query_corpus: the corpus to search in.
    :param mode: select from 'lemma', 'forms', 'normalised'
    :param window_token_len: number of tokens to seek for intertextualities; if 0, then the general proximity of full query text is considered (comparison of tokens throughout corpus and query text, not considering token order) - not implemented yet, None is returned.
    :param tolerance: number of tokens that may differ in the given window_token_len, but still be considered a match.
    :return: None; the results are printed.
    :raises ValueError: if ``mode`` is not one of the supported values.
    """

    if window_token_len == 0:
        # TODO: create function that provides complete general comparison of the query text with the corpus, according to the mode.
        return None

    if mode == 'lemma':
        vectorised_query = query_document.LemmaStream
        vectorised_query_corpus = query_corpus.VectLemmaStream
        tokens_in_query_corpus = query_corpus.TextsAssociatedToLemma
        dictionary = query_corpus.LemmaDict
    elif mode == 'forms':
        vectorised_query = query_document.FormsStream
        vectorised_query_corpus = query_corpus.VectFormsStream
        tokens_in_query_corpus = query_corpus.TextsAssociatedToForms
        dictionary = query_corpus.FormsDict
    elif mode == 'normalised':
        vectorised_query = query_document.NormsStream
        vectorised_query_corpus = query_corpus.VectNormalisedStream
        tokens_in_query_corpus = query_corpus.TextsAssociatedToNormalised
        dictionary = query_corpus.NormalisedDict
    else:
        raise ValueError(f"Unknown mode {mode!r}; select from 'lemma', 'forms', 'normalised'.")

    # The windows of a corpus text do not depend on the query window, so they are computed once per text.
    corpus_windows = {}

    parsed_queries = parse_query_text(vectorised_query, window_token_len=window_token_len)
    for query in parsed_queries:
        query_transformed = transform_vector_to_text(query, dictionary)
        tokens_in_query = list(set(query))

        # A text scores one point per DISTINCT query token it contains, so the threshold has to
        # be based on the number of distinct tokens: with the window length as the base, a
        # window with a repeated token could never reach the threshold, not even in its own text.
        score_threshold = len(tokens_in_query) - tolerance
        texts_for_intertextualities = select_texts_for_intertextualities(tokens_in_query, tokens_in_query_corpus, score_threshold=score_threshold)
        print(query, texts_for_intertextualities)

        for text_id in texts_for_intertextualities:
            if text_id not in corpus_windows:
                corpus_windows[text_id] = parse_query_text(vectorised_query_corpus[text_id], window_token_len=window_token_len)

            for text_to_query_in in corpus_windows[text_id]:
                if editdistance.eval(query, text_to_query_in) <= tolerance:
                    intertext_found = transform_vector_to_text(text_to_query_in, dictionary)
                    print('Intertextuality found in text:', text_id, 'QUERY:', query_transformed, 'DISCOVERED PART:', intertext_found)

    return

def process_query_text_TEXT(query_document:TEXTQueryDocument, query_corpus:OraccCorpus, mode='lemma', window_token_len=5, tolerance=0):
    """Placeholder for the search with a TEXTQueryDocument; not implemented yet, always returns None."""
    # TODO
    return None

In [114]:
# Load the JSON texts of the three projects that form the test corpus; each call returns
# the parsed texts keyed by text id, plus the ids of the files that could not be parsed.
nere_jsons, nere_texts_with_errors = extract_jsons_from_project('nere')
btto_jsons, btto_texts_with_errors = extract_jsons_from_project('btto')
dsst_jsons, dsst_texts_with_errors = extract_jsons_from_project('dsst')

There are 1 texts in nere project
There are 132 texts in btto project
There are 535 texts in dsst project


In [115]:
# Build the dictionaries and the vectorised texts for all three projects (the keys are the project names).
CorpusForQueries = OraccCorpus({'nere': nere_jsons, 'btto': btto_jsons, 'dsst': dsst_jsons})

nere is being processed for dictionary.
btto is being processed for dictionary.
dsst is being processed for dictionary.
nere is being vectorized.
btto is being vectorized.
dsst is being vectorized.


In [116]:
# Store the vectorised texts and the dictionaries as two separate joblib files (in CORPUS_PATH by default).
CorpusForQueries.save_corpus(corpus_name='nere_btto_dsst_corpus')
CorpusForQueries.save_dictionaries(dictionary_name='nere_btto_dsst_dict')

In [117]:
# Reload the dictionaries saved in the previous cell; the file name is passed with its '.joblib' extension.
# loaded_corpus = load_corpus('nere_btto_dsst_corpus_Lemma.joblib')
loaded_dict = load_dictionary('nere_btto_dsst_dict.joblib')

In [118]:
# Use one text of the 'nere' project as the query document, vectorised with the reloaded dictionaries.
test_query = ORACCQueryDocument(nere_jsons['nere/Q009326'], 'nere/Q009326', loaded_dict)

In [121]:
# Search the corpus for every 3-token window of the query in its normalised form, allowing no differing token.
# The query text is itself part of the corpus, so the printed matches include the query text itself.
process_query_text_ORACC(test_query, CorpusForQueries, mode='normalised', window_token_len=3, tolerance=0)

[20, 277, 294] ['nere/Q009326']
Intertextuality found in text: nere/Q009326 QUERY: Zimrī-Lîm rīm tuqumtim DISCOVERED PART: Zimrī-Lîm rīm tuqumtim
[277, 294, 187] ['nere/Q009326']
Intertextuality found in text: nere/Q009326 QUERY: rīm tuqumtim lunaʾʾid DISCOVERED PART: rīm tuqumtim lunaʾʾid
[294, 187, 266] ['nere/Q009326']
Intertextuality found in text: nere/Q009326 QUERY: tuqumtim lunaʾʾid qurādam DISCOVERED PART: tuqumtim lunaʾʾid qurādam
[187, 266, 27] ['nere/Q009326']
Intertextuality found in text: nere/Q009326 QUERY: lunaʾʾid qurādam ana DISCOVERED PART: lunaʾʾid qurādam ana
[266, 27, 67] ['nere/Q009326']
Intertextuality found in text: nere/Q009326 QUERY: qurādam ana dīri DISCOVERED PART: qurādam ana dīri
[27, 67, 366] ['nere/Q009326']
Intertextuality found in text: nere/Q009326 QUERY: ana dīri šumaššu DISCOVERED PART: ana dīri šumaššu
[67, 366, 189] ['nere/Q009326']
Intertextuality found in text: nere/Q009326 QUERY: dīri šumaššu luštašni DISCOVERED PART: dīri šumaššu luštašni
[366