# Analyse ORACC corpus

The functions in this notebook build a reference database from the ORACC corpus that can then be used to run intertextuality searches. The notebook partially reuses a script created by Niek Veldhuis.

In [1]:
import os
import pandas as pd
import json
from gensim import corpora
import joblib
from collections import defaultdict
import editdistance
# from Levenshtein import distance
# NOTE: editdistance is slightly faster than Levenshtein --> applied here

In [2]:
# Base folder for all data: the kernel's current working directory when this cell runs.
ROOT_PATH = os.getcwd()

# Folder whose entries are treated as project names by the extraction cell.
PROJECTS_DATA_PATH = os.path.join(ROOT_PATH, 'projectsdata')
# Default folder used by the joblib save/load helpers defined below.
CORPUS_PATH = os.path.join(ROOT_PATH, 'CORPUS')

## TODO list:

- create functions that extract data from JSON files
- create functions that provide data suitable for different types of intertextuality detection (all with a specified match length counted in "words"):
    - precise intertextuality in cuneiform
    - precise intertextuality in normalised form
    - intertextuality by lemma
    - intertextuality by lemma with Levenshtein distance
    - intertextuality by lemma used within a text

- A vectorized corpus needs to be created

In [51]:
def parsejson(text, parameters, text_id=None):
    """Extract the word-level data of a single ORACC text JSON.

    The function first descends through the nested 'cdl' entries of the
    document (at most 11 levels; inside a list the last child that carries a
    'cdl' key is followed) and then reads every node of the reached level
    that has an 'f' entry.

    :param text: parsed JSON of one text; expected to hold a top-level 'cdl' list.
    :param parameters: state dict. 'id_text' is read; 'label' is read and is
        overwritten whenever a node with a 'label' key is met, so the value
        persists in the dict after the call.
    :param text_id: identifier used only in the diagnostic messages.
    :return: dict with the keys 'text_forms', 'text_lemma', 'text_normalised',
        'text_signs' (flat lists of strings, 'UNKNOWN' where a value is
        missing) and 'lemm_l' (list of the per-word 'f' dicts).
    """
    def empty_result():
        return {'text_forms': [], 'text_lemma': [], 'text_normalised': [], 'text_signs': [], 'lemm_l': []}

    try:
        JSONobject = text['cdl'][0]
    except (KeyError, IndexError):
        # Missing 'cdl' key or an empty 'cdl' list: there is nothing to parse.
        print(f'\t\t{text_id} >>> no text data found')
        return empty_result()

    try:
        if JSONobject['node'] == 'd':
            print(f'\t\t{text_id} >>> no text data found')
            return empty_result()
    except KeyError:
        print(JSONobject, 'missing node')

    # Bounded descent. The previous `while` loop used `continue` for a dict
    # without 'cdl', which skipped the counter increment and never terminated.
    for _ in range(11):
        if isinstance(JSONobject, list):
            # The iterator stays bound to the current list, so the LAST child
            # with a 'cdl' key determines the next level.
            for json_dict in JSONobject:
                if 'cdl' in json_dict:
                    JSONobject = json_dict['cdl']
        elif isinstance(JSONobject, dict):
            if 'cdl' not in JSONobject:
                break  # leaf dict: nothing deeper to descend into
            JSONobject = JSONobject['cdl']
        else:
            break

    # A bare dict is handled as a one-node level; iterating it directly would
    # walk over its keys instead of over nodes.
    nodes = [JSONobject] if isinstance(JSONobject, dict) else JSONobject

    text_forms = []
    text_lemma = []
    text_normalised = []
    text_signs = []
    lemm_l = []

    for inner_json in nodes:
        if 'label' in inner_json:
            parameters['label'] = inner_json['label']

        if 'f' in inner_json:
            # NOTE: `lemma` is the dict stored inside the input JSON, so the
            # keys written below are added to `text` as well.
            lemma = inner_json['f']
            lemma['id_word'] = inner_json['ref']
            lemma['label'] = parameters['label']
            lemma['id_text'] = parameters['id_text']

            # Missing values are replaced by 'UNKNOWN' both in the output
            # lists and in the lemma dict itself.
            for key, target in (('form', text_forms), ('cf', text_lemma), ('norm', text_normalised)):
                if key not in lemma:
                    lemma[key] = 'UNKNOWN'
                target.append(lemma[key])

            try:
                for sign in lemma['gdl']:
                    try:
                        text_signs.append(sign['v'])
                    except KeyError:
                        text_signs.append('UNKNOWN')
            except KeyError:
                # No sign-level data for this word at all.
                text_signs.append('UNKNOWN')

            lemm_l.append(lemma)

        # Nodes flagged with 'strict' == '1' are intentionally not emitted.
        # The former code built a record for them from
        # parameters['dollar_keys'] and then discarded it; that dead code
        # could only raise KeyError, so it has been dropped.

    return {'text_forms': text_forms, 'text_lemma': text_lemma, 'text_normalised': text_normalised, 'text_signs': text_signs, 'lemm_l': lemm_l}

In [52]:
def find_corpusjson_folders(start_path):
    """Return the paths of all 'corpusjson' folders found below `start_path`.

    Paths are relative to `start_path` and always use forward slashes.
    The walk does not descend into a 'corpusjson' folder once it is found.
    """
    target = "corpusjson"
    found = []

    for current_dir, subdirs, _files in os.walk(start_path):
        if target not in subdirs:
            continue
        hit = os.path.relpath(os.path.join(current_dir, target), start_path)
        found.append("/".join(hit.split("\\")))
        # Pruning the in-place list stops os.walk from entering the folder.
        subdirs.remove(target)

    return found


def extract_jsons_from_project(project_name:str):
    """Load every text JSON of one project from PROJECTS_DATA_PATH.

    :param project_name: name of the project folder inside PROJECTS_DATA_PATH.
    :return: tuple ``(project_jsons, texts_with_errors)`` where
        ``project_jsons`` maps text ids to the parsed JSON and
        ``texts_with_errors`` lists the ids of files that could not be decoded.
    """
    texts_with_errors = []
    project_jsons = {}

    project_root = os.path.join(PROJECTS_DATA_PATH, project_name)

    for corpusjson_folder in find_corpusjson_folders(project_root):
        full_path = os.path.join(project_root, corpusjson_folder)
        # Drop the trailing '/corpusjson' (11 characters); for a top-level
        # 'corpusjson' folder (10 characters) the slice yields ''.
        subproject = corpusjson_folder[:-11]
        text_id_prefix = f'{project_name}/{subproject}'

        files_in_folder = os.listdir(full_path)
        if len(files_in_folder) > 0:
            print(f'Found {len(files_in_folder)} files in {project_name}/{subproject} project')

        for json_file_name in files_in_folder:
            # [:-5] strips the '.json' extension. Without a subproject the
            # prefix ends with '/', which produces '//' -> collapse it.
            text_id = f'{text_id_prefix}/{json_file_name[:-5]}'.replace('//', '/')

            with open(os.path.join(full_path, json_file_name), 'r', encoding='utf-8') as json_file:
                try:
                    project_jsons[text_id] = json.load(json_file)
                except (json.JSONDecodeError, UnicodeDecodeError, RecursionError):
                    # Best effort: remember the unreadable file and go on.
                    # A bare `except:` would also swallow KeyboardInterrupt.
                    texts_with_errors.append(text_id)

    return project_jsons, texts_with_errors

In [61]:
' Transforming corpus for intertextuality queries. '

class OraccProjectCorpus:
    """Token lists of all texts of one project.

    Attributes set by the constructor:
      corpus      -- the input mapping text id -> text JSON
      texts       -- list of text ids (in the order of the input mapping)
      texts_data  -- list of text JSONs (same order)
      size        -- number of texts
      Lemma, Forms, Normalised, Signs -- one token list per text (same order)
      FullCorpus  -- mapping text id -> result dict of `parsejson`
    """

    def __init__(self, json_corpus):
        self.corpus = json_corpus
        self.texts =  [text_id for text_id in json_corpus]
        self.texts_data = [json_corpus[text_id] for text_id in json_corpus]
        self.size = len(json_corpus)
        
        analysed_corpus = self.AnalyseCorpus()
        
        self.Lemma = analysed_corpus['lemma']
        self.Forms = analysed_corpus['forms']
        self.Normalised = analysed_corpus['normalised']
        self.Signs = analysed_corpus['signs']
        
        self.FullCorpus = analysed_corpus['corpus_data']
        
    def AnalyseCorpus(self) -> dict: 
        """Run `parsejson` on every text and collect the results.

        A text that cannot be parsed is reported and stored with empty token
        lists, so every text id has an entry of its own.

        :return: dict with 'corpus_data' (text id -> parse result) and the
            per-text token lists under 'forms', 'lemma', 'normalised', 'signs'.
        """
        # The same dict is handed to every `parsejson` call; 'id_text' is
        # reset per text below.
        parameters = {'label': None, 'id_text': None, 'dollar_keys' : ['extent', 'scope', 'state']}
        
        corpus_data = {}
        
        full_corpus_forms = []
        full_corpus_lemma = []
        full_corpus_normalised = []
        full_corpus_signs = []
        
        print('\tAnalyzing texts in the corpus.', self.size, 'texts to be processed.')
        
        for text_id in self.texts:
            parameters['id_text'] = text_id
            text_analysed = None
            
            if text_id == 'cams/gkab/P338333':
                # Known problematic text, deliberately skipped.
                print(text_id, 'has not been processed but it then got stuck!')
            else:
                try:
                    text_analysed = parsejson(self.corpus[text_id], parameters=parameters, text_id=text_id)
                except Exception:
                    # TODO: find out the problems with these texts!
                    print('ERROR with a text:', text_id)
            
            if text_analysed is None:
                # Previously a failed text either raised (first text) or
                # silently reused the result of the preceding text.
                text_analysed = {'text_forms': [], 'text_lemma': [], 'text_normalised': [], 'text_signs': [], 'lemm_l': []}
            
            corpus_data[text_id] = text_analysed
            
            full_corpus_forms.append(text_analysed['text_forms'])
            full_corpus_lemma.append(text_analysed['text_lemma'])
            full_corpus_normalised.append(text_analysed['text_normalised'])
            full_corpus_signs.append(text_analysed['text_signs'])
            
        return {'corpus_data': corpus_data, 'forms': full_corpus_forms, 'lemma': full_corpus_lemma, 'normalised': full_corpus_normalised, 'signs': full_corpus_signs}
    
        
class OraccCorpus():
    """Dictionaries, vectors and token-to-text indices for a set of projects.

    Attributes set by the constructor:
      projects -- the input mapping project name -> {text id -> text JSON}
      LemmaDict, FormsDict, NormalisedDict, SignsDict
          -- gensim dictionaries built over all texts of all projects
      VectLemma, VectForms, VectNormalised, VectSigns
          -- text id -> bag-of-words vector (``doc2bow``)
      VectLemmaStream, VectFormsStream, VectNormalisedStream, VectSignsStream
          -- text id -> token ids in text order (``doc2idx``)
      TextsAssociatedToLemma, TextsAssociatedToForms,
      TextsAssociatedToNormalised, TextsAssociatedToSigns
          -- token id -> list of text ids containing that token
    """

    def __init__(self, input_projects:dict) -> None:
        self.projects = input_projects
        
        lemma_corpus = []
        form_corpus = []
        normalised_corpus = []
        signs_corpus = []
        # Parse results are kept so that vectorization does not have to
        # parse every project a second time.
        analysed_projects = {}
    
        for project_name, project_data in input_projects.items():
            print(project_name, 'is being processed for dictionary.')
            OPC_project = OraccProjectCorpus(json_corpus=project_data)
            analysed_projects[project_name] = OPC_project.FullCorpus
            
            lemma_corpus.extend(OPC_project.Lemma)
            form_corpus.extend(OPC_project.Forms)
            normalised_corpus.extend(OPC_project.Normalised)
            signs_corpus.extend(OPC_project.Signs)

        self.LemmaDict = corpora.Dictionary(lemma_corpus)
        self.FormsDict = corpora.Dictionary(form_corpus)
        self.NormalisedDict = corpora.Dictionary(normalised_corpus)
        self.SignsDict = corpora.Dictionary(signs_corpus)
        
        vectors = self.VectorizeOracc(analysed_projects)
        
        self.VectLemma = vectors['vect_lemma']
        self.VectForms = vectors['vect_forms']
        self.VectNormalised = vectors['vect_norm']
        self.VectSigns = vectors['vect_signs']
        
        self.VectLemmaStream = vectors['vect_lemma_stream']
        self.VectFormsStream = vectors['vect_forms_stream']
        self.VectNormalisedStream = vectors['vect_norm_stream']
        self.VectSignsStream = vectors['vect_signs_stream']
        
        self.TextsAssociatedToLemma = vectors['lemma_to_texts']
        self.TextsAssociatedToForms = vectors['forms_to_texts']
        self.TextsAssociatedToNormalised = vectors['norms_to_texts']
        self.TextsAssociatedToSigns = vectors['signs_to_texts']
        
        
    def VectorizeOracc(self, analysed_projects=None) -> dict:
        """Vectorize every text with the four dictionaries of this corpus.

        :param analysed_projects: optional mapping project name ->
            ``OraccProjectCorpus.FullCorpus``. Projects missing from it (or all
            projects when it is None) are parsed here.
        :return: dict with the bag-of-words vectors ('vect_*'), the token id
            streams ('vect_*_stream') and the token id -> text ids indices
            ('*_to_texts').
        """
        # (key in the parse result, dictionary) for each mode, in the order
        # lemma, forms, normalised, signs.
        modes = [('text_lemma', self.LemmaDict),
                 ('text_forms', self.FormsDict),
                 ('text_normalised', self.NormalisedDict),
                 ('text_signs', self.SignsDict)]
        
        bow_vectors = [{} for _ in modes]
        stream_vectors = [{} for _ in modes]
        
        for project_name, project_data in self.projects.items():
            print(project_name, 'is being vectorized.')
            if analysed_projects is not None and project_name in analysed_projects:
                analysed_project = analysed_projects[project_name]
            else:
                analysed_project = OraccProjectCorpus(json_corpus=project_data).FullCorpus
            
            for text_id, text_data in analysed_project.items():
                # All mode dicts share the same keys, so one check suffices;
                # the first occurrence of a text id wins.
                if text_id in bow_vectors[0]:
                    print('ERROR', text_id, 'is duplicit')
                    continue
                
                for (data_key, dictionary), bows, streams in zip(modes, bow_vectors, stream_vectors):
                    bows[text_id] = dictionary.doc2bow(text_data[data_key])
                    streams[text_id] = dictionary.doc2idx(text_data[data_key])
                    
        # NOTE: we also make dictionary of tokens and list in which texts they appear. This helps to narrow down texts for possible intertextualities, and speeds up the process.
        token_indices = [defaultdict(list) for _ in modes]
        
        for bows, token_to_texts in zip(bow_vectors, token_indices):
            for text_id, bow in bows.items():
                # Each bag-of-words entry is (token id, count).
                for token in bow:
                    token_to_texts[token[0]].append(text_id)

        return {'vect_lemma': bow_vectors[0], 
                'vect_forms': bow_vectors[1], 
                'vect_norm': bow_vectors[2],
                'vect_signs': bow_vectors[3],
                'vect_lemma_stream': stream_vectors[0],
                'vect_forms_stream': stream_vectors[1],
                'vect_norm_stream': stream_vectors[2],
                'vect_signs_stream': stream_vectors[3],
                'lemma_to_texts': token_indices[0],
                'forms_to_texts': token_indices[1],
                'norms_to_texts': token_indices[2],
                'signs_to_texts': token_indices[3]}
        
    
    def save_full(self, save_name:str, save_path=CORPUS_PATH):
        """Dump the whole object to ``<save_path>/<save_name>.joblib``."""
        os.makedirs(save_path, exist_ok=True)  # the target folder may not exist yet
        joblib.dump(self, os.path.join(save_path, f'{save_name}.joblib'))
    
    
    def save_corpus(self, corpus_name:str, save_path=CORPUS_PATH):
        """Dump the vectors and token streams to ``<save_path>/<corpus_name>.joblib``."""
        corpus = {'lemma': self.VectLemma,
                  'forms': self.VectForms,
                  'norms': self.VectNormalised,
                  'signs': self.VectSigns,
                  'lemmaStream': self.VectLemmaStream,
                  'formStream': self.VectFormsStream,
                  'normsStream': self.VectNormalisedStream,
                  'signsStream': self.VectSignsStream,}
        
        os.makedirs(save_path, exist_ok=True)
        joblib.dump(corpus, os.path.join(save_path, f'{corpus_name}.joblib'))
        
        
    def save_dictionaries(self, dictionary_name:str, save_path=CORPUS_PATH):
        """Dump the four dictionaries to ``<save_path>/<dictionary_name>.joblib``."""
        dictionaries = {'lemma': self.LemmaDict,
                        'forms': self.FormsDict,
                        'norms': self.NormalisedDict,
                        'signs': self.SignsDict}
        
        os.makedirs(save_path, exist_ok=True)
        joblib.dump(dictionaries, os.path.join(save_path, f'{dictionary_name}.joblib'))
        
        
def load_corpus(corpus_name:str, load_path=CORPUS_PATH):
    """Load ``<load_path>/<corpus_name>.joblib`` and return its content.

    NOTE(review): joblib files are pickle-based; only load files you trust.
    """
    archive_path = os.path.join(load_path, f'{corpus_name}.joblib')
    return joblib.load(archive_path)


def load_dictionary(dictionary_name:str, load_path=CORPUS_PATH):
    """Load ``<load_path>/<dictionary_name>.joblib`` and return its content.

    NOTE(review): joblib files are pickle-based; only load files you trust.
    """
    archive_path = os.path.join(load_path, f'{dictionary_name}.joblib')
    return joblib.load(archive_path)


def load_OraccCorpus(OraccCorpus_name:str, load_path=CORPUS_PATH) -> OraccCorpus:
    """Load ``<load_path>/<OraccCorpus_name>.joblib`` and return its content.

    NOTE(review): joblib files are pickle-based; only load files you trust.
    """
    archive_path = os.path.join(load_path, f'{OraccCorpus_name}.joblib')
    return joblib.load(archive_path)

def save_json_corpus(json_corpus:dict, save_name:str, save_path=CORPUS_PATH):
    """Dump `json_corpus` to ``<save_path>/<save_name>.joblib``.

    The target folder is created when it does not exist yet.
    """
    os.makedirs(save_path, exist_ok=True)
    joblib.dump(json_corpus, os.path.join(save_path, f'{save_name}.joblib'))
    
def load_json_corpus(json_corpus_name:str, load_path=CORPUS_PATH) -> dict:
    """Load ``<load_path>/<json_corpus_name>.joblib`` and return its content.

    NOTE(review): joblib files are pickle-based; only load files you trust.
    """
    archive_path = os.path.join(load_path, f'{json_corpus_name}.joblib')
    return joblib.load(archive_path)

In [54]:
""" Transforming query text for analysis. """

def change_unknowns(input_list:list):
    """Return a copy of `input_list` with 'UNKNOWN' and 'x' replaced by 'UKNOWN_QUERY'."""
    replaced = []
    for token in input_list:
        if token == 'UNKNOWN' or token == 'x':
            replaced.append('UKNOWN_QUERY')
        else:
            replaced.append(token)
    return replaced


class ORACCQueryDocument():
    """A single ORACC text prepared as a query against the corpus.

    Attributes set by the constructor:
      dict    -- mapping with the keys 'lemma', 'forms', 'norms', 'signs',
                 each holding an object that offers ``doc2bow`` / ``doc2idx``
      textID  -- identifier of the query text
      JSON    -- the text JSON as passed in
      Lemma, Forms, Normalised, Signs          -- token lists of the text
      boLemma, boForms, boNorms, boSigns       -- ``doc2bow`` vectors
      LemmaStream, FormsStream, NormsStream, SignsStream -- ``doc2idx`` streams
    """

    def __init__(self, input_text_json:dict, text_id:str, dictionary:dict) -> None:
        self.dict = dictionary
        self.textID = text_id
        self.JSON = input_text_json
        
        analysed_corpus = self.AnalyseText()
        
        self.Lemma = analysed_corpus['lemma']
        self.Forms = analysed_corpus['forms']
        self.Normalised = analysed_corpus['normalised']
        self.Signs = analysed_corpus['signs']
        
        vectors = self.VectorizeQuery()
        
        self.boLemma = vectors['QboLemma']
        self.boForms = vectors['QboForms']
        self.boNorms = vectors['QboNorms']
        self.boSigns = vectors['QboSigns']
        
        self.LemmaStream = vectors['QLemmaStream']
        self.FormsStream = vectors['QFormsStream']
        self.NormsStream = vectors['QNormsStream']
        self.SignsStream = vectors['QSignsStream']
        
    def AnalyseText(self) -> dict:
        """Parse the query JSON and mark unknown tokens.

        If parsing fails, the error is reported and empty token lists are
        returned.

        :return: dict with the token lists 'forms', 'lemma', 'normalised', 'signs'.
        """
        parameters = {"label": None, "id_text": self.textID, "dollar_keys" : ["extent", "scope", "state"]}
        
        try:
            # text_id is forwarded so that diagnostics name the text.
            text_analysed = parsejson(self.JSON, parameters=parameters, text_id=self.textID) 
        except Exception:
            # TODO: find out the problems with these texts!
            print('ERROR with a text:', self.textID)
            # Without this fallback the lines below failed on an unbound name.
            text_analysed = {'text_forms': [], 'text_lemma': [], 'text_normalised': [], 'text_signs': [], 'lemm_l': []}
            
        text_forms = change_unknowns(text_analysed['text_forms'])
        text_lemma = change_unknowns(text_analysed['text_lemma'])
        text_normalised = change_unknowns(text_analysed['text_normalised'])
        text_signs = change_unknowns(text_analysed['text_signs'])
            
        return {'forms': text_forms, 'lemma': text_lemma, 'normalised': text_normalised, 'signs': text_signs}
    
    def VectorizeQuery(self) -> dict:
        """Vectorize the four token lists with the matching dictionaries.

        :return: dict with the ``doc2bow`` results ('Qbo*') and the
            ``doc2idx`` results ('Q*Stream').
        """
        lemma_dict = self.dict['lemma']
        forms_dict = self.dict['forms']
        norms_dict = self.dict['norms']
        signs_dict = self.dict['signs']
        
        return {'QboLemma': lemma_dict.doc2bow(self.Lemma),
                'QboForms': forms_dict.doc2bow(self.Forms),
                'QboNorms': norms_dict.doc2bow(self.Normalised),
                'QboSigns': signs_dict.doc2bow(self.Signs),
                'QLemmaStream': lemma_dict.doc2idx(self.Lemma),
                'QFormsStream': forms_dict.doc2idx(self.Forms),
                'QNormsStream': norms_dict.doc2idx(self.Normalised),
                'QSignsStream': signs_dict.doc2idx(self.Signs)}
            
            
class TEXTQueryDocument():
    """Placeholder for a query built from plain token lists; not implemented yet."""

    # TODO: prepare this for the processed data as produced by the "Cuneiform Text Analyser" and add references.
    def __init__(self, input_text:list, mode='forms') -> None:
        """Accept the arguments and store nothing (stub).

        :param mode: select from 'lemma', 'forms', 'normalised', or 'signs'. By default, this is set to 'forms' as this is probably the most common input to receive.
        """


def select_texts_for_intertextualities(tokens_in_query, tokens_in_query_corpus, score_threshold=5):
    """Select the texts that contain enough of the query tokens.

    :param tokens_in_query: iterable of token ids of the query.
    :param tokens_in_query_corpus: mapping token id -> list of text ids.
    :param score_threshold: minimal score (one point per listed occurrence of
        a query token) a text needs to be selected.
    :return: list of text ids with a score of at least `score_threshold`, in
        the order in which the texts were first met.
    """
    texts_score = defaultdict(int)
    
    for token_id in tokens_in_query:
        # .get() instead of indexing: a token that is absent from the index
        # (e.g. -1 for an unknown token) must neither raise on a plain dict
        # nor add an empty entry to a defaultdict index.
        for text_id in tokens_in_query_corpus.get(token_id, ()):
            texts_score[text_id] += 1
        
    return [text_id for text_id, score in texts_score.items() if score >= score_threshold]


def parse_query_text(full_query: list, window_token_len=5):
    """Cut `full_query` into all consecutive windows of `window_token_len` tokens.

    A query shorter than the window is returned as the single window.
    """
    total = len(full_query)
    if total < window_token_len:
        return [full_query]

    windows = []
    last_start = total - window_token_len
    for start in range(last_start + 1):
        windows.append(full_query[start:start + window_token_len])
    return windows


def transform_vector_to_text(vector, dictionary: corpora.Dictionary):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that are not in `dictionary` are rendered as 'UNKNOWN'.
    """
    pieces = []
    for token_id in vector:
        if token_id in dictionary:
            pieces.append(dictionary[token_id] + ' ')
        else:
            pieces.append('UNKNOWN ')

    return ''.join(pieces).strip()


def process_query_text_ORACC(query_document:ORACCQueryDocument, query_corpus:OraccCorpus, mode='lemma', window_token_len=5, tolerance=0, ignore_self=True):
    """
    Search the corpus for passages that match windows of the query text.

    The query token stream is cut into windows of `window_token_len` tokens.
    For every window, candidate texts are pre-selected through the
    token-to-texts index and each window of a candidate text is then compared
    with the query window by edit distance over the token ids.

    :param query_document: the query text.
    :param query_corpus: the corpus to search in.
    :param mode: select from 'lemma', 'forms', 'normalised', 'signs'
    :param window_token_len: number of tokens to seek for intertextualities; if 0, then the general proximity of full query text is considered (comparison of tokens throughout corpus and query text, not considering token order) -- not implemented yet, None is returned.
    :param tolerance: number of tokens that may differ in the given window_token_len, but still be considered a match.
    :param ignore_self: if True, the corpus text with the id of the query document is skipped.
    :return: tuple ``(text_ids, details)``: the list of distinct text ids with
        at least one match, and a mapping text id -> list of dicts with the
        keys 'query', 'detected_intertextuality' and 'position' (index of the
        matching window in the corpus text). None if `window_token_len` is 0.
    :raises ValueError: if `mode` is not one of the four supported values.
    """
    
    texts_with_intertextualities = []
    detailed_intertextualities = defaultdict(list)
    
    if window_token_len == 0:
        # TODO: create function that provides complete general comparison of the query text with the corpus, according to the mode.
        return None
    
    if mode == 'lemma':
        vectorised_query = query_document.LemmaStream
        vectorised_query_corpus = query_corpus.VectLemmaStream
        tokens_in_query_corpus = query_corpus.TextsAssociatedToLemma
        dictionary = query_corpus.LemmaDict
    elif mode == 'forms':
        vectorised_query = query_document.FormsStream
        vectorised_query_corpus = query_corpus.VectFormsStream
        tokens_in_query_corpus = query_corpus.TextsAssociatedToForms
        dictionary = query_corpus.FormsDict
    elif mode == 'normalised':
        vectorised_query = query_document.NormsStream
        vectorised_query_corpus = query_corpus.VectNormalisedStream
        tokens_in_query_corpus = query_corpus.TextsAssociatedToNormalised
        dictionary = query_corpus.NormalisedDict
    elif mode == 'signs':
        vectorised_query = query_document.SignsStream
        vectorised_query_corpus = query_corpus.VectSignsStream
        tokens_in_query_corpus = query_corpus.TextsAssociatedToSigns
        dictionary = query_corpus.SignsDict
    else:
        # Formerly an unknown mode surfaced later as an UnboundLocalError.
        raise ValueError(f"Unknown mode {mode!r}; select from 'lemma', 'forms', 'normalised', 'signs'.")
        
    parsed_queries = parse_query_text(vectorised_query, window_token_len=window_token_len)    
    for query in parsed_queries:
        query_transformed = transform_vector_to_text(query, dictionary)
        tokens_in_query = list(set(query))
        # The pre-filter scores DISTINCT query tokens found in a text, so the
        # threshold has to be based on the number of distinct tokens as well.
        # With `window_token_len - tolerance`, a window containing a repeated
        # token (or a query shorter than the window) could never reach the
        # threshold and true matches were dropped. Each edit operation can
        # remove at most one distinct query token, hence `- tolerance`.
        score_threshold = len(tokens_in_query) - tolerance
        texts_for_intertextualities = select_texts_for_intertextualities(tokens_in_query, tokens_in_query_corpus, score_threshold=score_threshold)
        
        for text_id in texts_for_intertextualities:
            if ignore_self and text_id == query_document.textID:
                continue
            
            text_vector = vectorised_query_corpus[text_id]
            parsed_text = parse_query_text(text_vector, window_token_len=window_token_len)
            for parse_position_start, text_to_query_in in enumerate(parsed_text):
                if editdistance.eval(query, text_to_query_in) <= tolerance:
                    intertext_found = transform_vector_to_text(text_to_query_in, dictionary)
                    texts_with_intertextualities.append(text_id)
                    detailed_intertextualities[text_id].append({'query': query_transformed, 'detected_intertextuality': intertext_found, 'position': parse_position_start})
                    
                    print('Intertextuality found in text:', text_id, 'QUERY:', query_transformed, 'DISCOVERED PART:', intertext_found)
                        
    return list(set(texts_with_intertextualities)), detailed_intertextualities

def process_query_text_TEXT(query_document:TEXTQueryDocument, query_corpus:OraccCorpus, mode='lemma', window_token_len=5, tolerance=0):
    """Placeholder for the plain-text query search; always returns None."""
    # TODO
    return None

In [14]:
# Raw text JSONs per project: project name -> {text id -> parsed JSON}.
all_project_jsons = {}
# Per project: ids of the files that could not be loaded.
projects_texts_with_errors = {}

# Every entry of PROJECTS_DATA_PATH is treated as a project name.
for project_name in os.listdir(PROJECTS_DATA_PATH):
    project_jsons, texts_with_errors = extract_jsons_from_project(project_name)
    all_project_jsons[project_name] = project_jsons
    projects_texts_with_errors[project_name] = texts_with_errors
    
# Persist both results so that later cells can reload them instead of re-reading all files.
save_json_corpus(all_project_jsons, 'all_project_jsons')
save_json_corpus(projects_texts_with_errors, 'all_projects_texts_with_errors')

Found 89 files in adsd/adart1 project
Found 162 files in adsd/adart2 project
Found 156 files in adsd/adart3 project
Found 105 files in adsd/adart5 project
Found 180 files in adsd/adart6 project
Found 1 files in aemw/alalakh/idrimi project
Found 305 files in aemw/amarna project
Found 32 files in akklove/ project
Found 175 files in ario/ project
Found 160 files in asbp/ninmed project
Found 44 files in atae/burmarina project
Found 408 files in atae/durkatlimmu project
Found 212 files in atae/guzana project
Found 19 files in atae/huzirina project
Found 30 files in atae/imgurenlil project
Found 74 files in atae/mallanate project
Found 46 files in atae/marqasu project
Found 2 files in atae/samal project
Found 22 files in atae/szibaniba project
Found 22 files in atae/tilbarsip project
Found 33 files in atae/tuszhan project
Found 225 files in babcity/ project
Found 229 files in blms/ project
Found 224 files in borsippa/ project
Found 132 files in btto/ project
Found 3 files in cams/barutu proj

In [55]:
# Reload the raw JSON corpus saved under the name 'all_project_jsons' by the extraction cell.
all_project_jsons = load_json_corpus('all_project_jsons')

In [62]:
# Build the dictionaries, vectors and token-to-text indices for all projects (see OraccCorpus.__init__).
CorpusForQueries = OraccCorpus(all_project_jsons)

adsd is being processed for dictionary.
	Analyzing texts in the corpus. 692 texts to be processed.
aemw is being processed for dictionary.
	Analyzing texts in the corpus. 306 texts to be processed.
akklove is being processed for dictionary.
	Analyzing texts in the corpus. 30 texts to be processed.
amgg is being processed for dictionary.
	Analyzing texts in the corpus. 0 texts to be processed.
ario is being processed for dictionary.
	Analyzing texts in the corpus. 173 texts to be processed.
armep is being processed for dictionary.
	Analyzing texts in the corpus. 0 texts to be processed.
arrim is being processed for dictionary.
	Analyzing texts in the corpus. 0 texts to be processed.
asbp is being processed for dictionary.
	Analyzing texts in the corpus. 159 texts to be processed.
atae is being processed for dictionary.
	Analyzing texts in the corpus. 912 texts to be processed.
babcity is being processed for dictionary.
	Analyzing texts in the corpus. 224 texts to be processed.
blms is b

In [63]:
# Persist the vectors/streams and the dictionaries as two separate joblib files in CORPUS_PATH.
CorpusForQueries.save_corpus(corpus_name='full_ORACC_corpus')
CorpusForQueries.save_dictionaries(dictionary_name='full_ORACC_dict')

In [64]:
# loaded_corpus = load_corpus('nere_btto_dsst_corpus_Lemma')
# Reload the dictionaries written by save_dictionaries above: a dict with the keys 'lemma', 'forms', 'norms', 'signs'.
loaded_dict = load_dictionary('full_ORACC_dict')

## Testing intertextuality detection

TODO:
- signs without numbers (en == én == en3)
- signs with MZL values (DINGIR == an)
- "normalised signs" - i.e., šim-ma --> šimma; si-im --> sim (keep double consonants, ignore double vowels)
- "normalised stream" - do not compare the stream of tokens; instead, join the tokens into a single string and compute the distance over the whole string.


In [65]:
# Load the texts of the 'nere' project; one of them serves as the test query below.
nere_jsons, nere_errors = extract_jsons_from_project('nere')

Found 1 files in nere/ project


In [66]:
# Build the query document from one text, vectorized with the reloaded dictionaries.
test_query = ORACCQueryDocument(nere_jsons['nere/Q009326'], 'nere/Q009326', loaded_dict)

In [69]:
# Search on the sign level: windows of 10 tokens, edit distance of at most 1 between query and corpus window.
texts_with_intertextualities, detailed_intertextualities = process_query_text_ORACC(test_query, CorpusForQueries, mode='signs', window_token_len=10 , tolerance=1)

In [70]:
# Show the ids of the texts in which at least one match was found.
print(texts_with_intertextualities)

[]
