# Analyse ORACC corpus

The functions in this notebook build a reference database that can then be used for intertextual search. They partially reuse a script created by Niek Veldhuis.

In [14]:
import os
import pandas as pd
import json

In [15]:
# Working directory of the kernel; the data folder below is resolved against it.
ROOT_PATH = os.getcwd()

# Folder with one sub-directory per project (read by extract_jsons_from_project).
PROJECTS_DATA_PATH = os.path.join(ROOT_PATH, 'projectsdata')

## TODO list:

- create functions that extract data from JSON files
- create functions that provide data suitable for different types of intertextuality detection (all with a specified match length, counted in "words"):
    - precise intertextuality in cuneiform
    - precise intertextuality in normalised form
    - intertextuality by lemma
    - intertextuality by lemma with Levenshtein distance
    - intertextuality by lemma used within a text

- A vectorized corpus will need to be created

In [80]:
def parse_dict(input_dict:dict):
    """Recursively walk nested ``'cdl'`` dictionaries and print every leaf value.

    Args:
        input_dict: A dict whose ``'cdl'`` entry is itself a dict. Each value of
            that inner dict is either another dict with a dict under ``'cdl'``
            (descended into) or a leaf (printed as ``finally <value>``).

    Raises:
        KeyError: If ``input_dict`` has no ``'cdl'`` key.
    """
    for JSONobject in input_dict['cdl'].values():
        # Only descend into real containers. A plain ``'cdl' in JSONobject``
        # would do a substring test on string leaves and fail on ints.
        if isinstance(JSONobject, dict) and isinstance(JSONobject.get('cdl'), dict):
            parse_dict(JSONobject)
        else:
            print('finally', JSONobject)

In [81]:
# Minimal nested example: three 'cdl' levels wrapping a dict of two leaf values.
test_dict = {'cdl': {'cdl': {'cdl': {'f': 'lal', 'b': '3'}}}}
# Prints one 'finally <value>' line per leaf value.
parse_dict(test_dict)

finally lal
finally 3


In [116]:
def parsejson(text, parameters):
    """Walk the nested ``"cdl"`` lists of a text and collect its word data.

    Every node in ``text["cdl"]`` is inspected, and nodes that carry their own
    ``"cdl"`` list are descended into recursively. Results are not returned;
    they are appended to the module-level lists ``text_words``, ``text_lemma``,
    ``text_normalised`` and ``lemm_l``, which must exist before the call.

    Args:
        text: A dict with a ``"cdl"`` key holding a list of node dicts.
        parameters: Mutable state shared across the recursion. Reads
            ``"id_text"`` and ``"dollar_keys"``; reads and updates ``"label"``
            (the most recently seen node label, including the label of a
            container node once its children have been processed).

    Returns:
        None.

    Raises:
        KeyError: If a node with ``"f"`` (or with ``"strict" == "1"``) lacks
            ``"ref"``, or a strict node lacks one of the ``"dollar_keys"``.

    Note:
        The ``"f"`` dict of each word node is extended in place (``id_word``,
        ``label``, ``id_text`` and defaults for missing ``form``/``cf``/``norm``),
        so the input JSON structure is modified.
    """
    for JSONobject in text["cdl"]:
        # Containers are descended into at every level; leaf nodes on any
        # level are inspected below instead of being assumed to have "cdl".
        if "cdl" in JSONobject:
            parsejson(JSONobject, parameters)
        if "label" in JSONobject:
            parameters["label"] = JSONobject['label']
        if "f" in JSONobject:
            lemma = JSONobject["f"]
            lemma["id_word"] = JSONobject["ref"]
            lemma['label'] = parameters["label"]
            lemma["id_text"] = parameters["id_text"]

            # Missing fields get one consistent placeholder, both in the
            # per-text lists and in the lemma dict itself.
            text_words.append(lemma.setdefault('form', 'UNKNOWN'))
            text_lemma.append(lemma.setdefault('cf', 'UNKNOWN'))
            text_normalised.append(lemma.setdefault('norm', 'UNKNOWN'))

            lemm_l.append(lemma)

        if "strict" in JSONobject and JSONobject["strict"] == "1":
            # The record is built but deliberately not stored for now
            # (the append is disabled).
            lemma = {key: JSONobject[key] for key in parameters["dollar_keys"]}
            lemma["id_word"] = JSONobject["ref"] + ".0"
            lemma["id_text"] = parameters["id_text"]
            #lemm_l.append(lemma)
    return

In [134]:
def _load_corpusjson_files(corpusjson_path, json_file_names, id_prefix, project_jsons, texts_with_errors):
    """Parse the named files of one ``corpusjson`` folder into ``project_jsons``; record failing text IDs."""
    for json_file_name in json_file_names:
        # The text ID is the file name without its 5-character '.json' suffix.
        text_id = f'{id_prefix}/{json_file_name[:-5]}'
        with open(os.path.join(corpusjson_path, json_file_name), 'r', encoding='utf-8') as json_file:
            try:
                json_data = json.load(json_file)
            except (ValueError, RecursionError):
                # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
                # Skip the file so no stale data from a previous file is stored
                # under this text ID.
                texts_with_errors.append(text_id)
                continue
        project_jsons[text_id] = json_data


def extract_jsons_from_project(project_name:str):
    """Load all JSON texts of a project and of its direct subprojects.

    The project folder is ``PROJECTS_DATA_PATH/<project_name>``. Texts are read
    from its ``corpusjson`` folder (if present and non-empty) and from the
    ``corpusjson`` folder of every direct sub-directory that has one.
    Directory listings are sorted so the result order is reproducible.

    Args:
        project_name: Name of the project folder inside ``PROJECTS_DATA_PATH``.

    Returns:
        A tuple ``(project_jsons, texts_with_errors)``:
        ``project_jsons`` maps text IDs (``project/file`` or
        ``project/subproject/file``, without the ``.json`` suffix) to the parsed
        JSON data; ``texts_with_errors`` lists the IDs of files that could not
        be parsed (these IDs are absent from ``project_jsons``).

    Raises:
        OSError: If the project folder or a listed file cannot be read.
    """
    texts_with_errors = []
    project_jsons = {}
    project_path = os.path.join(PROJECTS_DATA_PATH, project_name)

    # Check if the project has its own texts and/or subprojects:
    subprojects = []
    analyse_project_corpusjson = False
    for file_ in sorted(os.listdir(project_path)):
        entry_path = os.path.join(project_path, file_)
        if file_ == 'corpusjson':
            # Check if the project itself has some json files in it.
            if os.path.isdir(entry_path) and len(os.listdir(entry_path)) > 0:
                analyse_project_corpusjson = True
        elif os.path.isdir(os.path.join(entry_path, 'corpusjson')):
            # Only directories that actually contain a corpusjson folder count
            # as subprojects; other directories are ignored instead of crashing.
            subprojects.append(file_)

    # Extract project data
    if analyse_project_corpusjson:
        PROJECT_JSONS_PATH = os.path.join(project_path, 'corpusjson')
        jsons_in_sub = sorted(os.listdir(PROJECT_JSONS_PATH))
        print(f'There are {len(jsons_in_sub)} texts in {project_name} project')
        _load_corpusjson_files(PROJECT_JSONS_PATH, jsons_in_sub, project_name,
                               project_jsons, texts_with_errors)

    # Extract subprojects data
    for subproject in subprojects:
        SUBPROJECT_JSONS_PATH = os.path.join(project_path, subproject, 'corpusjson')
        jsons_in_sub = sorted(os.listdir(SUBPROJECT_JSONS_PATH))
        print(f'There are {len(jsons_in_sub)} texts in {project_name}-{subproject} subproject')
        _load_corpusjson_files(SUBPROJECT_JSONS_PATH, jsons_in_sub, f'{project_name}/{subproject}',
                               project_jsons, texts_with_errors)

    # TODO: solve problem with Idrimi and similar: subprojects of subprojects
    return project_jsons, texts_with_errors

In [135]:
# Load every text of the 'ckst' project; IDs of files that failed to parse come back separately.
# NOTE(review): the result is stored as nere_jsons although the project loaded is 'ckst' —
# presumably a leftover name; the parsing cell below reads it under this name.
nere_jsons, texts_with_errors = extract_jsons_from_project('ckst')

# Show which texts could not be loaded.
print(texts_with_errors)

There are 92 texts in ckst project
['ckst/P266494']


In [136]:
# State shared with parsejson(): the running label, the ID of the text being
# parsed, and the keys copied from nodes whose "strict" value is "1".
parameters = {"label": None, "id_text": None, "dollar_keys" : ["extent", "scope", "state"]}
for text_id in nere_jsons:
    parameters["id_text"] = text_id
    # Start every text without a label so the last label of the previous text
    # cannot leak into the first words of this one.
    parameters["label"] = None
    # parsejson() appends to these module-level lists, so they are reset per text.
    lemm_l = []
    text_words = []
    text_lemma = []
    text_normalised = []
    
    try:
        text_analysed = parsejson(nere_jsons[text_id], parameters=parameters) 
    except Exception as error:
        # TODO: find out the problems with these texts!
        # Catch Exception (not a bare except) so the loop can still be
        # interrupted, and show the cause to help with the TODO above.
        print('ERROR with a text:', text_id, repr(error))
    
    print (text_words)
    print(text_lemma)
    print(text_normalised)
    
    # for lemma in lemm_l:
    #     #print(lemma.keys())
    #     print(lemma['form'])
    #     print(lemma['cf'])
    #     #print(lemma['gw'])
    #     #print(lemma['sense'])
    #     print(lemma['norm'])
        

['{d}nanna', 'lugal-a-ni-ir', 'ku-ri-gal-zu', 'šagina', '{d}en-lil₂', 'e₂-kiš-nu-ŋal₂', 'e₂', 'niŋ₂', 'ud', 'ul-li₂-a-ta', 'hu-mu-un-du₃', 'ki-bi-še₃', 'he₂-bi₂-gi₄']
['Nanna', 'lugal', 'Kurigalzu', 'šakkanak', 'Enlil', 'Ekišnuŋal', 'e', 'niŋ', 'ud', 'ul', 'du', 'ki', 'gi']
['UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN']
['{d}nanna', 'lugal-a-ni-ir', 'ku-ri-gal-zu', 'šagina', '{d}en-lil₂', 'e₂-kiš-nu-ŋal₂', 'e₂', 'niŋ₂', 'ud', 'ul-li₂-a-ta', 'šub-bu-de₃', 'hu-mu-du₃', 'ki-bi-še₃', 'he₂-bi₂-gi']
['Nanna', 'lugal', 'Kurigalzu', 'šakkanak', 'Enlil', 'Ekišnuŋal', 'e', 'niŋ', 'ud', 'ul', 'šub', 'du', 'ki', 'gi']
['UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN']
['{d}nanna', 'lugal-a-ni-ir', 'ku-ri-gal-zu', 'šagina', '{d}en-lil₂', 'e₂-kiš-nu-ŋal₂', 'e₂', 'ki-aŋ₂-a-ni', 'hu-mu-un-du₃']
['Nanna',