In [2]:
import xmltodict, json, os, re, sys
import http.client, urllib.parse, uuid
import numpy as np
from googletrans import Translator
import time

In [3]:
# Every data path used further down is relative, so the kernel must run from
# the project root. The root can be overridden with the
# TELEGRAM_CLUSTERING_ROOT environment variable; the default is the original
# author's local checkout, kept so existing runs behave the same.
PROJECT_ROOT = os.environ.get('TELEGRAM_CLUSTERING_ROOT',
                              '/home/vvs/Documents/Telegram clustering')
os.chdir(PROJECT_ROOT)

In [3]:
def process_file(id, filename_prefix):
    """Parse one annotation/text file pair into a list of raw dialogue records.

    Reads ``<filename_prefix>.aa`` (XML annotations, parsed with xmltodict)
    and the first line of ``<filename_prefix>.ac`` (the text that the
    annotation character offsets index into).

    Args:
        id: Value stored under the ``"id"`` key of every returned dialogue.
            (The name shadows the builtin; it is kept for compatibility.)
        filename_prefix: Path of the file pair without the extension.

    Returns:
        A list with one dict per ``Dialogue`` unit, each holding
        ``"start"``/``"end"`` (character offsets), ``"edus"`` (unit id ->
        unit record with ``id``, ``type``, ``text``, ``start``, ``end``),
        ``"cdus"`` (schema id -> list of member ids), ``"relations"``
        (list of ``{"type", "x", "y"}`` with unit/schema ids) and ``"id"``.

    Raises:
        Warning: If a unit is not enclosed by any ``Dialogue`` span.
        KeyError, IndexError: If a schema or relation refers to an id that
            could not be assigned to a dialogue, or a schema has no usable
            members.
    """
    # Context managers make sure both handles are closed even on parse errors.
    with open("%s.aa" % filename_prefix) as f_annotation:
        annotations = xmltodict.parse(f_annotation.read())["annotations"]

    units = annotations["unit"]
    relations = annotations["relation"] if "relation" in annotations else []
    schema = annotations["schema"] if "schema" in annotations else []

    # A single child element is parsed as a dict rather than a one-item list,
    # so normalise all three collections the same way.
    if not isinstance(units, list):
        units = [units]
    if not isinstance(schema, list):
        schema = [schema]
    if not isinstance(relations, list):
        relations = [relations]

    with open("%s.ac" % filename_prefix) as f_discourse:
        discourse = f_discourse.readline()
    # Blank out non-ASCII characters one-for-one so that the annotation
    # offsets keep pointing at the same positions.
    discourse = "".join(ch if ord(ch) < 128 else " " for ch in discourse)

    edus, buf_dialogues = {}, {}

    for item in units:
        _id = item["@id"]
        start = int(item["positioning"]["start"]["singlePosition"]["@index"])
        end = int(item["positioning"]["end"]["singlePosition"]["@index"])
        _type = item["characterisation"]["type"]
        if _type in ["Turn", "NonplayerTurn"]:
            continue
        elif _type == "Dialogue":
            buf_dialogues[_id] = {
                "start": start,
                "end": end,
                "edus": {},
                "cdus": {},
                "relations": []
            }
        else:
            edus[_id] = {
                "id": _id,
                "type": _type,
                "text": discourse[start:end],
                "start": start,
                "end": end
            }

    # Attach every unit to the first dialogue whose span encloses it.
    belong_to = {}
    for id_edu, edu in edus.items():
        for id_dialogue, dialog in buf_dialogues.items():
            if dialog["start"] <= edu["start"] and dialog["end"] >= edu["end"]:
                dialog["edus"][id_edu] = edu
                belong_to[id_edu] = id_dialogue
                break
        else:
            raise Warning("Dialogue not found")

    for item in schema:
        _id = item["@id"]
        positioning = item["positioning"]
        if positioning is None:
            continue

        cdu = []
        if "embedded-unit" in positioning:
            embedded_units = positioning["embedded-unit"]
            if not isinstance(embedded_units, list):
                embedded_units = [embedded_units]
            # Keep only members that are known units. Filtering into a new
            # list (instead of calling remove() while iterating) guarantees
            # that consecutive unknown ids are all dropped.
            cdu = [unit["@id"] for unit in embedded_units if unit["@id"] in edus]
        if "embedded-schema" in positioning:
            embedded_schemas = positioning["embedded-schema"]
            if not isinstance(embedded_schemas, list):
                embedded_schemas = [embedded_schemas]
            cdu += [unit["@id"] for unit in embedded_schemas]
        # A schema lives in the dialogue of its first member.
        belong_to[_id] = belong_to[cdu[0]]
        buf_dialogues[belong_to[_id]]["cdus"][_id] = cdu

    for item in relations:
        x = item["positioning"]["term"][0]["@id"]
        y = item["positioning"]["term"][1]["@id"]
        _type = item["characterisation"]["type"]
        # A relation is filed under the dialogue of its source endpoint.
        buf_dialogues[belong_to[x]]["relations"].append({
            "type": _type,
            "x": x,
            "y": y
        })

    dialogues = []
    for dialogue in buf_dialogues.values():
        dialogue["id"] = id
        dialogues.append(dialogue)
    return dialogues
        
def process_dialogue(dialogue):
    """Turn a raw dialogue record into a compact, index-based representation.

    Args:
        dialogue: Dict with the keys ``"id"``, ``"edus"`` (unit id -> record
            with ``id``, ``type``, ``text``, ``start``, ``end``), ``"cdus"``
            (schema id -> list of member ids) and ``"relations"`` (list of
            ``{"type", "x", "y"}`` whose endpoints are unit or schema ids).

    Returns:
        ``{"id", "edus", "relations"}`` where ``"edus"`` is the list of
        non-paragraph units ordered by start offset (each reduced to
        ``speaker`` and ``text``) and every relation endpoint is replaced by
        the position of its head unit in that list.

    Side effects:
        Adds ``"speaker"`` to the non-paragraph unit records and stores the
        ordered list under ``dialogue["edu_list"]``. Relation endpoints in
        the input are left untouched, so calling the function again on the
        same dialogue yields the same result.

    Raises:
        Warning: If a schema has no member without an incoming relation.
        KeyError: If an endpoint cannot be resolved, or a unit is not
            enclosed by any paragraph (no speaker is known for it).
    """
    def parse_speaker(text):
        # The speaker is taken to be the third whitespace-separated token of
        # the enclosing paragraph's text.
        return (text.split())[2]

    edus = dialogue["edus"]
    relations = dialogue["relations"]

    # Ids that are the target of at least one relation.
    has_incoming = {relation["y"] for relation in relations}

    paragraphs = [unit for unit in edus.values() if unit["type"] == "paragraph"]
    for edu in edus.values():
        if edu["type"] == "paragraph":
            continue
        # No early exit: if several paragraphs enclose the unit, the last
        # one in iteration order determines the speaker.
        for para in paragraphs:
            if para["start"] <= edu["start"] and para["end"] >= edu["end"]:
                edu["speaker"] = parse_speaker(para["text"])

    dialogue["edu_list"] = sorted(
        (unit for unit in edus.values() if unit["type"] != "paragraph"),
        key=lambda edu: edu["start"],
    )
    idx = {edu["id"]: i for i, edu in enumerate(dialogue["edu_list"])}

    def get_head(x):
        # A plain unit is its own head; for a schema, descend into the first
        # member that has no incoming relation.
        if x in edus:
            return x
        for du in dialogue["cdus"][x]:
            if du not in has_incoming:
                return get_head(du)
        raise Warning("Can't find the recursive head")

    # Resolve endpoints into fresh dicts instead of overwriting the input
    # relations; overwriting made a second call fail on the integer indices.
    relations_cleaned = [
        {
            "type": relation["type"],
            "x": idx[get_head(relation["x"])],
            "y": idx[get_head(relation["y"])],
        }
        for relation in relations
    ]
    edus_cleaned = [
        {"speaker": edu["speaker"], "text": edu["text"]}
        for edu in dialogue["edu_list"]
    ]

    return {
        "id": dialogue["id"],
        "edus": edus_cleaned,
        "relations": relations_cleaned,
    }

In [4]:
def clean_dialogues(imput_dir, output = None,return_dialog = False):
    """Collect and clean every dialogue found under a two-level corpus tree.

    Looks for ``<imput_dir>/<dir>/<subdir>/discourse/GOLD/*.ac`` files, runs
    ``process_file`` on each file prefix and ``process_dialogue`` on every
    dialogue it yields.

    Args:
        imput_dir: Root directory of the corpus (with or without a trailing
            path separator).
        output: Optional path; when given, the cleaned dialogues are written
            there as JSON.
        return_dialog: When true, return the cleaned dialogues.

    Returns:
        The list of cleaned dialogues if ``return_dialog`` is true,
        otherwise ``None``.
    """
    dialogues = []

    # Directory listings are sorted so the dialogue order is reproducible
    # across runs and machines.
    for directory in sorted(os.listdir(imput_dir)):
        first_level = os.path.join(imput_dir, directory)
        if not os.path.isdir(first_level):
            continue  # stray file next to the corpus folders
        for directory2 in sorted(os.listdir(first_level)):
            path = os.path.join(first_level, directory2, "discourse", "GOLD")
            if not os.path.isdir(path):
                continue
            for filename in sorted(os.listdir(path)):
                # Only real ".ac" files; a substring match would also pick
                # up unrelated names that merely contain "<char>ac".
                if not filename.endswith(".ac"):
                    continue
                stem = filename[:-len(".ac")]
                # The id is the part of the file name before the first "_".
                _id = stem.partition("_")[0]
                dialogues += process_file(_id, os.path.join(path, stem))

    dialogues_cleaned = [process_dialogue(dialogue) for dialogue in dialogues]

    if output is not None:
        with open(output, "w") as fout:
            fout.write(json.dumps(dialogues_cleaned))
    print("%d dialogues" % len(dialogues_cleaned))

    return dialogues_cleaned if return_dialog else None

In [5]:
def ru_translate(txt, attempt = 50, sourse_ln = 'en', destination_ln = 'ru', pause = None):
    try:
        translator = Translator()
        eng_word = re.compile(r'[\w]{2,}')
        output = translator.translate(txt,src=sourse_ln, dest= destination_ln).text
        counter = 0
        if eng_word.match(output):
            while translator.translate(output).src ==sourse_ln:
                output = translator.translate(output, src = sourse_ln, dest = destination_ln).text
                counter += 1
                if counter > attempt:
                    return output
        return output
    except:
        if pause == None:
            time.sleep(5)
            print(f'waiting 5 sec,  text:{txt}')
            pause = 60
        elif pause ==60:
            time.sleep(60)
            print(f'waiting 60 sec,  text:{txt}')
            pause = 120
        elif pause ==120:
            time.sleep(120)
            print(f'waiting 120 sec,  text:{txt}')
            pause = 555
        else:
            print(f'return {txt}')
            return txt
        output = ru_translate(txt, 
                              attempt = attempt, 
                              sourse_ln = sourse_ln, 
                              destination_ln = destination_ln, 
                              pause = pause)
        return output
            

def dialog_translator(dialog, 
                      saving_file = None, 
                      srs_ln = 'en', 
                      dest_ln = 'ru', 
                      translation_attempt = 50,
                      return_dialog = False):
    """Translate the ``text`` of every unit in a list of cleaned dialogues.

    Args:
        dialog: List of dialogue dicts, each with an ``"edus"`` list whose
            items carry a ``"text"`` entry.
        saving_file: Optional path; when given, the translated dialogues are
            written there as UTF-8 JSON (non-ASCII characters unescaped).
        srs_ln: Source language code passed to ``ru_translate``.
        dest_ln: Destination language code passed to ``ru_translate``.
        translation_attempt: Attempt limit passed to ``ru_translate``.
        return_dialog: When true, return the translated dialogues.

    Returns:
        The list of translated dialogues if ``return_dialog`` is true,
        otherwise ``None``. The input ``dialog`` is not modified.
    """
    dialogues_test_cleaned_tr = []

    for game_test in dialog:
        # Build new dialogue and unit dicts so the caller's source-language
        # data is not overwritten with the translation.
        game_translated = dict(game_test)
        game_translated['edus'] = [
            dict(fraze, text=ru_translate(fraze['text'], translation_attempt, srs_ln, dest_ln))
            for fraze in game_test['edus']
        ]
        dialogues_test_cleaned_tr.append(game_translated)

    if saving_file is not None:
        print('print json...')
        with open(saving_file, "w", encoding='utf-8') as fout:
            fout.write(json.dumps(dialogues_test_cleaned_tr,ensure_ascii=False,))

    print("%d dialogues translated" % len(dialogues_test_cleaned_tr))
    return dialogues_test_cleaned_tr if return_dialog else None

In [239]:
# Train split: clean the raw annotations, store them as JSON, then translate
# the cleaned dialogues and store the translated copy as well.
# All paths are relative to the current working directory.
input_dir_train = './data/external/chinadataset/train/data/'
output_file_train = './data/processed/chinadataset/train_spect.json'
output_file_train_ru = './data/processed/chinadataset/train_spect_ru.json'

dialogues_cleaned_train = clean_dialogues(
    imput_dir=input_dir_train,
    output=output_file_train,
    return_dialog=True,
)
dialogues_cleaned_train_ru = dialog_translator(
    dialog=dialogues_cleaned_train,
    saving_file=output_file_train_ru,
    return_dialog=True,
)


1086 dialogues
waiting 5 sec,  text:But it's not imperative.
print json...
1086 dialogues translated


In [238]:
# Test split: clean the raw annotations, store them as JSON, then translate
# the cleaned dialogues and store the translated copy as well.
# All paths are relative to the current working directory.
input_dir_test = './data/external/chinadataset/TEST_spect-stac-linguistic-2018-03-21/'
output_file_test = './data/processed/chinadataset/new_test_spect.json'
output_file_test_ru = './data/processed/chinadataset/new_test_spect_ru.json'

dialogues_cleaned_test = clean_dialogues(
    imput_dir=input_dir_test,
    output=output_file_test,
    return_dialog=True,
)
dialogues_cleaned_test_ru = dialog_translator(
    dialog=dialogues_cleaned_test,
    saving_file=output_file_test_ru,
    return_dialog=True,
)


111 dialogues
print json...
111 dialogues translated
