In [1]:
# for colab
# !pip install spacy_stanza
# !pip install ckip_transformers

# 1. Import packages

In [2]:
import json
import pandas as pd
import tqdm
import pickle

# 2. Convert the Label Studio JSON export to the pipeline input format

In [3]:
# Directory and file name of the Label Studio export.
# On Colab, point label_json_dir at "/content" instead.
label_json_dir = "."
label_json_file = "label_news_50_test.json"

# Parse the whole export into plain Python objects.
label_json_path = f"{label_json_dir}/{label_json_file}"
with open(label_json_path, mode="r", encoding="utf8") as f:
    label_studio_json = json.load(f)

In [4]:
def get_paragraphs_list(label_studio_json) -> list[list[str]]:
    """Collect the paragraph texts of every news item in a Label Studio export.

    Returns one inner list per news item, holding the ``'text'`` value of each
    entry under ``item['data']['paragraphs']`` in its original order.
    """
    return [
        [paragraph['text'] for paragraph in task['data']['paragraphs']]
        for task in label_studio_json
    ]

def get_paragraphlabels_list(label_studio_json)->list[dict]:
    """Group each news item's ``paragraphlabels`` annotations by paragraph.

    Only the first entry of ``item['annotations']`` is read. The result holds
    one dict per news item, mapping ``annotation['value']['start']`` to the
    list of annotations that share that start value, in export order.
    Annotations of any other type are ignored.
    """
    grouped_per_news = []
    for task in label_studio_json:
        by_start = {}
        for result in task['annotations'][0]['result']:
            if result['type'] != "paragraphlabels":
                continue
            by_start.setdefault(result['value']['start'], []).append(result)
        grouped_per_news.append(by_start)
    return grouped_per_news

def get_relations_list(label_studio_json)->list[dict]:
    """Group each news item's relation annotations by source paragraph.

    Only the first entry of ``item['annotations']`` is read. For every
    ``relation`` result, the paragraph indices of its two endpoints are looked
    up from the ``paragraphlabels`` results with matching ``id`` and stored
    under ``'from_id_paragraph_index'`` and ``'to_id_paragraph_index'``.

    Args:
        label_studio_json: Iterable of Label Studio task dicts.

    Returns:
        One dict per news item, mapping the ``from_id`` paragraph index
        (``value['start']`` of the source label) to the list of enriched
        relation dicts that start in that paragraph.

    Raises:
        KeyError: If a relation references an id that has no
            ``paragraphlabels`` result in the same annotation.
    """
    relations_list = []
    for news_label in label_studio_json:
        results = news_label['annotations'][0]['result']

        # First pass: index every labelled region by id, so a relation can be
        # resolved even when it is listed before the regions it refers to.
        paragraph_index_by_id = {
            annotation['id']: annotation['value']['start']
            for annotation in results
            if annotation['type'] == "paragraphlabels"
        }

        # Second pass: enrich and group the relations.
        relation_dict = {}
        for annotation in results:
            if annotation['type'] != "relation":
                continue
            # Build a shallow copy instead of writing the extra keys into the
            # caller's export data.
            relation = {
                **annotation,
                'from_id_paragraph_index': paragraph_index_by_id[annotation['from_id']],
                'to_id_paragraph_index': paragraph_index_by_id[annotation['to_id']],
            }
            relation_dict.setdefault(relation['from_id_paragraph_index'], []).append(relation)
        relations_list.append(relation_dict)
    return relations_list

def get_title_list(label_studio_json)->list[str]:
    """Return the ``data['title']`` value of every news item, in export order."""
    return [task['data']['title'] for task in label_studio_json]

def get_uid_list(label_studio_json)->list[str]:
    """Return the ``data['uid']`` value of every news item, in export order."""
    return [task['data']['uid'] for task in label_studio_json]

def get_url_list(label_studio_json)->list[str]:
    """Return the ``data['url']`` value of every news item, in export order."""
    return [task['data']['url'] for task in label_studio_json]

# 3. Pipeline step to add span and relation label results

In [5]:
def add_span_and_relation_anotation(doc, paragraph_index, paragraphlabels_dict, relations_dict):
    """Attach Label Studio span labels and relations to one paragraph ``doc``.

    For every label of the paragraph a span is created with
    ``doc.char_span(..., alignment_mode='expand')``. The label id and type are
    appended to the span's and its tokens' ``_.label_id`` / ``_.label_type``,
    and every relation touching the span is appended to ``_.relation_label``.
    Spans whose type contains ``"OPINION"`` are stored in
    ``doc.spans["opinion_label"]``; ``"Coreference"`` and ``"Pronoun"`` spans
    go to ``doc.spans["coreference_label"]``.

    Assumes the ``label_id``, ``label_type`` and ``relation_label`` extensions
    are already registered on spans and tokens with list values (they are not
    registered in this function) -- TODO confirm against the pipeline setup.

    Args:
        doc: The parsed paragraph.
        paragraph_index: Index of the paragraph; converted to ``str`` because
            the two dicts are looked up with string keys.
        paragraphlabels_dict: Maps paragraph index to its label annotations.
        relations_dict: Maps paragraph index to the relations that are looked
            up for that paragraph.

    Returns:
        The same ``doc``. It is returned unchanged when the paragraph has no
        labels.
    """
    import warnings

    paragraph_index = str(paragraph_index)
    if paragraph_index not in paragraphlabels_dict:
        return doc

    if paragraph_index in relations_dict:
        paragraph_relations = relations_dict[paragraph_index]
    else:
        paragraph_relations = []

    label_span_list_opinion = []
    label_span_list_coreference = []

    for paragraphlabel in paragraphlabels_dict[paragraph_index]:
        label_type = paragraphlabel['value']['paragraphlabels'][0]
        label_id = paragraphlabel['id']
        startOffset = int(paragraphlabel['value']['startOffset'])
        endOffset = int(paragraphlabel['value']['endOffset'])

        label_span = doc.char_span(startOffset, endOffset, label=label_type, alignment_mode='expand')
        if label_span is None:
            # char_span yields None when the offsets cannot be mapped onto
            # tokens; report it instead of failing on `None._`.
            warnings.warn(
                f"Skipping label {label_id!r}: characters {startOffset}-{endOffset} "
                f"do not map to a span in paragraph {paragraph_index}."
            )
            continue

        label_span._.label_id.append(label_id)
        label_span._.label_type.append(label_type)

        if "OPINION" in label_type:
            label_span_list_opinion.append(label_span)
        elif label_type in ["Coreference", "Pronoun"]:
            label_span_list_coreference.append(label_span)

        for relation in paragraph_relations:
            # `_.label_id` is a list, so the direction has to be decided with
            # a membership test; comparing the list to the id string is never
            # true and would mark every relation as "in".
            is_source = relation['from_id'] in label_span._.label_id
            if is_source or relation['to_id'] in label_span._.label_id:
                relation_label = {
                    "from_id": relation['from_id'],
                    "to_id": relation['to_id'],
                    "from_id_paragraph_index": relation['from_id_paragraph_index'],
                    "to_id_paragraph_index": relation['to_id_paragraph_index'],
                    "relation_label": relation['labels'][0] if len(relation['labels']) > 0 else "MISSING",
                    "direction": relation['direction'],
                    "in_out": "out" if is_source else "in",
                }
                label_span._.relation_label.append(relation_label)

        # Mirror the span-level annotation onto each token of the span.
        for token in label_span:
            token._.label_id.append(label_id)
            token._.label_type.append(label_type)
            token._.relation_label.extend(label_span._.relation_label)

    doc.spans["opinion_label"] = label_span_list_opinion
    doc.spans["coreference_label"] = label_span_list_coreference

    return doc

In [6]:
# Run every extractor once over the export; each result has one entry per news item.
paragraphs_list = get_paragraphs_list(label_studio_json)
relations_list = get_relations_list(label_studio_json)
paragraphlabels_list = get_paragraphlabels_list(label_studio_json)
tltle_list = get_title_list(label_studio_json)
uid_list = get_uid_list(label_studio_json)
url_list = get_url_list(label_studio_json)

# Sanity check: all extracted lists must describe the same number of news items.
expected_count = len(paragraphs_list)
assert all(
    len(extracted) == expected_count
    for extracted in (relations_list, paragraphlabels_list, tltle_list, uid_list, url_list)
)

all_docs = [] # for all news
print(expected_count)

50


## 3.3 Define workflow

In [9]:
def run_work_flow(label_studio_json, spacy_pipeline, file_name, n_process=1):
    """Run the pipeline over every news item, annotate the docs and save them.

    Each news item's paragraphs are processed with ``spacy_pipeline.pipe``.
    Paragraphs that have labels are enriched by
    ``add_span_and_relation_anotation``. Every doc also receives the
    ``_.news_uid``, ``_.news_title``, ``_.news_url`` and ``_.paragraph_index``
    attributes. The result is serialized with ``doc.to_bytes()`` and pickled
    to ``"{file_name}.pkl"``.

    Args:
        label_studio_json: Parsed Label Studio export (iterable of task dicts).
        spacy_pipeline: Object exposing ``pipe(texts, n_process=...)``.
        file_name: Output path without the ``.pkl`` suffix.
        n_process: Forwarded to ``spacy_pipeline.pipe``.

    Returns:
        A list with one inner list of docs per news item.
    """
    # A bare `import tqdm` does not load the `tqdm.notebook` submodule, so
    # `tqdm.notebook.tqdm` only resolves if some other package imported it
    # first. Importing it here removes that hidden dependency.
    from tqdm.notebook import tqdm as notebook_tqdm

    paragraphs_list = get_paragraphs_list(label_studio_json)
    relations_list = get_relations_list(label_studio_json)
    paragraphlabels_list = get_paragraphlabels_list(label_studio_json)
    title_list = get_title_list(label_studio_json)
    uid_list = get_uid_list(label_studio_json)
    url_list = get_url_list(label_studio_json)
    assert len(paragraphs_list) == len(relations_list) == len(paragraphlabels_list) == len(title_list) == len(uid_list) == len(url_list)

    all_docs = [] # for all news

    with notebook_tqdm(total=len(paragraphs_list)) as pbar:
        for paragraphs, paragraphlabel, relations, title, news_uid, url in zip(paragraphs_list, paragraphlabels_list, relations_list, title_list, uid_list, url_list):
            # Materialize the docs of this news item before annotating them.
            docs = list(spacy_pipeline.pipe(paragraphs, n_process=n_process))

            for paragraph_idx, doc in enumerate(docs):
                if str(paragraph_idx) in paragraphlabel: # if there is no label, skip
                    doc = add_span_and_relation_anotation(doc, paragraph_idx, paragraphlabel, relations)

                doc._.news_uid = news_uid
                doc._.news_title = title
                doc._.news_url = url
                doc._.paragraph_index = paragraph_idx

            all_docs.append(docs)
            pbar.update(1)

    with open(f"{file_name}.pkl", "wb") as f:
        # NOTE(review): the recorded notebook output shows a warning pointing
        # at this serialization line; its cause is not visible here -- confirm.
        bytes_data = [[doc.to_bytes() for doc in docs] for docs in all_docs]
        pickle.dump(bytes_data, f)

    return all_docs

# 4. Pipeline with Stanza GSD tokenizer

In [10]:
# Build the project pipeline and run the whole workflow with it.
import pathlib
import sys

# Make the directory three levels above the current working directory
# importable; this must happen before the `spacy_pipeline` import below.
spacy_pipeline_parent_path = pathlib.Path.cwd().parent.parent.parent
sys.path.append(str(spacy_pipeline_parent_path))

# Project-local package, resolved through the sys.path entry added above.
from spacy_pipeline import pipeline_setup

# NOTE(review): presumably returns a spaCy pipeline with a Stanza tokenizer
# (per the variable name and the logged output) -- confirm in pipeline_setup.
pipeline_stanza_tokenizer = pipeline_setup.get_pipeline()

# Processes every news item and writes "label_news_50_test_docs_stanza.pkl".
all_docs_stanza = run_work_flow(label_studio_json, pipeline_stanza_tokenizer, "label_news_50_test_docs_stanza", n_process=1)

2023-03-26 13:09:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-03-26 13:09:18 INFO: Loading these models for language: zh-hant (Traditional_Chinese):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2023-03-26 13:09:18 INFO: Using device: cpu
2023-03-26 13:09:18 INFO: Loading: tokenize
2023-03-26 13:09:18 INFO: Loading: pos
2023-03-26 13:09:18 INFO: Loading: lemma
2023-03-26 13:09:18 INFO: Loading: depparse
2023-03-26 13:09:19 INFO: Done loading processors!


['ckip_pos', 'ckip_ner']
[1m

#   Component   Assigns   Requires   Scores   Retokenizes
-   ---------   -------   --------   ------   -----------
0   ckip_pos                                  False      
                                                         
1   ckip_ner                                  False      

[38;5;2m✔ No problems found.[0m
{'summary': {'ckip_pos': {'assigns': [], 'requires': [], 'scores': [], 'retokenizes': False}, 'ckip_ner': {'assigns': [], 'requires': [], 'scores': [], 'retokenizes': False}}, 'problems': {'ckip_pos': [], 'ckip_ner': []}, 'attrs': {}}


  0%|          | 0/50 [00:00<?, ?it/s]

  bytes_data = [[doc.to_bytes() for doc in docs] for docs in all_docs]


In [11]:
# Display the workflow result: one inner list of paragraph docs per news item.
all_docs_stanza

[[高市議會今召開第四屆第一次臨時會，市長陳其邁循例率領市府團隊拜會各黨團、爭取對總預算的支持；不少議員對誰接高市副市長感到好奇。國民黨議員陸淑美公開舉薦民進黨議員鄭光峰，笑說這樣我們舉手比較不會輸！,
  高市議會第四屆第一次臨時會主要是審計今年度總預算，議長康裕成特別說明，現在才審查今年度總預算是「慣例」，主要是尊重新一屆議員，由最新民意來監督市政。,
  陳其邁上午循例帶隊拜會各黨團與正、副議長康裕成與曾俊傑，除了向第四屆議員致意，同時也專程向第三屆議長曾麗燕、副議長陸淑美致謝，感謝兩位在前兩年協助推動市政。,
  部分議員們則對誰接任副市長感到好奇，前副議長、國民黨議員陸淑美當面向陳其邁舉薦，應讓民進黨議員鄭光峰接副市長，這樣我們（藍營）舉手比較不會輸；議會最資深、民進黨議員李喬如則說，千萬不要找她當副市長，寧可監督市政，不想被監督。,
  高市議會說明，第四屆各黨團已成立，國民黨團28人、民進黨團32人、無黨團結聯盟4人。],
 [陸軍航特部「112-1空用武器射擊」昨日實施第二天操演，由陸航部隊配合想定執行訓練，針對靶區進行射擊，展現空中、地面精準打擊戰力。,
  根據軍聞社及青年日報報導，陸軍航空特戰指揮部15、16日持續在恆春三軍聯訓基地進行「112-1空用武器基礎射擊」，派遣AH-64E阿帕契攻擊直升機、OH-58D戰搜直升機、UH-60M黑鷹直升機對目標區射擊任務，射擊機砲及火箭彈等武器。,
  執行海神火箭及30鏈砲射擊任務的AH-64E攻擊直升機，不論在滯空或運動中進行射擊課目，火箭、砲彈均能精準擊中目標區。操演中也分別執行UH-60M通用直升機、特戰部隊射手及車裝T74機槍的實彈射擊；下午則由OH-58D戰搜直升機執行實彈射擊，達成驗證武器效能，強化地空整體作戰的任務目標。,
  火力強大且性能優異的AH-64E攻擊直升機，在上午的操演中，先後進行檢驗射擊、訓練射擊和鑑測射擊，並依據想定狀況的發布，發射海神火箭及30鏈砲，攻擊敵軍目標，訓練駕駛飛行技術、射手武器操作，及彼此的合作默契，經由縝密的規劃和管制，圓滿完成射擊任務。,
  601旅攻擊第1作戰隊處長林中校指出，基礎通用武器射擊是陸航部隊年度重要例行訓練，由於操演場地與駐地的氣候、風向有相當大的差異，飛行員在操演時必須保持航向和穩定。,
  「鏈砲射擊除了依SOP執行外，最重要的是射