In [1]:
import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Token, Span, SpanGroup, Doc
from spacy.language import Language

import spacy_stanza

import json

import pandas as pd

from tqdm.notebook import tqdm

import pickle

In [2]:
import torch

# Ask torch once whether CUDA is usable and keep the answer for later cells.
has_gpu = torch.cuda.is_available()
if has_gpu:
    # Only request GPU allocation from spaCy when CUDA is actually there.
    spacy.prefer_gpu()

print(f"has gpu: {has_gpu}")

has gpu: False


In [3]:
# Annotation attributes registered on both tokens and spans, each defaulting
# to an empty list. force=True overwrites a previous registration, so this
# cell can be re-run in the same kernel.
# NOTE(review): the list defaults are mutable -- confirm against the spaCy
# docs whether in-place edits of an unset default can leak between objects.
for annotated_cls in (Token, Span):
    for attr_name in ('label_id', 'label_type', 'relation_label'):
        annotated_cls.set_extension(attr_name, default=[], force=True)

# Per-document news metadata; every field is None until assigned.
for attr_name in ('news_uid', 'news_title', 'news_url', 'paragraph_index'):
    Doc.set_extension(attr_name, default=None, force=True)

In [4]:
def error_handler(proc_name, proc, docs, e):
    """Print a diagnostic report for a failed pipeline component.

    Args:
        proc_name: Name of the component, shown in the headline.
        proc: The component object, printed after "Proc:".
        docs: The docs involved, printed after "Docs:".
        e: The error, printed after "Error:".

    Returns:
        None. The report is written to stdout and followed by a blank line.
    """
    report_lines = (
        f"An error occurred when applying component {proc_name}.",
        f"Docs: {docs}",
        f"Proc: {proc}",
        f"Error: {e}",
        "",  # empty entry yields the blank separator line after the report
    )
    print(*report_lines, sep="\n")

# Build the Stanza-backed spaCy pipeline for the zh-hant models.
pipeline = spacy_stanza.load_pipeline("xx", lang="zh-hant")

# Send component failures to error_handler, which prints a report.
pipeline.set_error_handler(error_handler)

# Keep the pipeline's vocab at hand; Docs are later rebuilt with it.
vocab = pipeline.vocab

2023-03-21 10:18:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-03-21 10:18:39 INFO: Loading these models for language: zh-hant (Traditional_Chinese):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2023-03-21 10:18:39 INFO: Using device: cpu
2023-03-21 10:18:39 INFO: Loading: tokenize
2023-03-21 10:18:39 INFO: Loading: pos
2023-03-21 10:18:39 INFO: Loading: lemma
2023-03-21 10:18:39 INFO: Loading: depparse
2023-03-21 10:18:39 INFO: Done loading processors!


In [5]:
pickle_dir = '.'
pickle_file = 'all_docs_stanza_opinion_extraction.pkl'

# SECURITY: pickle.load can execute arbitrary code while loading; only open
# pickle files that come from a trusted source.
with open(f"{pickle_dir}/{pickle_file}", 'rb') as pickle_fh:
    bytes_data = pickle.load(pickle_fh)

# bytes_data is walked as a collection of groups of serialized Docs; every
# entry is turned back into a Doc bound to the pipeline vocab.
all_docs_stanza = [
    [Doc(vocab).from_bytes(doc_bytes) for doc_bytes in docs]
    for docs in bytes_data
]

In [6]:
# Sanity check: show the span groups stored on the doc at index 1 of the first group.
all_docs_stanza[0][1].spans

{'ckip_ner': [黃偉哲, 今天, 台南市, 元旦升旗典禮, 范雲, 立法院, 范雲], 'opinion_label': [表示, 他, 尊重立委的提案權，有很多國家女性是不用服兵役，因為立法院是合議制，要經過多數立委同意才有辦法修法，對於范雲的主張，他表示尊重。], 'coreference_label': [他, 黃偉哲], 'opinion_found[0]': [他, 表示, ，有很多國家女性是不用服兵役，因為立法院是合議制，要經過多數立委同意才有辦法修法，對於范雲的主張，他表示尊重], 'opinion_found': [他, 表示, ，有很多國家女性是不用服兵役，因為立法院是合議制，要經過多數立委同意才有辦法修法，對於范雲的主張，他表示尊重], 'pronounce_in_label': [他]}

# Write opinion extraction results to the database

In [7]:
from postgresql.connection import Connection

In [8]:
# Sanity check: show the paragraph_index extension value of the very first doc.
all_docs_stanza[0][0]._.paragraph_index

0

In [9]:
import uuid
import random
import time

In [14]:
table_name = 'opinion_extract_200_news'

# Span label -> record field that collects the text of spans carrying that label.
# Spans with any other label are ignored.
span_label_to_field = {
    'OPINION_SRC_match': 'OPINION_SRCs',
    'OPINION_OPR_match': 'OPINION_OPRs',
    'OPINION_SEG_match': 'OPINION_SEGs',
}

for docs in all_docs_stanza:
    for doc in docs:
        # Opinions are read from consecutively numbered span groups
        # ("opinion_found[0]", "opinion_found[1]", ...); stop at the first
        # index that has no group.
        opinion_index = 0
        group_key = f"opinion_found[{opinion_index}]"
        while group_key in doc.spans:
            # NOTE(review): uuid1 is time/host based -- confirm that is the
            # intended id scheme for opinion_uid.
            opinion_uid = str(uuid.uuid1())
            # NOTE(review): the group id is an unseeded random value in 1..10,
            # so it differs between runs -- presumably a placeholder; confirm.
            opinion_group_id = str(random.randint(1, 10))

            opinion_data = {
                'opinion_uid': opinion_uid,
                'news_uid': doc._.news_uid,
                'paragraph_index': doc._.paragraph_index,
                'opinion_index_in_paragraph': opinion_index,
                'OPINION_SRCs': [],
                'OPINION_OPRs': [],
                'OPINION_SEGs': [],
                'OPINION_SRC_resolution': None,
                'OPINION_SRC_name_found': None,
                'opinion_group_id': opinion_group_id,
                'opinion_group_show': True,
            }

            # Sort each span's text into the field that matches its label.
            for span in doc.spans[group_key]:
                target_field = span_label_to_field.get(span.label_)
                if target_field is not None:
                    opinion_data[target_field].append(span.text)

            print(opinion_data)
            Connection.create_opinion_extraction_result(opinion_data, table_name)

            opinion_index += 1
            group_key = f"opinion_found[{opinion_index}]"

{'opinion_uid': '0e1cf292-c78f-11ed-b622-acde48001122', 'news_uid': '2033c500-8978-11ed-ade1-69f5fa58c5e9', 'paragraph_index': 0, 'opinion_index_in_paragraph': 0, 'OPINION_SRCs': ['台南市長黃偉哲'], 'OPINION_OPRs': ['表示'], 'OPINION_SEGs': ['，希望多所考量，並多徵詢各方意見。'], 'OPINION_SRC_resolution': None, 'OPINION_SRC_name_found': None, 'opinion_group_id': '10', 'opinion_group_show': True}
{'opinion_uid': '10c1f77c-c78f-11ed-b622-acde48001122', 'news_uid': '2033c500-8978-11ed-ade1-69f5fa58c5e9', 'paragraph_index': 1, 'opinion_index_in_paragraph': 0, 'OPINION_SRCs': ['他'], 'OPINION_OPRs': ['表示'], 'OPINION_SEGs': ['，有很多國家女性是不用服兵役，因為立法院是合議制，要經過多數立委同意才有辦法修法，對於范雲的主張，他表示尊重'], 'OPINION_SRC_resolution': None, 'OPINION_SRC_name_found': None, 'opinion_group_id': '9', 'opinion_group_show': True}
{'opinion_uid': '132f0112-c78f-11ed-b622-acde48001122', 'news_uid': '84218a8c-8980-11ed-ac07-e71f0c693353', 'paragraph_index': 0, 'opinion_index_in_paragraph': 0, 'OPINION_SRCs': ['蔡'], 'OPINION_OPRs': ['揭示'], 'OPINION_SEGs'