# Import libs

In [1]:
import multiprocessing
import pandas as pd

from datasets import load_dataset, concatenate_datasets
from nltk.tree import Tree
from nltk.grammar import Nonterminal

from typing import Dict, Optional, List

from tqdm import tqdm

from copy import deepcopy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Raise pandas' display limits so the wide, transposed frequency tables shown
# later in the notebook are not truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Load dataset

In [3]:
# Load the 'english_v12' configuration of the 'conll2012_ontonotesv5' dataset,
# using the local '.cache' directory as cache_dir.
ontonotes = load_dataset(
    path='conll2012_ontonotesv5',
    name='english_v12',
    cache_dir='.cache',
)

In [4]:
ontonotes

DatasetDict({
    train: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 10539
    })
    validation: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 1370
    })
    test: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 1200
    })
})

In [5]:
# Merge the train / validation / test splits into a single Dataset.
#
# This cell rebinds `ontonotes` from a DatasetDict to a Dataset. Without the
# guard, executing the cell a second time would try to index the merged
# Dataset with split names and fail. DatasetDict is a dict subclass while
# Dataset is not, so the isinstance check distinguishes the two states.
if isinstance(ontonotes, dict):
    ontonotes = concatenate_datasets(
        [ontonotes[split] for split in ('train', 'validation', 'test')]
    )
ontonotes

Dataset({
    features: ['document_id', 'sentences'],
    num_rows: 13109
})

# Preprocess trees

## Extract trees

In [6]:
# Flatten documents into one list of sentences, keeping only the bracketed
# parse string and the token list, and skipping sentences without a parse.
ontn_sentc_tree = [
    {'parse_tree': sentence['parse_tree'], 'words': sentence['words']}
    for documents in tqdm(ontonotes)
    for sentence in documents['sentences']
    if sentence['parse_tree'] is not None
]

100%|██████████| 13109/13109 [00:21<00:00, 599.76it/s] 


In [7]:
len(ontn_sentc_tree)

137036

## Find rare top cases

In [8]:
# Tally the first production of every parse tree (the "TOP -> ..." rule).
top_prods_dict = {}
for elem in tqdm(ontn_sentc_tree):
    first_production = Tree.fromstring(elem['parse_tree']).productions()[0]
    top_prod = str(first_production)
    top_prods_dict[top_prod] = top_prods_dict.get(top_prod, 0) + 1
top_prods_dict

100%|██████████| 137036/137036 [00:11<00:00, 11923.41it/s]


{'TOP -> SBARQ': 2238,
 'TOP -> S': 115580,
 'TOP -> NP': 3662,
 'TOP -> SINV': 2560,
 'TOP -> INTJ': 4711,
 'TOP -> SQ': 2233,
 'TOP -> X': 329,
 'TOP -> SBAR': 486,
 'TOP -> FRAG': 4065,
 'TOP -> VP': 63,
 'TOP -> ADVP': 229,
 'TOP -> PP': 218,
 'TOP -> ADJP': 212,
 'TOP -> WHADVP': 2,
 'TOP -> WHNP': 12,
 'TOP -> UCP': 135,
 'TOP -> META': 227,
 'TOP -> S .': 7,
 'TOP -> PRN': 31,
 'TOP -> LST': 10,
 'TOP -> .': 4,
 'TOP -> ,': 1,
 'TOP -> `` S': 15,
 'TOP -> PP .': 1,
 'TOP -> SINV .': 1,
 'TOP -> SQ .': 1,
 'TOP -> WHADJP': 3}

In [9]:
# Turn the tally into a table with absolute counts (N) and the share of all
# sentences (frac), ordered from most to least frequent.
top_prods = (
    pd.Series(top_prods_dict, name='N')
    .to_frame()
    .assign(frac=lambda frame: frame['N'] / len(ontn_sentc_tree))
    .sort_values('frac', ascending=False)
)
# Root productions covering more than 0.3% of the sentences are kept.
mask = top_prods['frac'] > 0.003
available_top_prods = top_prods.loc[mask].index.to_list()
print(available_top_prods)

['TOP -> S', 'TOP -> INTJ', 'TOP -> FRAG', 'TOP -> NP', 'TOP -> SINV', 'TOP -> SBARQ', 'TOP -> SQ', 'TOP -> SBAR']


In [10]:
available_top_prods

['TOP -> S',
 'TOP -> INTJ',
 'TOP -> FRAG',
 'TOP -> NP',
 'TOP -> SINV',
 'TOP -> SBARQ',
 'TOP -> SQ',
 'TOP -> SBAR']

## Delete rare top cases

In [11]:
print(len(ontn_sentc_tree))

137036


In [12]:
# Keep only the sentences whose first (root) production is one of the
# frequent ones selected above.
ontn_sentc_tree_prc0 = [
    elem
    for elem in tqdm(ontn_sentc_tree)
    if str(Tree.fromstring(elem['parse_tree']).productions()[0]) in available_top_prods
]

100%|██████████| 137036/137036 [00:11<00:00, 11941.07it/s]


In [13]:
print(len(ontn_sentc_tree_prc0))

135535


## Extract rare DT cases

In [14]:
def extract_all_spec_substring(s: str, start_token: str, end_token: str = ')') -> List[str]:
    """Collect every substring of ``s`` that runs from ``start_token`` to the next ``end_token``.

    For each occurrence of ``start_token`` (occurrences may overlap, because
    the scan resumes one character after the previous start), the substring
    from that start up to and including the first ``end_token`` found after
    the start token is returned.

    Args:
        s: Text to scan, e.g. a bracketed parse tree string.
        start_token: Prefix that opens a match, e.g. ``'(DT'``.
        end_token: Terminator that closes a match; defaults to ``')'``.

    Returns:
        The matched substrings in order of appearance. The list is empty when
        ``start_token`` never occurs or is never followed by ``end_token``.
    """
    results: List[str] = []
    search_from = 0
    while True:
        start_pos = s.find(start_token, search_from)
        if start_pos == -1:
            break
        end_pos = s.find(end_token, start_pos + len(start_token))
        if end_pos == -1:
            # No terminator after this start means none after any later start
            # either, so further scanning cannot produce a match.
            break
        # Include the whole terminator, not just its first character, so that
        # multi-character end tokens are returned intact.
        results.append(s[start_pos:end_pos + len(end_token)])
        search_from = start_pos + 1
    return results

In [15]:
# Frequency table of every "(DT <token>)" leaf found in the remaining trees.
dt_counts = {}
for sent in ontn_sentc_tree_prc0:
    for extraction in extract_all_spec_substring(sent['parse_tree'], '(DT'):
        dt_counts[extraction] = dt_counts.get(extraction, 0) + 1
dt_nt = (
    pd.Series(dt_counts, name='N')
    .to_frame()
    .sort_values('N', ascending=False)
)
display(dt_nt.T)
# Determiner leaves seen at most 22 times are marked for deletion.
del_dt = dt_nt.loc[dt_nt['N'] <= 22].index.to_list()

Unnamed: 0,(DT the),(DT a),(DT The),(DT this),(DT an),(DT that),(DT some),(DT all),(DT these),(DT This),(DT those),(DT no),(DT any),(DT A),(DT That),(DT another),(DT each),(DT every),(DT both),(DT Some),(DT These),(DT No),(DT All),(DT An),(DT Those),(DT Another),(DT Both),(DT Each),(DT Every),(DT either),(DT Any),(DT THE),(DT neither),(DT half),(DT Neither),(DT many),(DT ALL),(DT several),(DT AN),(DT such),(DT Either),(DT them),(DT ANY),(DT th-),(DT tha-),(DT he),(DT NO),(DT del),(DT la),(DT and),(DT Many),(DT new),(DT yours),(DT Anshe),(DT No.),(DT Half),(DT nary),(DT '99),(DT {the?}),(DT al),(DT {a}),(DT whatever),(DT Not),(DT most),(DT production),"(DT 30,000)",(DT comparatively),(DT THOSE),(DT BOTH),(DT 1/6),(DT ten),(DT a-),(DT the-),(DT an-),(DT &the),(DT tho-),(DT al-),(DT ●The),(DT 1/3),(DT i),(DT _that_),(DT EVERY),(DT thislast),(DT outlaws),(DT gold),(DT le),(DT anoth-),(DT Thomas),(DT China),(DT to),(DT email),(DT Dis),(DT Dat),(DT THAT),(DT now)
N,120502,41633,14776,9709,6644,6116,3454,3115,2410,2314,2240,2229,1989,1856,1484,1090,974,891,750,642,600,291,279,275,223,153,149,122,117,64,59,42,38,37,23,22,8,8,8,7,7,6,3,3,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


## Delete rare DT cases

In [16]:
len(ontn_sentc_tree_prc0)

135535

In [17]:
def has_substring(input_str: str, substrings: List[str]) -> bool:
    """Tell whether at least one entry of ``substrings`` occurs inside ``input_str``.

    Returns ``False`` for an empty ``substrings`` list.
    """
    for candidate in substrings:
        if candidate in input_str:
            return True
    return False

In [18]:
# Drop every sentence whose parse tree contains one of the rare determiner leaves.
ontn_sentc_tree_prc1 = [
    sent
    for sent in ontn_sentc_tree_prc0
    if not has_substring(sent['parse_tree'], del_dt)
]

In [19]:
len(ontn_sentc_tree_prc1)

135404

## Extract rare Punctuation

In [20]:
# Frequency table of every "(: <token>)" punctuation leaf.
pn_counts = {}
for sent in ontn_sentc_tree_prc1:
    for extraction in extract_all_spec_substring(sent['parse_tree'], '(:'):
        pn_counts[extraction] = pn_counts.get(extraction, 0) + 1
pn_nt = (
    pd.Series(pn_counts, name='N')
    .to_frame()
    .sort_values('N', ascending=False)
)
display(pn_nt.head(60).T)
# Punctuation leaves seen at most 289 times are marked for deletion.
del_pnct = pn_nt.loc[pn_nt['N'] <= 289].index.to_list()
del_pnct

Unnamed: 0,(: :),(: --),(: ;),(: -),(: ...),(: /),(: '),"(: ,)",(: ---),(: !),(: ：),(: ------)
N,3271,3271,2012,532,289,36,5,2,2,1,1,1


['(: ...)',
 '(: /)',
 "(: ')",
 '(: ,)',
 '(: ---)',
 '(: !)',
 '(: ：)',
 '(: ------)']

## Delete rare punctuation

In [21]:
len(ontn_sentc_tree_prc1)

135404

In [22]:
# Drop every sentence whose parse tree contains one of the rare ":" leaves.
ontn_sentc_tree_prc2 = [
    sent
    for sent in ontn_sentc_tree_prc1
    if not has_substring(sent['parse_tree'], del_pnct)
]

In [23]:
len(ontn_sentc_tree_prc2)

135096

## Extract rare comma

In [24]:
# Frequency table of every "(, <token>)" leaf.
comma_counts = {}
for sent in ontn_sentc_tree_prc2:
    for extraction in extract_all_spec_substring(sent['parse_tree'], '(,'):
        comma_counts[extraction] = comma_counts.get(extraction, 0) + 1
comma_nt = (
    pd.Series(comma_counts, name='N')
    .to_frame()
    .sort_values('N', ascending=False)
)
display(comma_nt.head(60).T)
# Comma-tagged leaves seen at most 316 times are marked for deletion.
del_comma = comma_nt.loc[comma_nt['N'] <= 316].index.to_list()
print(del_comma)

Unnamed: 0,"(, ,)","(, ;)","(, -)","(, .)","(, /)","(, ?)","(, !)","(, --)","(, :)","(, ...)","(, ---)","(, ??)","(, "")","(, television)","(, section)","(, 2)","(, ....)","(, ..)","(, !!!!)","(, Wa)","(, !!)","(, ?!)"
N,112168,316,292,151,82,63,33,19,14,7,4,3,2,1,1,1,1,1,1,1,1,1


['(, ;)', '(, -)', '(, .)', '(, /)', '(, ?)', '(, !)', '(, --)', '(, :)', '(, ...)', '(, ---)', '(, ??)', '(, ")', '(, television)', '(, section)', '(, 2)', '(, ....)', '(, ..)', '(, !!!!)', '(, Wa)', '(, !!)', '(, ?!)']


## Delete rare comma

In [25]:
len(ontn_sentc_tree_prc2)

135096

In [26]:
# Drop every sentence whose parse tree contains one of the rare "," leaves.
ontn_sentc_tree_prc3 = [
    sent
    for sent in ontn_sentc_tree_prc2
    if not has_substring(sent['parse_tree'], del_comma)
]

In [27]:
len(ontn_sentc_tree_prc3)

134405

## Extract rare dots

In [28]:
# Frequency table of every "(. <token>)" sentence-final punctuation leaf.
dots_counts = {}
for sent in ontn_sentc_tree_prc3:
    for extraction in extract_all_spec_substring(sent['parse_tree'], '(.'):
        dots_counts[extraction] = dots_counts.get(extraction, 0) + 1
dots_nt = (
    pd.Series(dots_counts, name='N')
    .to_frame()
    .sort_values('N', ascending=False)
)
display(dots_nt.T)
# Leaves seen at most 168 times are marked for deletion; '(. /-)' is added by
# hand even though its count is above the threshold.
del_dots = dots_nt.loc[dots_nt['N'] <= 168].index.to_list()
del_dots.append('(. /-)')
print(del_dots)

Unnamed: 0,(. .),(. /.),(. ?),(. !),(. /?),(. /-),(. ...),(. --),(. ..),"(. ,)",(. :),(. !!),(. -),(. ;),(. ....),(. !!!),(. ???),(. ??),(. .....),(. ?!),(. !!!!),(. !?),(. !.),(. ..!),(. ......),"(. "")",(. .......),(. ;.),(. ..?),(. ...!),(. !!?),(. !!??),(. ?!!...),(. /),(. [E_S),(. <E_S),"(. ,,)",(. ..!!),(. 0.),(. !!!!!!!!!!!!!!!!),(. !!.),(. ?...),(. ........),(. ?!!!),(. ?..),(. ......................),(. !!!!.),(. ....?),(. ?@@@...),(. ..!!.),(. !*),(. ..!*),(. ?!!),(. .?),(. .-),(. ??.),(. ?!.),(. :.),(. !!!!!),(. !!!!!!!!)
N,105935,9257,4798,1605,993,289,184,168,144,99,32,31,30,27,24,11,9,8,8,5,5,5,4,4,4,4,3,3,3,3,3,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


['(. --)', '(. ..)', '(. ,)', '(. :)', '(. !!)', '(. -)', '(. ;)', '(. ....)', '(. !!!)', '(. ???)', '(. ??)', '(. .....)', '(. ?!)', '(. !!!!)', '(. !?)', '(. !.)', '(. ..!)', '(. ......)', '(. ")', '(. .......)', '(. ;.)', '(. ..?)', '(. ...!)', '(. !!?)', '(. !!??)', '(. ?!!...)', '(. /)', '(. [E_S)', '(. <E_S)', '(. ,,)', '(. ..!!)', '(. 0.)', '(. !!!!!!!!!!!!!!!!)', '(. !!.)', '(. ?...)', '(. ........)', '(. ?!!!)', '(. ?..)', '(. ......................)', '(. !!!!.)', '(. ....?)', '(. ?@@@...)', '(. ..!!.)', '(. !*)', '(. ..!*)', '(. ?!!)', '(. .?)', '(. .-)', '(. ??.)', '(. ?!.)', '(. :.)', '(. !!!!!)', '(. !!!!!!!!)', '(. /-)']


## Delete rare dots

In [29]:
len(ontn_sentc_tree_prc3)

134405

In [30]:
# Drop every sentence whose parse tree contains one of the rare "." leaves.
ontn_sentc_tree_prc4 = [
    sent
    for sent in ontn_sentc_tree_prc3
    if not has_substring(sent['parse_tree'], del_dots)
]

In [31]:
len(ontn_sentc_tree_prc4)

133449

## Extract opening quotation marks

In [32]:
# Frequency table of every "(`` <token>)" opening-quote leaf.
oqm_counts = {}
for sent in ontn_sentc_tree_prc4:
    for extraction in extract_all_spec_substring(sent['parse_tree'], '(``'):
        oqm_counts[extraction] = oqm_counts.get(extraction, 0) + 1
oqm_nt = (
    pd.Series(oqm_counts, name='N')
    .to_frame()
    .sort_values('N', ascending=False)
)
display(oqm_nt.T)
# Opening-quote leaves seen at most 10 times are marked for deletion.
del_oqm = oqm_nt.loc[oqm_nt['N'] <= 10].index.to_list()
print(del_oqm)

Unnamed: 0,(`` ``),"(`` "")",(`` '),(`` `),(`` '')
N,11073,3448,371,102,9


["(`` '')"]


## Delete opening quotation marks

In [33]:
len(ontn_sentc_tree_prc4)

133449

In [34]:
# Drop every sentence whose parse tree contains one of the rare opening-quote leaves.
ontn_sentc_tree_prc5 = [
    sent
    for sent in ontn_sentc_tree_prc4
    if not has_substring(sent['parse_tree'], del_oqm)
]

In [35]:
len(ontn_sentc_tree_prc5)

133440

## Extract end quotation marks

In [36]:
# Frequency table of every "('' <token>)" closing-quote leaf.
eqm_counts = {}
for sent in ontn_sentc_tree_prc5:
    for extraction in extract_all_spec_substring(sent['parse_tree'], "(''"):
        eqm_counts[extraction] = eqm_counts.get(extraction, 0) + 1
eqm_nt = (
    pd.Series(eqm_counts, name='N')
    .to_frame()
    .sort_values('N', ascending=False)
)
display(eqm_nt.T)
# Closing-quote leaves seen at most 15 times are marked for deletion.
del_eqm = eqm_nt.loc[eqm_nt['N'] <= 15].index.to_list()
print(del_eqm)

Unnamed: 0,('' ''),"('' "")",('' '),('' `),('' One),"('' ,)",('' .),('' O),"('' """")"
N,10430,3493,1371,16,1,1,1,1,1


["('' One)", "('' ,)", "('' .)", "('' O)", '(\'\' "")']


## Delete end quotation marks

In [37]:
len(ontn_sentc_tree_prc5)

133440

In [38]:
# Drop every sentence whose parse tree contains one of the rare closing-quote leaves.
ontn_sentc_tree_prc6 = [
    sent
    for sent in ontn_sentc_tree_prc5
    if not has_substring(sent['parse_tree'], del_eqm)
]

In [39]:
len(ontn_sentc_tree_prc6)

133435

## Extract all symbols

In [40]:
# Inventory of every distinct character that occurs in the tokens of the
# remaining sentences.
all_liters = set()
for sent in ontn_sentc_tree_prc6:
    all_liters |= set(''.join(sent['words']))

In [41]:
print(all_liters)

{'\\', '6', '●', 'e', '[', 'の', '_', '【', '&', '￥', 'd', '.', 'I', 'O', '（', 'W', ':', 'V', '˙', 'r', 'B', 'a', 'M', 'R', 'Y', '?', '%', 'P', '1', '4', '·', '8', '9', 'o', 'J', '-', "'", '!', 'F', 'y', '】', 't', ',', '*', 'X', '}', 'D', 'j', 'G', '□', 'v', 'w', 'c', 'l', 'z', 'h', 'K', '0', '2', '’', 'q', 'L', '>', '/', '^', '`', 'p', 'Ì', '→', '＊', 'n', '<', 'ِ', '#', 'Q', 'g', '3', '7', 's', '+', 'C', 'T', 'm', 'N', '$', 'S', 'x', 'à', 'ö', 'U', ';', '@', 'E', '=', 'u', 'f', 'A', ']', '~', 'b', 'k', '{', '"', '■', '・', 'ò', 'Û', '5', 'H', 'Z', 'i', '）'}


In [42]:
# Characters / character sequences treated as noise. The filtering cell that
# follows drops every sentence whose bracketed parse string contains any of them.
# NOTE(review): '’' appears to be listed twice, and '------' can only match
# where '---' already matches, so both entries look redundant — confirm the
# glyphs really are identical before pruning.
error_symbs = [
    '□', '■', 'の', '【', '】', '●', 'ö', '{', '}', '＊',
    '\\', '<', '>', 'ò', '˙', 'Ì', '・', '→', 'Û', '·',
    '_', '￥', '’', '^', '’', '------', '---', '@'
]

## Remove trees with error symbols

In [43]:
print(len(ontn_sentc_tree_prc6))

133435


In [44]:
# Drop every sentence whose parse tree string contains one of the noise symbols.
ontn_sentc_tree_prc7 = [
    sent
    for sent in ontn_sentc_tree_prc6
    if not has_substring(sent['parse_tree'], error_symbs)
]

In [45]:
print(len(ontn_sentc_tree_prc7))

133231


## Extract NN + link

In [46]:
# Find NN leaves that are web links. Two prefixes are scanned one after the
# other ("http..." first, then "www."); each pass displays its own frequency
# table and contributes the leaves seen at most 15 times to the deletion list.
del_nnlink_nt = []
for substr in ("(NN http", "(NN www."):
    link_counts = {}
    for sent in ontn_sentc_tree_prc7:
        for extraction in extract_all_spec_substring(sent['parse_tree'], substr):
            link_counts[extraction] = link_counts.get(extraction, 0) + 1
    nnlink_nt = (
        pd.Series(link_counts, name='N')
        .to_frame()
        .sort_values('N', ascending=False)
    )
    display(nnlink_nt.T)
    del_nnlink_nt += nnlink_nt.loc[nnlink_nt['N'] <= 15].index.to_list()

print(del_nnlink_nt)

Unnamed: 0,(NN http://www.qin.com.tw),(NN http://www.bali.tpc.gov.tw/),(NN http://www.tamsui.gov.tw/)
N,1,1,1


Unnamed: 0,(NN www.Career.com),(NN www.alfalaq.com),(NN www.120zy.com),(NN www.taconet.com.tw/cstudio/)
N,1,1,1,1


['(NN http://www.qin.com.tw)', '(NN http://www.bali.tpc.gov.tw/)', '(NN http://www.tamsui.gov.tw/)', '(NN www.Career.com)', '(NN www.alfalaq.com)', '(NN www.120zy.com)', '(NN www.taconet.com.tw/cstudio/)']


## Delete NN + link

In [47]:
len(ontn_sentc_tree_prc7)

133231

In [48]:
# Drop every sentence whose parse tree contains one of the link-valued NN leaves.
ontn_sentc_tree_prc8 = [
    sent
    for sent in ontn_sentc_tree_prc7
    if not has_substring(sent['parse_tree'], del_nnlink_nt)
]

In [49]:
len(ontn_sentc_tree_prc8)

133224

## Extract rare cases with -LRB- / -RRB-

In [50]:
# Frequency table of every leaf whose tag starts with "-" (the -LRB- / -RRB-
# bracket tags in the table displayed below).
lrb_counts = {}
for sent in ontn_sentc_tree_prc8:
    for extraction in extract_all_spec_substring(sent['parse_tree'], "(-"):
        lrb_counts[extraction] = lrb_counts.get(extraction, 0) + 1
lrb_nt = (
    pd.Series(lrb_counts, name='N')
    .to_frame()
    .sort_values('N', ascending=False)
)
display(lrb_nt.T)
# Bracket leaves seen at most 9 times are marked for deletion.
del_lrb_nt = lrb_nt.loc[lrb_nt['N'] <= 9].index.to_list()
print(del_lrb_nt)

Unnamed: 0,(-RRB- -RRB-),(-LRB- -LRB-),(-RRB- -RCB-),(-LRB- -LCB-),(-RRB- -RSB-),(-LRB- -LSB-),(-LRB- [),(-RRB- ]),(-LRB- -),(-RRB- -),(-LRB- -LRB),(-RRB- -LRB-),(-RRB- -RRB)
N,2817,2777,181,178,52,48,9,9,1,1,1,1,1


['(-LRB- [)', '(-RRB- ])', '(-LRB- -)', '(-RRB- -)', '(-LRB- -LRB)', '(-RRB- -LRB-)', '(-RRB- -RRB)']


## Delete rare cases with -LRB- / -RRB-

In [51]:
len(ontn_sentc_tree_prc8)

133224

In [52]:
# Drop every sentence whose parse tree contains one of the rare bracket leaves.
ontn_sentc_tree_prc9 = [
    sent
    for sent in ontn_sentc_tree_prc8
    if not has_substring(sent['parse_tree'], del_lrb_nt)
]

In [53]:
len(ontn_sentc_tree_prc9)

133213

## Extract rare cases with PRP$

In [54]:
# Frequency table of every "(PRP$ <token>)" possessive-pronoun leaf.
prpdlr_counts = {}
for sent in ontn_sentc_tree_prc9:
    for extraction in extract_all_spec_substring(sent['parse_tree'], "(PRP$"):
        prpdlr_counts[extraction] = prpdlr_counts.get(extraction, 0) + 1
prpdlr_nt = (
    pd.Series(prpdlr_counts, name='N')
    .to_frame()
    .sort_values('N', ascending=False)
)
display(prpdlr_nt.T)
# PRP$ leaves that occur only once are marked for deletion.
del_prpdlr_nt = prpdlr_nt.loc[prpdlr_nt['N'] <= 1].index.to_list()
print(del_prpdlr_nt)

Unnamed: 0,(PRP$ his),(PRP$ their),(PRP$ its),(PRP$ your),(PRP$ my),(PRP$ our),(PRP$ her),(PRP$ His),(PRP$ My),(PRP$ Your),(PRP$ Our),(PRP$ Their),(PRP$ Its),(PRP$ Her),(PRP$ yours),(PRP$ ours),(PRP$ it's),(PRP$ thy),(PRP$ mine),(PRP$ YOUR),(PRP$ HIS),(PRP$ thier),(PRP$ there),(PRP$ you),(PRP$ yer),(PRP$ his/her),(PRP$ it),(PRP$ you're),(PRP$ +her),(PRP$ h),(PRP$ hr),(PRP$ UAV),(PRP$ ones)
N,6768,5315,4600,2988,2590,2137,1482,450,417,237,231,192,133,125,17,7,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1


['(PRP$ his/her)', '(PRP$ it)', "(PRP$ you're)", '(PRP$ +her)', '(PRP$ h)', '(PRP$ hr)', '(PRP$ UAV)', '(PRP$ ones)']


## Delete rare cases with PRP$

In [55]:
len(ontn_sentc_tree_prc9)

133213

In [56]:
# Drop every sentence whose parse tree contains one of the rare PRP$ leaves.
ontn_sentc_tree_prc10 = [
    sent
    for sent in ontn_sentc_tree_prc9
    if not has_substring(sent['parse_tree'], del_prpdlr_nt)
]

In [57]:
len(ontn_sentc_tree_prc10)

133206

## Replace Nonterminal in Trees

In [58]:
# Sanity check before the "WP$" tag is renamed: list the distinct
# "(WP$ <token>)" leaves and how often each occurs.
# The count runs over ontn_sentc_tree_prc10, the list that the replacement
# step consumes, rather than the earlier ontn_sentc_tree_prc9 stage.
nt = {}
for sent in ontn_sentc_tree_prc10:
    for extraction in extract_all_spec_substring(sent['parse_tree'], "(WP$"):
        nt[extraction] = nt.get(extraction, 0) + 1
print(nt)

{'(WP$ whose)': 365, '(WP$ Whose)': 4}


In [59]:
# Textual rewrite rules applied to the bracketed parse strings.
# Each key is replaced verbatim (str.replace) by its value, in insertion order.
# Full "(TAG token)" rules give frequent punctuation / article leaves a tag of
# their own; the prefix-only rules at the end rename tags that contain "$".
# All replacement names are plain ASCII: the '(-RRB- -RCB-)' target is spelled
# with a Latin "c" ('_rcb_') so that it matches its '_lcb_' counterpart.
replace_dict = {
    '(. ?)': '(QM ?)',  # question mark
    '(. /?)': '(QM ?)',  # question mark
    '(. .)': '(PER .)',  # period
    '(. /.)': '(PER .)',  # period
    '(. !)': '(EM !)',  # exclamation mark
    '(. ...)': '(ELPSS ...)',  # ellipsis
    '(, ,)': '(CMA ,)',  # comma
    '(: :)': '(CLN :)',  # punctuation: colon
    '(: ;)': '(SMCLN ;)',  # punctuation: semicolon
    '(: --)': '(DASH --)',  # punctuation: dash
    '(: -)': '(HPHN -)',  # punctuation: hyphen
    '(: ...)': '(PNCTELPSS ...)',  # punctuation: ellipsis
    '(DT a)': '(IAC a)',  # indefinite article before a consonant
    '(DT A)': '(IAC A)',  # indefinite article before a consonant
    '(DT an)': '(IAV an)',  # indefinite article before a vowel
    '(DT An)': '(IAV An)',  # indefinite article before a vowel
    '(DT the)': '(DA the)', # definite article
    '(DT The)': '(DA The)', # definite article
    '(`` ")': '(OQTM \')',   # opening quotation mark (all variants become ')
    '(`` ``)': '(OQTM \')',   # opening quotation mark
    '(`` \')': '(OQTM \')',   # opening quotation mark
    '(`` `)': '(OQTM \')',   # opening quotation mark
    '(\'\' ")': '(QTM \')',  # closing quotation mark (all variants become ')
    '(\'\' \'\')': '(QTM \')',  # closing quotation mark
    '(\'\' \')': '(QTM \')',  # closing quotation mark
    '(\'\' `)': '(QTM \')',  # closing quotation mark
    '(-LRB- -LSB-)': '(_lrb_ _lsb_)',
    '(-RRB- -RSB-)': '(_rrb_ _rsb_)',
    '(-LRB- -LRB-)': '(_lrb_ _lrb_)',
    '(-RRB- -RRB-)': '(_rrb_ _rrb_)',
    '(-LRB- -LCB-)': '(_lrb_ _lcb_)',
    '(-RRB- -RCB-)': '(_rrb_ _rcb_)',
    '(PRP$': '(PRP_DLR',
    '(WP$': '(WP_DLR',
    '($':'(DLR',
}

In [60]:
# Apply every rewrite rule, in dictionary order, to a copy of each sentence so
# that the entries of ontn_sentc_tree_prc10 themselves stay untouched.
ontn_sentc_tree_fin = []
for sent in tqdm(ontn_sentc_tree_prc10):
    new_sent = deepcopy(sent)
    rewritten = new_sent['parse_tree']
    for rule, replacement in replace_dict.items():
        rewritten = rewritten.replace(rule, replacement)
    new_sent['parse_tree'] = rewritten
    ontn_sentc_tree_fin.append(new_sent)

100%|██████████| 133206/133206 [00:02<00:00, 54178.85it/s]


# Collect all productions into a dict

In [61]:
def process_chunk(chunk: List[Dict]) -> Dict:
    """Collect the distinct productions of all parse trees in ``chunk``.

    Args:
        chunk: Sentence records; each must provide a bracketed parse string
            under the ``'parse_tree'`` key.

    Returns:
        A dict mapping every production left-hand side to the list of distinct
        right-hand sides seen for it, in order of first appearance.

    Right-hand sides are de-duplicated through a per-LHS dict used as an
    ordered set, so each production costs a hash lookup instead of a scan of
    the list built so far. This requires the right-hand sides to be hashable.
    """
    seen: Dict = {}
    for elem in chunk:
        tree = Tree.fromstring(elem['parse_tree'])
        for prod in tree.productions():
            # Re-assigning an existing key keeps its original position, so the
            # first-seen order of right-hand sides is preserved.
            seen.setdefault(prod.lhs(), {})[prod.rhs()] = None
    return {lhs: list(rhs_seen) for lhs, rhs_seen in seen.items()}

def merge_dicts(dict_list: List[Dict]) -> Dict:
    """Merge per-chunk production dicts into one, dropping duplicate right-hand sides.

    Args:
        dict_list: Dicts mapping a left-hand side to a list of right-hand sides.

    Returns:
        A new dict with every left-hand side found in any input, mapped to the
        distinct right-hand sides in order of first appearance across the
        inputs. The input dicts and their lists are not modified.

    A per-LHS dict serves as an ordered set, so merging is linear in the number
    of right-hand sides instead of re-scanning the merged list for each one.
    This requires the right-hand sides to be hashable.
    """
    merged: Dict = {}
    for local_dict in dict_list:
        for lhs, rhs_list in local_dict.items():
            bucket = merged.setdefault(lhs, {})
            for rhs in rhs_list:
                bucket[rhs] = None
    return {lhs: list(bucket) for lhs, bucket in merged.items()}

In [62]:
# Number of worker processes for the pool below: the CPU count reported by
# multiprocessing.cpu_count().
num_processes = multiprocessing.cpu_count()
num_processes

16

In [63]:
# Cut the sentence list into consecutive slices of (at least one and) roughly
# total / num_processes elements, extract the productions of every slice in a
# worker process, then merge the partial results.
total_elements = len(ontn_sentc_tree_fin)
chunk_size = max(1, total_elements // num_processes)
chunks = []
for offset in range(0, total_elements, chunk_size):
    chunks.append(ontn_sentc_tree_fin[offset:offset + chunk_size])

with multiprocessing.Pool(processes=num_processes) as pool:
    chunk_dicts = pool.map(process_chunk, chunks)

productions_dict = merge_dicts(chunk_dicts)

In [64]:
# NOTE: disabled single-process variant of the production collection; nothing
# in this cell is executed. As written it iterates `ontn_sentc_tree` (the
# unfiltered sentence list), not `ontn_sentc_tree_fin`.
# productions_dict: Dict[str, Optional[List[str]]] = dict()
# for elem in tqdm(ontn_sentc_tree):
#     tree = Tree.fromstring(elem['parse_tree'])

#     for prod in tree.productions():
#         if prod.lhs() in productions_dict.keys():
#             if prod.rhs() in productions_dict[prod.lhs()]:
#                 pass
#             else:
#                 productions_dict[prod.lhs()].append(prod.rhs())
#         else:
#             productions_dict[prod.lhs()] = [prod.rhs()]

In [65]:
productions_dict.keys()

dict_keys([TOP, SBARQ, WHNP, WP, NN, PP, IN, NP, QM, S, PRP, ADVP, RB, VP, VB, TO, IAC, JJ, NNP, PER, SINV, VBG, ADJP, VBZ, DA, NML, NNPS, VBN, CMA, NNS, CC, CD, VBD, JJS, DT, POS, PRT, RP, SBAR, WHADVP, WRB, IAV, INTJ, UH, MD, PRP_DLR, JJR, HYPH, RBR, PDT, SQ, WDT, VBP, QP, CONJP, CLN, EX, SMCLN, UCP, RBS, _lrb_, _rrb_, FRAG, EM, PRN, DASH, X, WHPP, WP_DLR, OQTM, QTM, WHADJP, LST, LS, NAC, SYM, RRC, AFX, FW, META, DLR, ADD, ELPSS, NFP, HPHN, NX, EMBED])

# Productions dict to grammar

In [66]:
def _lark_literal(token: str) -> str:
    """Return ``token`` as a double-quoted string literal for the grammar file.

    Backslashes and double quotes are escaped so that a token containing them
    cannot end the literal early and corrupt the grammar.
    """
    return '"' + token.replace('\\', '\\\\').replace('"', '\\"') + '"'


# Render one grammar line per left-hand side:
#     <lhs lower-cased>: <alt 1>|<alt 2>|...
# Inside an alternative, nonterminals are lower-cased rule names and leaves are
# quoted literals; symbols are separated by a single space.
# NOTE: despite its name, ALL_RHS holds the left-hand sides (the dict keys).
ALL_RHS = list(productions_dict.keys())
rule_lines = []
for lhs in tqdm(ALL_RHS):
    assert isinstance(lhs, Nonterminal)
    alternatives = []
    for rhs_tuple in productions_dict[lhs]:
        assert isinstance(rhs_tuple, tuple)
        symbols = [
            str(rhs).lower() if isinstance(rhs, Nonterminal) else _lark_literal(rhs)
            for rhs in rhs_tuple
        ]
        alternatives.append(' '.join(symbols))
    rule_lines.append(str(lhs).lower() + ': ' + '|'.join(alternatives) + "\n")
# Join once at the end instead of growing one large string piece by piece.
lark_like_gram = ''.join(rule_lines)

100%|██████████| 87/87 [00:00<00:00, 149.15it/s]


In [67]:
# Write the grammar text to disk. The encoding is fixed to UTF-8 so the file
# does not depend on the platform's default encoding when tokens contain
# non-ASCII characters. print() appends one trailing newline after the text.
with open('en_ontonotesv5_extract_lark.lark', 'w', encoding='utf-8') as file:
    print(lark_like_gram, file=file)