In [4]:
import jsonlines
import numpy as np
from collections import Counter
from matplotlib import pyplot as plt
from tqdm import tqdm
import re

# Find POS tag of coreference mentions

In [39]:
train_conll = open("data/ontonotes/data/train.english.v4_gold_conll").read()
dev_conll = open("data/ontonotes/data/dev.english.v4_gold_conll").read()
test_conll = open("data/ontonotes/data/test.english.v4_gold_conll").read()

In [59]:
documents = [[],[],[]]

for k, partition in enumerate([train_conll, dev_conll, test_conll]):
    
    for line in tqdm(partition.split("\n"), desc="reading lines"):
        
        if line.startswith("#begin"):
            document = []
        
        elif line.startswith("#end"):
            documents[k].append(document)
            
        else:
            document.append(line.split())

reading lines: 100%|██████████| 2760207/2760207 [00:11<00:00, 236548.70it/s]
reading lines: 100%|██████████| 346787/346787 [00:00<00:00, 451197.28it/s]
reading lines: 100%|██████████| 359509/359509 [00:01<00:00, 185902.24it/s]


In [60]:
len(documents[0]), len(documents[1]), len(documents[2])

(5604, 686, 696)

In [97]:
mention_postag = []
mention_word = []

for k, docs in enumerate(documents):
    for document in tqdm(docs, desc="finding postag of mentions"):
        
        i = 0
        while i < len(document):
            
            row = document[i]
            if row and row[-1] != "-":
                
                postag = [row[4]]
                word = [row[3]]
                j = i + 1
                while j < len(document):

                    jrow = document[j]
                    if jrow and jrow[-1][-1] != ")":
                        postag.append(jrow[4])
                        word.append(jrow[3])
                    else:
                        break
                    j += 1
                i = j
                
                mention_postag.append(postag)
                mention_word.append(word)
            else:
                i += 1

finding postag of mentions: 100%|██████████| 5604/5604 [00:03<00:00, 1787.92it/s]
finding postag of mentions: 100%|██████████| 686/686 [00:00<00:00, 1495.68it/s]
finding postag of mentions: 100%|██████████| 696/696 [00:02<00:00, 312.75it/s]


In [98]:
len(mention_postag), len(mention_word)

(434248, 434248)

In [99]:
mention_postag_hashable = [" ".join(l) for l in mention_postag]

In [106]:
mention_postag_distribution = sorted(Counter(mention_postag_hashable).items(), key=lambda item: item[1], reverse=True)
for postag, count in mention_postag_distribution:
    print(f"{postag}\t\t\t\t{count/len(mention_postag_hashable):.6f}")

DT				0.038070
NNP				0.034123
NN .				0.031019
NNP .				0.030978
PRP .				0.019523
PRP$				0.014959
NNS .				0.009736
PRP VBD				0.009621
PRP VBP				0.009336
PRP				0.006867
DT JJ				0.005513
NN				0.005214
POS				0.004836
NNP VBD				0.004721
PRP$ NN .				0.004638
NNP NNP				0.004265
DT NNP				0.004076
PRP MD VB				0.004025
DT NN				0.003942
NNP CC				0.003896
PRP IN				0.003477
PRP VBD IN				0.003450
NNP ,				0.003307
NN IN				0.002925
NNP IN				0.002828
JJ				0.002809
PRP$ NNS .				0.002699
NN ,				0.002607
PRP VBP IN				0.002501
PRP VBP .				0.002386
DT .				0.002280
NNP VBD IN				0.002109
PRP$ JJ				0.002105
NNP VBZ				0.001980
PRP . ''				0.001980
PRP VBD .				0.001898
PRP VBD DT				0.001879
.				0.001847
PRP VBZ				0.001842
PRP VB				0.001819
NN VBD				0.001732
PRP$ NN				0.001676
NNS IN				0.001649
PRP ,				0.001511
POS NN .				0.001455
PRP$ NN IN				0.001414
NNP . ''				0.001377
PRP VBZ .				0.001340
PRP RB .				0.001290
NNS				0.001248
PRP MD RB VB				0.001234
NN . ''				0.00

In [102]:
mention_postag_hashable = np.array(mention_postag_hashable)

In [103]:
mention_postag_hashable

array(['NNP , JJ NNPS IN NNP', 'NNP', 'NNP', ...,
       'NNP NNPS , NNP NNP NNP , NNP NNP NNP , NNP , CD',
       'PRP VBP RB RB VBN IN NNP POS NN IN DT JJ NN , PRP VBZ NNP CC NNP IN DT JJ NN IN NN WDT VBZ RB VBN',
       'PRP .'], dtype='<U429')

In [104]:
mention_word_joined = np.array([" ".join(l) for l in mention_word])

In [105]:
mention_word_joined[mention_postag_hashable == "DT"][:100]

array(['This', 'a', 'This', 'This', 'this', 'The', 'Some', 'this', 'this',
       'the', 'The', 'the', 'the', 'the', 'The', 'the', 'the', 'this',
       'This', 'the', 'The', 'this', 'this', 'this', 'this', 'the', 'the',
       'The', 'a', 'this', 'this', 'this', 'the', 'The', 'the', 'This',
       'the', 'this', 'the', 'the', 'Both', 'the', 'this', 'this', 'The',
       'that', 'This', 'the', 'the', 'that', 'that', 'that', 'this',
       'this', 'both', 'the', 'both', 'this', 'both', 'this', 'the',
       'The', 'the', 'this', 'this', 'the', 'the', 'the', 'this', 'this',
       'the', 'this', 'this', 'the', 'the', 'the', 'the', 'this', 'the',
       'the', 'the', 'the', 'the', 'the', 'this', 'this', 'the', 'this',
       'this', 'the', 'this', 'The', 'this', 'that', 'the', 'The', 'the',
       'This', 'the', 'a'], dtype='<U856')