In [8]:
import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    """Tokenizer that cuts text at single space characters and nowhere else.

    Meant to be assigned to ``nlp.tokenizer``: it is constructed from a vocab
    and, when called with a string, returns a ``Doc`` whose tokens are exactly
    the pieces produced by ``text.split(' ')`` (so two consecutive spaces
    yield an empty-string token).
    """

    def __init__(self, vocab):
        """Remember the vocab that every produced ``Doc`` is built with."""
        self.vocab = vocab

    def __call__(self, text):
        """Return a ``Doc`` with one token per space-separated piece of ``text``."""
        pieces = text.split(' ')
        # Every token -- the last one included -- is flagged as being
        # followed by a space character.
        followed_by_space = [True for _ in pieces]
        return Doc(self.vocab, words=pieces, spaces=followed_by_space)



In [9]:
import benepar
from benepar.spacy_plugin import BeneparComponent
# Select GPU 0 before any model is loaded.
# NOTE(review): per the spaCy docs this raises when no GPU is available -- confirm for the installed version.
spacy.require_gpu(0)
# Fetch the benepar English model through NLTK's downloader (reported as
# already up-to-date in the recorded output, so this is then a no-op).
benepar.download('benepar_en')
# The NER component is switched off; only POS tags and parses are extracted below.
nlp = spacy.load('en_core_web_sm',disable=["ner"])
# Input sentences are pre-tokenized and space-joined, so split on spaces only
# instead of using spaCy's default tokenizer.
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
# Add the benepar component to the pipeline; presumably this is what supplies
# the `sent._.parse_string` extension used in the feature cells -- verify against benepar docs.
nlp.add_pipe(BeneparComponent('benepar_en'))

import nltk
from nltk.tree import Tree
from nltk.tree import ParentedTree

[nltk_data] Downloading package benepar_en to
[nltk_data]     /home/gaurish/nltk_data...
[nltk_data]   Package benepar_en is already up-to-date!


In [10]:
import csv
def _read(file_path):
    """Yield each sentence of a ``word,label`` CSV file as a space-joined string.

    The file is expected to hold one token per row in two columns
    (``word``, ``label``), with sentences delimited by ``*START-SENTENCE*`` /
    ``*END-SENTENCE*`` marker rows. ``*START-ACTOR*`` / ``*END-ACTOR*`` marker
    rows are ignored. Labels are validated but not returned.

    Args:
        file_path: Path of the CSV file to read.

    Yields:
        str: The tokens of one sentence joined by single spaces. Sentences
        with fewer than two tokens are dropped, as is a trailing sentence
        that is never closed by ``*END-SENTENCE*``.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns.
        RuntimeError: If a token row carries a label other than
            ``I-cue``, ``I-scope`` or ``O``.
    """
    # newline='' is what the csv module documents for files handed to
    # csv.reader, so that it can interpret line endings itself.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file, delimiter=",")
        tokens = []
        for row in reader:
            # csv.reader returns [] for a blank line; skip it instead of
            # failing on the two-value unpack below.
            if not row:
                continue
            if len(row) != 2:
                raise ValueError(
                    'Expected 2 columns (word, label) on line %d, got %d: %r'
                    % (reader.line_num, len(row), row))
            word, label = row

            # A quoted field consisting only of a line break is not a token.
            if word in ('\n', '\r\n'):
                continue
            if word == "*START-SENTENCE*":
                tokens = []
            elif word == "*END-SENTENCE*":
                # Single-token (and empty) sentences are deliberately not emitted.
                if len(tokens) > 1:
                    yield " ".join(tokens)
                tokens = []
            elif word == "*START-ACTOR*" or word == "*END-ACTOR*":
                continue
            else:
                if label not in {'I-cue', 'I-scope', 'O'}:
                    raise RuntimeError(
                        'Label %s is not a valid tag' % label)
                tokens.append(word)
        

In [11]:
# `_read` is a generator function, so each name below is a lazy, single-use
# iterator at this point: no file is opened until it is consumed.
train_data = _read('data/unlabelled_conandoyle_train.csv')
train_data_full = _read('data/unlabelled_conandoyle_train_full.csv')
dev_data = _read('data/labelled_conandoyle_dev.csv')
test_data = _read('data/labelled_conandoyle_test.csv')

In [12]:
# Drain the generator into a list so the sentences can be counted and reused.
train_data = list(train_data)
len(train_data)


699

In [13]:
# Drain the generator into a list so the sentences can be counted and reused.
train_data_full = list(train_data_full)
len(train_data_full)


842

In [14]:
# Drain the generator into a list so the sentences can be counted and reused.
dev_data = list(dev_data)
len(dev_data)

144

In [15]:
# Drain the generator into a list so the sentences can be counted and reused.
test_data = list(test_data)
len(test_data)

235

In [16]:
import jsonlines



In [17]:
# Parse every training sentence and collect one record per sentence, shaped as
# {sentence: {"parse_string": [...], "pos_tags": [...]}}, then dump as JSONL.
dataset = []
for doc in nlp.pipe(train_data, n_threads=8, batch_size=10000):
    # The record key is the sentence rebuilt from its tokens.
    key = " ".join(token.text for token in doc)
    dataset.append({
        key: {
            "parse_string": [sent._.parse_string for sent in doc.sents],
            "pos_tags": [token.pos_ for token in doc],
        },
    })

with jsonlines.open('spacy_features_train.jsonl', 'w') as writer:
    writer.write_all(dataset)


In [18]:
# Sanity check: number of records written, plus the first record to eyeball its structure.
len(dataset),dataset[0]

(699,
 {'Mr. Sherlock Holmes , who was usually very late in the mornings , save upon those not infrequent occasions when he was up all night , was seated at the breakfast table .': {'parse_string': ['(S (NP (NP (NNP Mr.) (NNP Sherlock) (NNP Holmes)) (, ,) (SBAR (SBAR (WHNP (WP who)) (S (VP (VBD was) (ADVP (RB usually)) (RB very) (JJ late) (PP (IN in) (NP (DT the) (NNS mornings)))))) (, ,) (VP (VB save) (PP (IN upon) (NP (DT those) (ADJP (RB not) (JJ infrequent)) (NNS occasions))) (SBAR (WHADVP (WRB when)) (S (NP (PRP he)) (VP (VBD was) (ADVP (RB up)) (NP (DT all) (NN night))))))) (, ,)) (VP (VBD was) (VP (VBN seated) (PP (IN at) (NP (DT the) (NN breakfast) (NN table))))) (. .))'],
   'pos_tags': ['PROPN',
    'PROPN',
    'PROPN',
    'PUNCT',
    'PRON',
    'AUX',
    'ADV',
    'ADV',
    'ADJ',
    'ADP',
    'DET',
    'NOUN',
    'PUNCT',
    'VERB',
    'SCONJ',
    'DET',
    'PART',
    'ADJ',
    'NOUN',
    'ADV',
    'PRON',
    'AUX',
    'ADV',
    'DET',
    'NOUN',
    

In [19]:
# `dev_data` is already a list, so no copy is needed; show only the first few
# sentences instead of dumping all of them into the notebook output.
dev_data[:10]

['He made no remark , but the matter remained in his thoughts , for he stood in front of the fire afterwards with a thoughtful face , smoking his pipe , and casting an occasional glance at the message .',
 'No woman would ever send a reply-paid telegram .',
 'My mind is like a racing engine , tearing itself to pieces because it is not connected up with the work for which it was built .',
 "`` I have had a most singular and unpleasant experience , Mr. Holmes , '' said he .",
 '`` Never in my life have I been placed in such a situation .',
 'It is most improper -- most outrageous .',
 '`` Well , sir , it did not appear to be a matter which concerned the police , and yet , when you have heard the facts , you must admit that I could not leave it where it was .',
 "Private detectives are a class with whom I have absolutely no sympathy , but none the less , having heard your name -- ''",
 "But , in the second place , why did you not come at once ? ''",
 "But no one can glance at your toilet 

In [20]:
# Parse every dev sentence and collect one record per sentence, shaped as
# {sentence: {"parse_string": [...], "pos_tags": [...]}}, then dump as JSONL.
# NOTE(review): `n_threads` is documented as deprecated from spaCy v2.1 on -- confirm before upgrading.
dataset = []
for doc in nlp.pipe(dev_data, n_threads=8, batch_size=10000):
    # The record key is the sentence rebuilt from its tokens.
    key = " ".join([token.text for token in doc])
    dataset.append({
        key: {
            # A list: one input line can come back split into several sentences.
            "parse_string": [sent._.parse_string for sent in doc.sents],
            "pos_tags": [token.pos_ for token in doc],
        }
    })
with jsonlines.open('spacy_features_dev.jsonl', 'w') as writer:
    writer.write_all(dataset)

In [21]:
# Sanity check: the record count should equal the number of dev sentences.
len(dataset)

144

In [22]:
# Parse every test sentence and collect one record per sentence, shaped as
# {sentence: {"parse_string": [...], "pos_tags": [...]}}, then dump as JSONL.
# NOTE(review): `n_threads` is documented as deprecated from spaCy v2.1 on -- confirm before upgrading.
dataset = []
for doc in nlp.pipe(test_data, n_threads=8, batch_size=10000):
    # The record key is the sentence rebuilt from its tokens.
    key = " ".join([token.text for token in doc])
    dataset.append({
        key: {
            # A list: one input line can come back split into several sentences.
            "parse_string": [sent._.parse_string for sent in doc.sents],
            "pos_tags": [token.pos_ for token in doc],
        }
    })
with jsonlines.open('spacy_features_test.jsonl', 'w') as writer:
    writer.write_all(dataset)

In [23]:
# Sanity check: the record count should equal the number of test sentences.
len(dataset)

235

In [24]:
# Parse every sentence of the full training set and collect one record per
# sentence, shaped as {sentence: {"parse_string": [...], "pos_tags": [...]}},
# then dump as JSONL.
# NOTE(review): `n_threads` is documented as deprecated from spaCy v2.1 on -- confirm before upgrading.
dataset = []
for doc in nlp.pipe(train_data_full, n_threads=8, batch_size=10000):
    # The record key is the sentence rebuilt from its tokens.
    key = " ".join([token.text for token in doc])
    dataset.append({
        key: {
            # A list: one input line can come back split into several sentences.
            "parse_string": [sent._.parse_string for sent in doc.sents],
            "pos_tags": [token.pos_ for token in doc],
        }
    })
with jsonlines.open('spacy_features_train_full.jsonl', 'w') as writer:
    writer.write_all(dataset)

In [25]:
# Sanity check: the record count should equal the number of full-train sentences.
len(dataset)

842

In [26]:
# Concatenate the train, dev and test feature files into one JSONL file
# (`>` overwrites the target, so re-running this cell is safe).
!cat spacy_features_train.jsonl spacy_features_dev.jsonl spacy_features_test.jsonl > spacy_features.jsonl


In [27]:
# Same merge as for spacy_features.jsonl, but starting from the full training set.
!cat spacy_features_train_full.jsonl spacy_features_dev.jsonl spacy_features_test.jsonl > spacy_features_full.jsonl



In [22]:
# ## convert jsonl to json (disabled, kept for reference; note that both loops below read the same test file)
# import jsonlines
# output_dict ={}
# with jsonlines.open('spacy_features_test.jsonl') as reader:
#     for obj in reader:
#         for key, value in (obj.items()):
#             output_dict[key]=value
# with jsonlines.open('spacy_features_test.jsonl') as reader:
#     for obj in reader:
#         for key, value in (obj.items()):
#             output_dict[key]=value

In [13]:
# NOTE(review): within this notebook `output_dict` is only assigned in the
# commented-out jsonl-to-json conversion code, so on a fresh kernel this
# cell raises NameError; the recorded output comes from an earlier session.
output_dict

{'It is , however , unfortunately impossible entirely to separate the sensational from the criminal , and a chronicler is left in the dilemma that he must either sacrifice details which are essential to his statement and so give a false impression of the problem , or he must use matter which chance , and not choice , has provided him with .': {'parse_string': '(S (S (NP (NP (PRP It))) (VP (VBZ is) (, ,) (ADVP (RB however)) (, ,) (ADVP (RB unfortunately)) (ADJP (JJ impossible)) (ADVP (RB entirely)) (S (VP (TO to) (VP (VB separate) (NP (DT the) (NN sensational)) (PP (IN from) (NP (DT the) (NN criminal)))))))) (, ,) (CC and) (S (NP (DT a) (NN chronicler)) (VP (VBZ is) (VP (VBN left) (PP (IN in) (NP (DT the) (NN dilemma))) (SBAR (IN that) (S (S (NP (PRP he)) (VP (MD must) (CC either) (VP (VP (VB sacrifice) (NP (NP (NNS details)) (SBAR (WHNP (WDT which)) (S (VP (VBP are) (ADJP (JJ essential) (PP (IN to) (NP (PRP$ his) (NN statement))))))))) (CC and) (VP (ADVP (RB so)) (VP (VB give) (NP (NP 

In [4]:
# Demo of why "parse_string" is stored as a list: this single input line is
# split by the pipeline into more than one sentence, so the parse of the
# first sentence (printed here) covers only part of the text.
text = "`` Know then that in the time of the Great Rebellion ( the history of which by the learned Lord Clarendon I most earnestly commend to your attention ) this Manor of Baskerville was held by Hugo of that name , nor can it be gainsaid that he was a most wild , profane , and godless man ."
doc = nlp(text)
# Only the first detected sentence is inspected.
sent = list(doc.sents)[0]
print(sent._.parse_string)

(S (`` ``) (VB Know) (ADVP (RB then)) (SBAR (IN that) (S (IN in) (NP (NP (NP (DT the) (NN time)) (PP (IN of) (NP (DT the) (NNP Great) (NNP Rebellion)))) (PRN (-LRB- -LRB-) (NP (NP (DT the) (NN history)) (SBAR (WHPP (IN of) (WHNP (WDT which))) (S (PP (IN by) (NP (DT the) (VBN learned) (NNP Lord) (NNP Clarendon))) (NP (PRP I)) (VP (ADVP (RBS most) (RB earnestly)) (VBP commend) (PP (IN to) (NP (PRP$ your) (NN attention))))))) (-RRB- -RRB-))))))


In [7]:
# The remainder of the same input line, which the pipeline returned as a second sentence.
list(doc.sents)[1]


this Manor of Baskerville was held by Hugo of that name , nor can it be gainsaid that he was a most wild , profane , and godless man .