In [1]:
import re

import stanza
from nltk import tokenize
from torch.cuda import empty_cache

In [87]:
from psutil import AccessDenied, NoSuchProcess, process_iter
from signal import SIGTERM # or SIGKILL

# Terminate every process that holds an inet socket bound to this local port,
# so that the port is free before a new server is started on it.
CORENLP_PORT = 9003

for proc in process_iter():
    try:
        # A process can own several sockets on the port; signal it at most once.
        # laddr may be empty for sockets without a local address, hence the
        # truthiness check before reading .port.
        if any(conn.laddr and conn.laddr.port == CORENLP_PORT
               for conn in proc.connections(kind='inet')):
            proc.send_signal(SIGTERM) # or SIGKILL
    except (AccessDenied, PermissionError):
        # psutil signals inaccessible processes with its own AccessDenied,
        # which does not derive from the built-in PermissionError.
        print("Permission error on process")
    except NoSuchProcess:
        # The process exited between enumeration and inspection; nothing to do.
        continue

In [88]:
text = 'Science may be forced both ways by the tendency towards colossal quantities of research – both accelerating and trampling its progress'

# Resolve coreferences with a CoreNLP server: inside each sentence, every later
# mention of a chain is replaced by the text of the chain's first mention.
with stanza.server.CoreNLPClient(
        annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref'],
        timeout=120_000,
        memory='4G',
        be_quiet=False,
        endpoint='http://localhost:9003') as client:

    ann = client.annotate(text)
    # Sentences come from NLTK, while mention.sentenceIndex refers to CoreNLP's
    # own sentence split; the two are assumed to agree (guarded below).
    modified_text = tokenize.sent_tokenize(text)

    for coref in ann.corefChain:

        # The first mention of the chain serves as the antecedent.
        antecedent = None
        for mention in coref.mention:
            sentence_tokens = ann.sentence[mention.sentenceIndex].token
            phrase = ' '.join(
                sentence_tokens[i].word
                for i in range(mention.beginIndex, mention.endIndex)
            )
            if antecedent is None:
                antecedent = phrase
                continue

            anaphor = phrase
            if not anaphor:
                # An empty pattern would match at every position.
                continue
            if mention.sentenceIndex >= len(modified_text):
                # NLTK produced fewer sentences than CoreNLP; there is no
                # sentence to rewrite for this mention.
                continue

            # Replace whole-word occurrences only: a plain str.replace would
            # also rewrite the anaphor inside longer words (e.g. "it" inside
            # "quantities"). The lambda keeps backslashes in the antecedent
            # from being read as regex escapes.
            whole_word = r'(?<!\w)' + re.escape(anaphor) + r'(?!\w)'
            modified_text[mention.sentenceIndex] = re.sub(
                whole_word,
                lambda _match: antecedent,
                modified_text[mention.sentenceIndex],
            )

    modified_text = ' '.join(modified_text)

2024-04-26 02:39:04 INFO: Writing properties to tmp file: corenlp_server-02734d9b33bc44bb.props
2024-04-26 02:39:04 INFO: Starting server with command: java -Xmx4G -cp C:\Users\ivano\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9003 -timeout 120000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-02734d9b33bc44bb.props -annotators tokenize,ssplit,pos,lemma,ner,parse,coref -preload -outputFormat serialized


In [89]:
modified_text

'Science may be forced both ways by the tendency towards colossal quantities of research – both accelerating and trampling Science progress'

In [2]:
# English Stanza pipeline with tokenisation, multi-word-token expansion,
# POS tagging and coreference resolution.
pipe = stanza.Pipeline(
    lang="en",
    processors="tokenize,coref,mwt,pos",
)

2024-04-26 03:54:46 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-26 03:54:47 INFO: Downloaded file to C:\Users\ivano\stanza_resources\resources.json
2024-04-26 03:54:47 INFO: Loading these models for language: en (English):
| Processor | Package                 |
---------------------------------------
| tokenize  | combined                |
| mwt       | combined                |
| pos       | combined_charlm         |
| coref     | ontonotes_electra-large |

2024-04-26 03:54:47 INFO: Using device: cuda
2024-04-26 03:54:47 INFO: Loading: tokenize
2024-04-26 03:54:48 INFO: Loading: mwt
2024-04-26 03:54:48 INFO: Loading: pos
2024-04-26 03:54:49 INFO: Loading: coref
  return self.fget.__get__(instance, owner)()
2024-04-26 03:54:57 INFO: Done loading processors!


In [3]:
# Sample input 1: two sentences, each containing the possessive pronoun "its".
text = 'The tendency towards colossal quantities of research may force science both ways – both accelerating and trampling its progress. In the context of this paper a concept is defined as an entity and its description(s). '

In [150]:
# Sample input 2: a definition of a mathematical limit (a "[1]" citation marker is
# left in the string). Running this cell rebinds `text`, replacing any earlier sample.
text = 'In mathematics, a limit is the value that a function (or sequence) approaches as the input (or index) approaches some value.[1] Limits are essential to calculus and mathematical analysis, and are used to define continuity, derivatives, and integrals.'

In [4]:
%%time
# Annotate the current `text` with the pipeline; %%time reports how long the call takes.
result = pipe(text)

CPU times: total: 875 ms
Wall time: 1.14 s


In [5]:
# Plain-Python view of the annotations: a nested list of per-token dicts
# (keys such as 'text', 'start_char', 'end_char', 'coref_chains').
dicts = result.to_dict()

In [6]:
dicts

[[{'id': 1,
   'text': 'The',
   'upos': 'DET',
   'xpos': 'DT',
   'feats': 'Definite=Def|PronType=Art',
   'start_char': 0,
   'end_char': 3,
   'coref_chains': []},
  {'id': 2,
   'text': 'tendency',
   'upos': 'NOUN',
   'xpos': 'NN',
   'feats': 'Number=Sing',
   'start_char': 4,
   'end_char': 12,
   'coref_chains': []},
  {'id': 3,
   'text': 'towards',
   'upos': 'ADP',
   'xpos': 'IN',
   'start_char': 13,
   'end_char': 20,
   'coref_chains': []},
  {'id': 4,
   'text': 'colossal',
   'upos': 'ADJ',
   'xpos': 'JJ',
   'feats': 'Degree=Pos',
   'start_char': 21,
   'end_char': 29,
   'coref_chains': []},
  {'id': 5,
   'text': 'quantities',
   'upos': 'NOUN',
   'xpos': 'NNS',
   'feats': 'Number=Plur',
   'start_char': 30,
   'end_char': 40,
   'coref_chains': []},
  {'id': 6,
   'text': 'of',
   'upos': 'ADP',
   'xpos': 'IN',
   'start_char': 41,
   'end_char': 43,
   'coref_chains': []},
  {'id': 7,
   'text': 'research',
   'upos': 'NOUN',
   'xpos': 'NN',
   'feats': 'N

In [153]:
# Group coreferent tokens by the text of their chain's representative mention:
#   {representative_text: [(token_text, start_char, end_char), ...]}
REQUIRED_KEYS = ('text', 'start_char', 'end_char')

resd = {}
for sentence in dicts:
    for token in sentence:
        chains = token.get('coref_chains')
        # Entries without chain information or without character offsets cannot
        # be mapped back onto the source text, so they are skipped explicitly
        # (instead of hiding every possible error behind a blanket `except`).
        if not chains or any(key not in token for key in REQUIRED_KEYS):
            continue
        # Only the first chain attached to a token is taken into account.
        representative = chains[0].to_json()['representative_text']
        resd.setdefault(representative, []).append(
            (token['text'], token['start_char'], token['end_char'])
        )

In [154]:
resd

{}

In [155]:
# Rewrite `text` so that every coreferent token that is not itself one of the
# antecedent's words is followed by " (->antecedent)".
#
# Annotations are collected as (start_char, end_char, replacement) and applied
# in a single left-to-right pass over the original string, so offsets never need
# to be re-mapped and a mention ending at the very last character works too.
annotations = sorted(
    (start, end, f'{word} (->{antecedent})')
    for antecedent, mentions in resd.items()
    for word, start, end in mentions
    # Compare against whole words of the antecedent: a substring test would
    # wrongly skip e.g. the token "it" for the antecedent "an entity".
    if word not in antecedent.split()
)

pieces = []
cursor = 0  # first character of `text` not yet copied to the output
for start, end, annotated in annotations:
    if start < cursor:
        # Overlaps a span that has already been replaced; leave it untouched.
        continue
    pieces.append(text[cursor:start])
    pieces.append(annotated)
    cursor = end
pieces.append(text[cursor:])

modified_text = ''.join(pieces)

In [156]:
modified_text

'In mathematics, a limit is the value that a function (or sequence) approaches as the input (or index) approaches some value.[1] Limits are essential to calculus and mathematical analysis, and are used to define continuity, derivatives, and integrals.'

In [149]:
modified_text

'The tendency towards colossal quantities of research may force science both ways – both accelerating and trampling its (->science) progress. In the context of this paper a concept is defined as an entity and its (->an entity) description(s). '

In [14]:
# Read the whole article into memory. 'utf-8-sig' decodes UTF-8 and drops a leading
# byte-order mark when present; with plain 'utf8' the BOM stays in the text as an
# invisible U+FEFF first character.
with open('Article.txt', 'r', encoding='utf-8-sig') as f:
    article = f.read()

In [16]:
print(article)

﻿
Prediction After a Horizon of Predictability:
Non-Predictable Points and Partial Multi-Step Prediction for Chaotic Time Series

Vasilii A. Gromov, Philip S. Baranov, and Alexandr Yu. Tsybakin
HSE University	
Pokrovskii Boulevard, 11, Moscow, Russian Federation
Vasilii A. Gromov, stroller@rambler.ru (corresponding author:)
Philip S. Baranov, pbaranov1306@gmail.com
Alexandr Yu. Tsybakin, a.tsby@yandex.ru
Abstract. The paper introduces several novel strategies for multi-step prediction of chaotic time series. Generalized z-vectors (irregular embeddings), comprising non-successive observations, make it possible to obtain a fairly large set of possible predicted values for each point to be predicted. Upon examining such a set, it may possible either to calculate a unified (‘final’) predicted value for a particular point (thus making it a ‘predictable’ point) or not (in which case the point becomes ‘unpredictable’). With non-predictable points, the multi-step prediction process is, in fact