In [1]:
import string
import unicodedata
import nltk
from nltk import corpus
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars
import re

In [None]:
# Display the installed NLTK version so the environment behind the saved outputs is on record.
nltk.__version__

In [None]:
# Kept for reference only: nothing else in the visible notebook reads this variable.
# It is a regex template with %(SentEndChars)s and %(NonWord)s placeholders that
# matches a potential sentence-ending character followed (via lookahead) by either
# other punctuation or optional whitespace plus the next token.
# NOTE(review): presumably copied from NLTK's PunktLanguageVars - confirm against the
# installed NLTK version before relying on it.
_period_context_fmt = r"""
    %(SentEndChars)s             # a potential sentence ending
    (?=(?P<after_tok>
        %(NonWord)s              # either other punctuation
        |
        \s*(?P<next_tok>\S+)     # or whitespace and some other token
    ))"""


In [20]:

# ASCII punctuation characters, as provided by the standard library.
PUNC = string.punctuation
# Translation table mapping every character of PUNC to None, so that
# some_str.translate(PUNC_TRANSLATION_TABLE) deletes those characters.
PUNC_TRANSLATION_TABLE = str.maketrans(dict.fromkeys(PUNC))  # OR {key: None for key in string.punctuation}
# Single-character Chinese strings used as stopwords (per the constant's name).
BASE_STOPWORDS = {"的","一","不","在","人","有","是","为","以","于","上","他","而","后","之","来","及","了","因",
                  "下","可","到","由","这","与","也","此","但","并","个","其","已","无","小","我","们","起","最",
                  "再","今","去","好","只","又","或","很","亦","某","把","那","你","乃","它","吧","被","比","别",
                  "趁","当","从","得","打","凡","儿","尔","该","各","给","跟","和","何","还","即","几","既","看",
                  "据","距","靠","啦","另","么","每","嘛","拿","哪","您","凭","且","却","让","仍","啥","如","若",
                  "使","谁","虽","随","同","所","她","哇","嗡","往","些","向","沿","哟","用","咱","则","怎","曾",
                  "至","致","着","诸","自"}
# POS tags grouped under the name NOUNS.
# NOTE(review): these look like the tags returned by nltk.pos_tag, and 'PRP' is
# usually a pronoun tag - confirm that including it is intentional.
NOUNS = ['NN', 'NNS', 'NNP', 'NNPS', 'PRP']
# Not used in the visible cells.
# NOTE(review): presumably forwarded to json.dump(s) as ensure_ascii - confirm.
ENSURE_ASCII = False
# Literal prefix of a wiki category line.
CATEGORY_LINE_START = '[[Category:'
# Regex for a category line: group 1 captures the text after '[[Category:' up to
# (not including) the first '|'.
CATEGORY_LINE_CAPTURE = r'\[\[Category:([^\|]+).*\]\].*'
# https://en.wikipedia.org/wiki/Wikipedia:Namespace
# Lower-case namespace / interwiki prefixes grouped under the name STANDARD_NAMESPACE.
STANDARD_NAMESPACE = {"category", "user", "help", "portal", "draft", "module", "file", "wikipedia", "wiktionary",
                      "wikt", "wp", "wt", "w", "cat", "image", "special", "template", "talk", "centralwikia",
                      "s", "creativecommons", "wikisource"}

##
# Recognize only these namespaces in links
# w: Internal links to the Wikipedia
# wiktionary: Wiki dictionary
# wikt: shortcut for Wiktionary
#
ACCEPTED_NAMESPACE = {'w', 'wiktionary', 'wikt'}


class CustomLangVars(PunktLanguageVars):
    """Punkt language variables with an extended set of sentence-ending characters.

    Overrides ``sent_end_chars`` so that full-width / CJK terminators and
    semicolons are considered alongside the ASCII ones.

    Limitation visible in this notebook's saved outputs: a terminator such as
    '。' that is immediately followed by text (no whitespace) is not split by a
    PunktSentenceTokenizer built with these variables; the ``preprocess`` cell
    further down inserts a space after such terminators as a workaround.
    """
    # Several characters appear to be listed more than once.
    # NOTE(review): this looks harmless, but confirm the repeats are true
    # duplicates and not visually similar code points.
    sent_end_chars = ('.','。','！','？','…','!','?','；',';','﹗','！','!','﹖','？','?')
    

def sent_tokenize(sent):
    """Split text into sentences with NLTK's ``sent_tokenize``.

    Args:
        sent: the text to split.
    Returns:
        Whatever ``nltk.tokenize.sent_tokenize`` returns for ``sent``.
    """
    # TODO: chinese sent tokenized ok ?
    # The notebook only binds the name ``nltk`` (never ``tokenize``), so the
    # function must be reached through the package to avoid a NameError.
    return nltk.tokenize.sent_tokenize(sent)

def sent_offset_tokenize(sent):
    """Return sentence spans of ``sent`` from a Punkt tokenizer using ``CustomLangVars``.

    Args:
        sent: the text to split.
    Returns:
        The result of ``PunktSentenceTokenizer.span_tokenize``; this notebook
        consumes it as (start, end) offset pairs into ``sent``.
    """
    # TODO: chinese sent tokenized ok ?
    # The custom language vars add extra sentence-ending characters so that
    # boundaries are recognised in languages other than English.
    language_vars = CustomLangVars()
    return PunktSentenceTokenizer(lang_vars=language_vars).span_tokenize(sent)

def word_offset_tokenize(sent):
    """Return token spans of ``sent`` from NLTK's ``WhitespaceTokenizer``.

    Args:
        sent: the text to tokenize.
    Returns:
        The result of ``WhitespaceTokenizer().span_tokenize(sent)``.
    """
    # ``tokenize`` is never bound as a bare name in this notebook; go through
    # the imported ``nltk`` package instead.
    return nltk.tokenize.WhitespaceTokenizer().span_tokenize(sent)

def word_tokenize(sent):
    """Split text into word tokens with NLTK's ``word_tokenize``.

    Args:
        sent: the text to tokenize.
    Returns:
        Whatever ``nltk.tokenize.word_tokenize`` returns for ``sent``.
    """
    # ``tokenize`` is never bound as a bare name in this notebook; go through
    # the imported ``nltk`` package instead.
    return nltk.tokenize.word_tokenize(sent)

def pos_tag(tokens):
    """Thin wrapper: tag ``tokens`` with ``nltk.pos_tag`` and return its result."""
    tagged = nltk.pos_tag(tokens)
    return tagged

def ngrams(tags, n):
    """Thin wrapper: build n-grams of size ``n`` over ``tags`` via ``nltk.ngrams``."""
    grams = nltk.ngrams(tags, n)
    return grams

def get_lnrm(s, strip, lower):
    """Convert a string to a lower-cased / normalized ("lnrm") form.

    Adapted from the l(s) normalization described in Section 2.3 of
    http://nlp.stanford.edu/pubs/subctackbp.pdf. Unlike a pure
    "drop everything non-alphanumeric" normalization, word boundaries are kept:
    every whitespace character is treated as a separator.

    Args:
        s: the input value; it is converted with ``str(s)`` before processing.
        strip: when true, decompose the text (NFD), drop combining marks
            (diacritics) and every character that is neither alphanumeric nor
            whitespace.
        lower: when true, lower-case the text.
    Returns:
        ``s`` itself, untouched, when both flags are false. Otherwise the
        normalized string with runs of whitespace collapsed to a single space
        and leading/trailing whitespace removed.
    """
    if not strip and not lower:
        return s
    lnrm = str(s)
    if lower:
        lnrm = lnrm.lower()
    if strip:
        kept = []
        for ch in unicodedata.normalize('NFD', lnrm):
            if ch.isspace():
                # Tabs, newlines, NBSP, ... separate words just like ' ' does;
                # dropping them would glue neighbouring words together.
                kept.append(' ')
            elif ch.isalnum() and not unicodedata.combining(ch):
                kept.append(ch)
        lnrm = ''.join(kept)
    # Collapse duplicate white spaces, e.g. "the  alias    is here", and trim the ends.
    return " ".join(lnrm.split())

class HumanNameParser:
    """Placeholder name parser: keeps the whole input as the first name.

    Attributes set by the constructor:
        first: the ``name`` argument, unchanged.
        last: always the empty string.
    """

    def __init__(self, name):
        # No actual parsing is done; the full name is stored under ``first``.
        self.first, self.last = name, ""

In [21]:
# Sample Chinese text used as input for the tokenizer experiments below. Most sentences
# are separated by '\n', one '。' is followed directly by the next sentence (before
# "1903年"), and the last line ends with an ASCII '.'.
page_text = """埃斯特拉达·帕尔马是在十年战争中一个重要的古巴将军。\n曾被西班牙军队俘虏并流放，在被放逐期间，他前往纽约，在那里他与何塞·马蒂一起工作。\n马蒂死后，埃斯特拉达·帕尔马成为古巴革命党新的领导人。\n他在华盛顿寻求。\n援助，成功地使美国国会通过联合决议，这项法案是导致美国宣战西班牙的其中一个因素，美国介入了战争，史称美西战争，要求古巴摆脱西班牙殖民统治。\n在伦纳德·伍德将军管理古巴几年后，举行了选举。\n以何塞·米格尔·戈麦斯为首的共和党保守派和阿尔弗雷·扎亚斯为首的国家自由派，双方支持埃斯特拉达·帕尔玛。\n他虽然停留的全部时间都在美国，在那儿他是一个公民。\n1901年12月31日埃斯特拉达·帕尔马当选总统，他的政治取向如同美国总统西奥多·罗斯福開明。\n美军离开后，古巴政府签署了一项法案，降低对美国产品关税，并将普拉特修正案纳入自己的宪法。1903年2月16日埃斯特拉达·帕尔玛签订了美国-古巴互惠条约，同意租借关塔那摩湾地区给美国，用作海军基地和煤站。\n1906年埃斯特拉达·帕尔马连任，但这次自由派反对，称选举舞弊。\n美国军事介入，不久帕尔马辞职，抗议罗斯福的意愿，查尔斯·马贡在古巴建立了另一个親美政府，美國对古巴实行军事占领。\n个人生活."""

In [22]:
# These three names are kept from the original cell, but nothing in the visible
# notebook reads them.
# NOTE(review): confirm no later cell depends on them before deleting.
sentence_all_data = []
sent_idx = 0
last_cur_offset = -1

# Print every sentence that the custom Punkt tokenizer finds in page_text.
# Each span is unpacked as (start, end) character offsets into page_text.
# The previous version overrode cur_offset with last_cur_offset whenever that was
# not -1, but last_cur_offset was never reassigned, so the override never fired;
# the unused enumerate() index is dropped as well.
for cur_offset, end_offset in sent_offset_tokenize(page_text):
    sent = page_text[cur_offset:end_offset]
    print(sent)
    print('-----')

埃斯特拉达·帕尔马是在十年战争中一个重要的古巴将军。
-----
曾被西班牙军队俘虏并流放，在被放逐期间，他前往纽约，在那里他与何塞·马蒂一起工作。
-----
马蒂死后，埃斯特拉达·帕尔马成为古巴革命党新的领导人。
-----
他在华盛顿寻求。
-----
援助，成功地使美国国会通过联合决议，这项法案是导致美国宣战西班牙的其中一个因素，美国介入了战争，史称美西战争，要求古巴摆脱西班牙殖民统治。
-----
在伦纳德·伍德将军管理古巴几年后，举行了选举。
-----
以何塞·米格尔·戈麦斯为首的共和党保守派和阿尔弗雷·扎亚斯为首的国家自由派，双方支持埃斯特拉达·帕尔玛。
-----
他虽然停留的全部时间都在美国，在那儿他是一个公民。
-----
1901年12月31日埃斯特拉达·帕尔马当选总统，他的政治取向如同美国总统西奥多·罗斯福開明。
-----
美军离开后，古巴政府签署了一项法案，降低对美国产品关税，并将普拉特修正案纳入自己的宪法。1903年2月16日埃斯特拉达·帕尔玛签订了美国-古巴互惠条约，同意租借关塔那摩湾地区给美国，用作海军基地和煤站。
-----
1906年埃斯特拉达·帕尔马连任，但这次自由派反对，称选举舞弊。
-----
美国军事介入，不久帕尔马辞职，抗议罗斯福的意愿，查尔斯·马贡在古巴建立了另一个親美政府，美國对古巴实行军事占领。
-----
个人生活.
-----


In [23]:

# Baseline experiment: English text in which the first sentence ends with a CJK
# full stop ('。') that is immediately followed by the next word, tokenized WITHOUT
# any preprocessing.
text = '''It is thought to spread mainly from person to person。mainly through respiratory droplets produced when an infected person coughs or sneezes. These droplets can land in the mouths or noses of people who are nearby or possibly be inhaled into the lungs. Spread is more likely when people are in close contact with one another (within about 6 feet). It may be possible that a person can get COVID-19 by touching a surface or object that has the virus on it and then touching their own mouth, nose, or possibly their eyes. This is not thought to be the main way the virus spreads, but we are still learning more about this virus.'''

tokenizer = PunktSentenceTokenizer(lang_vars=CustomLangVars())
for sent in tokenizer.tokenize(text):
    # One sentence per block, followed by a separator line.
    print(sent, '---', sep='\n')

It is thought to spread mainly from person to person。mainly through respiratory droplets produced when an infected person coughs or sneezes.
---
These droplets can land in the mouths or noses of people who are nearby or possibly be inhaled into the lungs.
---
Spread is more likely when people are in close contact with one another (within about 6 feet).
---
It may be possible that a person can get COVID-19 by touching a surface or object that has the virus on it and then touching their own mouth, nose, or possibly their eyes.
---
This is not thought to be the main way the virus spreads, but we are still learning more about this virus.
---


In [24]:
# Characters that end a sentence for split_to_sentences (includes the newline).
sentence_delimiters = ".。！？……!?\n；;﹗！!﹖？?"

def split_to_sentences(text, method="re"):
    """Split ``text`` into sentences at any character of ``sentence_delimiters``.

    Each returned sentence keeps its trailing run of delimiter characters, and
    any whitespace between sentences stays attached to the start of the
    following sentence.

    Args:
        text: the text to split.
        method: currently ignored; kept so existing callers keep working.
    Returns:
        ``[]`` for empty input. Otherwise the list of sentences in order. Text
        after the last delimiter is returned as a final sentence unless it is
        whitespace only. If nothing else was found, ``[text]``.
    """
    if not text:
        return []

    # Escape the delimiters so that a future ']' / '^' / '-' / '\\' in
    # sentence_delimiters cannot break the character class.
    delimiter_class = re.escape(sentence_delimiters)
    sents = []
    consumed = 0
    for match in re.finditer(f"[^{delimiter_class}]*[{delimiter_class}]+", text):
        sents.append(match.group())
        consumed = match.end()

    # Matches are contiguous from index 0, so whatever is left has no delimiter.
    # Keep it instead of silently dropping an unterminated last sentence.
    tail = text[consumed:]
    if tail.strip():
        sents.append(tail)
    return sents if sents else [text]

#re.sub('(every).*?(\\s.*\\.)',r'\1body\2',src) # everybody is alright.

def preprocess(raw_text):
    """Ensure exactly one space follows a sentence terminator that precedes a word.

    A terminator ('…', '.', '。', '?', '﹖', '？', '!', '﹗', '！') that is followed
    by a word character - either directly or after one whitespace character -
    is rewritten as "<terminator><space>". This gives the Punkt tokenizer a
    whitespace boundary to split on.

    A '.' that sits directly between two digits (e.g. "3.14") is left untouched
    so that decimal numbers are not broken apart.

    Args:
        raw_text: the text to preprocess.
    Returns:
        The text with the spacing after terminators normalized as described.
    """
    return re.sub(
        # Alternatives 2 and 3 accept '.' unless it has a digit on BOTH sides.
        r'([…。?﹖？!﹗！]|(?<!\d)\.|\.(?!\d))\s?(?=\w)',
        r'\1 ',
        raw_text,
    )

In [25]:
# Show the raw sample text for comparison with the tokenizer output above.
print(page_text)

埃斯特拉达·帕尔马是在十年战争中一个重要的古巴将军。
曾被西班牙军队俘虏并流放，在被放逐期间，他前往纽约，在那里他与何塞·马蒂一起工作。
马蒂死后，埃斯特拉达·帕尔马成为古巴革命党新的领导人。
他在华盛顿寻求。
援助，成功地使美国国会通过联合决议，这项法案是导致美国宣战西班牙的其中一个因素，美国介入了战争，史称美西战争，要求古巴摆脱西班牙殖民统治。
在伦纳德·伍德将军管理古巴几年后，举行了选举。
以何塞·米格尔·戈麦斯为首的共和党保守派和阿尔弗雷·扎亚斯为首的国家自由派，双方支持埃斯特拉达·帕尔玛。
他虽然停留的全部时间都在美国，在那儿他是一个公民。
1901年12月31日埃斯特拉达·帕尔马当选总统，他的政治取向如同美国总统西奥多·罗斯福開明。
美军离开后，古巴政府签署了一项法案，降低对美国产品关税，并将普拉特修正案纳入自己的宪法。1903年2月16日埃斯特拉达·帕尔玛签订了美国-古巴互惠条约，同意租借关塔那摩湾地区给美国，用作海军基地和煤站。
1906年埃斯特拉达·帕尔马连任，但这次自由派反对，称选举舞弊。
美国军事介入，不久帕尔马辞职，抗议罗斯福的意愿，查尔斯·马贡在古巴建立了另一个親美政府，美國对古巴实行军事占领。
个人生活.


In [26]:
# Same English sample as before, but run through preprocess() first so that the
# CJK full stop is followed by a space before Punkt sees it.
text = '''It is thought to spread mainly from person to person。mainly through respiratory droplets produced when an infected person coughs or sneezes. These droplets can land in the mouths or noses of people who are nearby or possibly be inhaled into the lungs. Spread is more likely when people are in close contact with one another (within about 6 feet). It may be possible that a person can get COVID-19 by touching a surface or object that has the virus on it and then touching their own mouth, nose, or possibly their eyes. This is not thought to be the main way the virus spreads, but we are still learning more about this virus.'''
# Alternative kept for reference: split_to_sentences(preprocess(text))
tokenizer = PunktSentenceTokenizer(lang_vars=CustomLangVars())
for sent in tokenizer.tokenize(preprocess(text)):
    # One sentence per block, followed by a separator line.
    print(sent, '---', sep='\n')

It is thought to spread mainly from person to person。
---
mainly through respiratory droplets produced when an infected person coughs or sneezes.
---
These droplets can land in the mouths or noses of people who are nearby or possibly be inhaled into the lungs.
---
Spread is more likely when people are in close contact with one another (within about 6 feet).
---
It may be possible that a person can get COVID-19 by touching a surface or object that has the virus on it and then touching their own mouth, nose, or possibly their eyes.
---
This is not thought to be the main way the virus spreads, but we are still learning more about this virus.
---


In [None]:

# Normalize spacing after sentence terminators in the Chinese sample, then split it
# with the regex-based splitter; the list is the cell's displayed value.
filtered_text = preprocess(page_text)
split_to_sentences(filtered_text)

In [None]:
# Show the preprocessed Chinese sample.
# NOTE(review): the output saved under this cell is the English tokenizer output, not
# filtered_text - it looks stale; re-run the notebook top to bottom before sharing.
print(filtered_text)

It is thought to spread mainly from person to person。
---
mainly through respiratory droplets produced when an infected person coughs or sneezes.
---
These droplets can land in the mouths or noses of people who are nearby or possibly be inhaled into the lungs.
---
Spread is more likely when people are in close contact with one another (within about 6 feet).
---
It may be possible that a person can get COVID-19 by touching a surface or object that has the virus on it and then touching their own mouth, nose, or possibly their eyes.
---
This is not thought to be the main way the virus spreads, but we are still learning more about this virus.
---
