refer to 
* https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb#scrollTo=dsBo6RCtQmwx

In [1]:
import pandas as pd
import tensorflow as tf

2022-06-30 12:42:06.665239: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/lib
2022-06-30 12:42:06.665284: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
from tensorflow import keras
import os
import re

In [3]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  dataset = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz", 
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
      extract=True)
  
  train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                       "aclImdb", "train"))
  test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                      "aclImdb", "test"))
  
  return train_df, test_df


In [4]:
train, test = download_and_load_datasets()

In [5]:
train.shape

(25000, 3)

In [6]:
train.head()

Unnamed: 0,sentence,sentiment,polarity
0,This movie has one of the cheesiest plots I ha...,10,1
1,I really liked this movie...it was cute. I enj...,8,1
2,The Legend of Bloody Jack is set in the Alaska...,3,0
3,This is the worst movie I have ever seen. I wa...,1,0
4,This movie seemed like it was going to be bett...,2,0


In [7]:
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

In [8]:
import tokenization

In [9]:
tokenizer = tokenization.FullTokenizer(
      vocab_file="../vocab.txt")  # 中文BERT以字为最终的token

In [10]:
tokenizer.tokenize('woshssDfsfd')

['w', '##os', '##hs', '##sd', '##fs', '##f', '##d']

In [11]:
tokenizer.tokenize('woshssdfsfd')

['w', '##os', '##hs', '##sd', '##fs', '##f', '##d']

In [50]:
type(tokenizer.tokenize('大美边疆'))

list

In [13]:
tokenizer.tokenize('哎呦喂hellod')

['哎', '呦', '喂', 'hello', '##d']

In [14]:
from transformers import BertTokenizer

In [15]:
hf_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

In [16]:
hf_tokenizer.tokenize("哎呦喂hellod")

['哎', '呦', '喂', 'hello', '##d']

In [17]:
hf_tokenizer.tokenize("大美边疆")

['大', '美', '边', '疆']

In [18]:
hf_tokenizer.vocab_size

21128

In [19]:
import re
# 句子分割, sentence segmentation
# https://stackoverflow.com/questions/27441191/splitting-chinese-document-into-sentences

def zng(paragraph):
    for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U):
        yield sent

# list(zng(paragraph))

In [44]:
paragraph = "比尔刚刚年满29岁，在此前结束的赛季中，受伤病困扰，他场均只得到23.2分4.7篮板6.6助攻。而在此之前的两个赛季中，他的场均得分均在30分以上。"

In [45]:
print(paragraph)

比尔刚刚年满29岁，在此前结束的赛季中，受伤病困扰，他场均只得到23.2分4.7篮板6.6助攻。而在此之前的两个赛季中，他的场均得分均在30分以上。


In [46]:
list(zng(paragraph))

['比尔刚刚年满29岁，在此前结束的赛季中，受伤病困扰，他场均只得到23.',
 '2分4.',
 '7篮板6.',
 '6助攻。',
 '而在此之前的两个赛季中，他的场均得分均在30分以上。']

In [29]:
!pip3 install spacy

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.1/130.1 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting

还是使用spacy的sentence recognizer，效果更好点

In [32]:
!python3 -m spacy download zh_core_web_md

2022-06-30 12:46:25.264839: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/lib
I0000 00:00:1656564386.738417  564582 tpu_initializer_helper.cc:165] libtpu.so already in use by another process probably owned by another user. Run "$ sudo lsof -w /dev/accel0" to figure out which process is using the TPU. Not attempting to load libtpu.so in this process.
Defaulting to user installation because normal site-packages is not writeable
Collecting zh-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_md-3.3.0/zh_core_web_md-3.3.0-py3-none-any.whl (77.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting spacy-pkuseg<0.1.0,>=0.0.27
  Downloading spacy_pkus

In [33]:
import spacy

nlp = spacy.load("zh_core_web_md")

In [40]:
# paragraph = "取消“星号”标记，不会因此造成新冠传播风险的增加。“星号”标记仅仅提示某人来自有高风险区或中风险区的城市，并没有与具体的高风险区或中风险区直接挂钩，即不能反映“星号”标记者的实际风险高低。随着科学防控、精准施策的不断完善，对于高风险区/中风险区的划分和防控管理要求更加精细化、精准化，通信行程卡“星号”标记的提示作用，在新防控策略的具体实施中的难以发挥作用。"
paragraph = "“从警28年，我的经历很简单，就是在一线做巡逻警员、冲锋队警员。”香港警务处新界南总区警署警长陈连生向《环球时报》记者回忆，1997年7月1日，刚进入警队3年的他在会场外做安保工作。“当时很开心，在很近的距离看漫天烟花。我们每人口袋里都准备了新帽徽，0点一到，指挥官一声命令，就很小心地将新帽徽安上。那一刻我真觉得自己是很完整、很真实的一个中国人！”"

In [47]:
doc = nlp(paragraph)

In [48]:
# assert doc.has_annotation("SENT_START")
for sent in doc.sents:
    print(sent.text)

比尔刚刚年满29岁，在此前结束的赛季中，受伤病困扰，他场均只得到23.
2分4.7篮板6.6助攻。
而在此之前的两个赛季中，他的场均得分均在30分以上。
