## 英文斷詞

In [1]:
s = 'this is a book'
s.split()

['this', 'is', 'a', 'book']

## 中文斷詞

In [11]:
import jieba
jieba.cut('酸民婉君也可以報名嗎?')

<generator object Tokenizer.cut at 0x104638408>

In [12]:
list(jieba.cut('酸民婉君也可以報名嗎?'))

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/46/b7dzk4mn6g54qzptv608w7d00000gn/T/jieba.cache
Loading model cost 1.133 seconds.
Prefix dict has been built successfully.


['酸民婉君', '也', '可以', '報名', '嗎', '?']

## Generator (補充資料)

generator是一種特別的iterator，可以透過yield 讓我們自行控制iterator 的狀態，決定是否要呼叫__next__讓他進行迭代行為。Generator 採取Lazy Evaluation，真正需要資料時才進行計算。

優點：

1. 不需要定義__next，提高程式碼可讀性
2. 減少記憶體的使用
3. 可以用'yield from'實作遞迴的迭代行為
4. 可以建立一個可拆式的資料管線，減少了不同工作階段的耦合性

## Jieba 斷詞

In [13]:
import jieba
seg_list = jieba.cut("大巨蛋案對市府同仁下封口令？　柯P否認", cut_all=True)
print("Full Mode:", "/ ".join(seg_list))

Full Mode: 大/ 巨蛋/ 案/ 對/ 市府/ 同仁/ 下/ 封口/ 封口令/ 口令/ ？/ 　/ / 柯/ P/ 否/ 認


In [14]:
seg_list = jieba.cut("大巨蛋案對市府同仁下封口令？　柯P否認", cut_all=False)
print("Default Mode:", "/ ".join(seg_list))

Default Mode: 大/ 巨蛋/ 案對/ 市府/ 同仁/ 下/ 封口令/ ？/ 　/ 柯/ P/ 否認


In [16]:
jieba.get_dict_file().name

'/Users/davidchiu/.pyenv/versions/3.7.3/lib/python3.7/site-packages/jieba/dict.txt'

In [18]:
jieba.load_userdict("userdict.txt")

## Jieba 繁體
-  https://github.com/ldkrsi/jieba-zh_TW

In [19]:
import jieba.posseg as pseg
words = pseg.cut("大巨蛋案對市府同仁下封口令？　柯P否認")
for w in words:
    print(w.word, w.flag)


大 a
巨蛋 n
案 ng
對 p
市府 n
同仁 nr
下 f
封口令 n
？ x
　 x
柯 nr
P eng
否認 v


In [20]:
sentence = "大巨蛋案對市府同仁下封口令？　柯P否認"
words = jieba.cut(sentence, cut_all=False)
print("/ ".join(words))

jieba.add_word('柯P',100, 'nr')
jieba.add_word('大巨蛋',100, 'ns')


大/ 巨蛋/ 案對/ 市府/ 同仁/ 下/ 封口令/ ？/ 　/ 柯/ P/ 否認


## Ckiptagger
- https://github.com/ckiplab/ckiptagger

In [None]:
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
ws = WS("./data")
pos = POS("./data")
ner = NER("./data")
sentence_list = [
    "全聯福利中心…",
]

word_sentence_list = ws(
    sentence_list,
)
pos_sentence_list = pos(word_sentence_list)
entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

In [None]:
word_to_weight = {
    "土地公": 1,
    "土地婆": 1,
    "公有": 2,
    "": 1,
    "來亂的": "啦",
    "緯來體育台": 1,
}
dictionary = construct_dictionary(word_to_weight)
print(dictionary)

## NLTK

In [23]:
import nltk
nltk.download('punkt')
text = """On the day when the government's top infectious disease specialist Anthony Fauci said he would not be surprised to see the US record 100,000 new coronavirus cases per day, Trump refused to break his deafening silence."""

sentences = nltk.sent_tokenize(text)



ModuleNotFoundError: No module named 'nltk'

In [None]:
from nltk.tokenize import word_tokenize
tokens = [word_tokenize(sent) for sent in sentences]



In [None]:
import nltk.stem
s = nltk.stem.SnowballStemmer('english')
s.stem('graphics')
s.stem("imaging")
s.stem("image")
s.stem("imagination")
s.stem("imagine")



## Spacy

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text)

## 文字雲

In [None]:
from wordcloud import WordCloud
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
wordcloud.generate(long_string)
wordcloud.to_image()