# Ch 7. 機器學習應用
## 7-1. 自然語言處理
[7-1-1. 基本操作](#sec1)  
[7-1-2. 詞袋模型](#sec2)
***

<a id='sec1'></a>
## 7-1-1. 基本操作

In [1]:
texts = ['Pikachu is a short, Electric-type Pokémon introduced in Generation I!!!', 
         'It is covered in yellow fur with two horizontal brown stripes on its back. It has a small mouth, long, pointed ears with black tips, and brown eyes.',
         'It evolves from Pichu when leveled up with high friendship and evolves into Raichu.']
len(texts)

3

### 去除標點符號

In [2]:
import unicodedata
import sys

punctuation = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))
texts_no_punct = [s.translate(punctuation) for s in texts]
texts_no_punct

['Pikachu is a short Electrictype Pokémon introduced in Generation I',
 'It is covered in yellow fur with two horizontal brown stripes on its back It has a small mouth long pointed ears with black tips and brown eyes',
 'It evolves from Pichu when leveled up with high friendship and evolves into Raichu']

### 取出每個詞或句子

In [3]:
import nltk

# 第一次載入 nltk時，要先下載一些文件(需要等一會)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# 下載一組停止詞
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yclin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yclin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yclin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.tokenize import word_tokenize

words_lst = [word_tokenize(t) for t in texts_no_punct]
print(words_lst[0])

['Pikachu', 'is', 'a', 'short', 'Electrictype', 'Pokémon', 'introduced', 'in', 'Generation', 'I']


In [5]:
from nltk.tokenize import sent_tokenize

sent_tokenize(texts[1])

['It is covered in yellow fur with two horizontal brown stripes on its back.',
 'It has a small mouth, long, pointed ears with black tips, and brown eyes.']

### 移除停止詞

In [6]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
print('停止詞：', stop_words[:5])
print('停止詞數量 =', len(stop_words))

for i in range(len(words_lst)):
    words_lst[i] = [w for w in words_lst[i] if w not in stop_words]

print(texts_no_punct[0])
print(words_lst[0])

停止詞： ['i', 'me', 'my', 'myself', 'we']
停止詞數量 = 179
Pikachu is a short Electrictype Pokémon introduced in Generation I
['Pikachu', 'short', 'Electrictype', 'Pokémon', 'introduced', 'Generation', 'I']


### 取出詞幹(stem)

In [7]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
stem_lst = []

for words in words_lst:
   stem_lst.append([porter.stem(w) for w in words])

print(words_lst[0])
print(stem_lst[0])

['Pikachu', 'short', 'Electrictype', 'Pokémon', 'introduced', 'Generation', 'I']
['pikachu', 'short', 'electrictyp', 'pokémon', 'introduc', 'gener', 'I']


### 標記詞性(Part-of-Speech, POS)
詞類標籤可參考 https://www.itread01.com/content/1547007495.html

In [8]:
from nltk import pos_tag

words_tag_lst = [pos_tag(w) for w in words_lst]
print(words_tag_lst[0])

[('Pikachu', 'NNP'), ('short', 'JJ'), ('Electrictype', 'NNP'), ('Pokémon', 'NNP'), ('introduced', 'VBD'), ('Generation', 'NNP'), ('I', 'PRP')]


In [9]:
# 搜尋特定詞類
[w for w, tag in words_tag_lst[0] if tag in ['NNP']]

['Pikachu', 'Electrictype', 'Pokémon', 'Generation']

### 轉換為詞類特徵向量

In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

tag_lst= []
for words_tag in words_tag_lst:
    tag_lst.append([tag for word, tag in words_tag])

mlb = MultiLabelBinarizer()
mlb.fit_transform(tag_lst)

array([[0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1]])

In [11]:
# 顯示特徵名稱
mlb.classes_

array(['CD', 'IN', 'JJ', 'NN', 'NNP', 'NNS', 'PRP', 'RB', 'VBD', 'VBP',
       'VBZ'], dtype=object)

In [12]:
data = mlb.fit_transform([{'皮卡丘', '雷丘'}, {'小火龍', '噴火龍'}, {'傑尼龜'}])
print(data)
list(mlb.classes_)

[[0 0 0 1 1]
 [0 1 1 0 0]
 [1 0 0 0 0]]


['傑尼龜', '噴火龍', '小火龍', '皮卡丘', '雷丘']

<a id='sec2'></a>
## 7-1-2. 詞袋模型

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['This is a small document.', 
          'Pokémon document is the second document.',
          'Pikachu is a small and Electric-type Pokémon.',
          'Is this the first document?']

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

['document', 'electric', 'pikachu', 'pokémon', 'second', 'small', 'type']


In [14]:
import pandas as pd

df_vec = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df_vec

Unnamed: 0,document,electric,pikachu,pokémon,second,small,type
0,1,0,0,0,0,1,0
1,2,0,0,1,1,0,0
2,0,1,1,1,0,1,1
3,1,0,0,0,0,0,0


In [15]:
vectorizer2 = CountVectorizer(stop_words='english', ngram_range=(2, 3))
X2 = vectorizer2.fit_transform(corpus)
print(vectorizer2.get_feature_names())

['document second', 'document second document', 'electric type', 'electric type pokémon', 'pikachu small', 'pikachu small electric', 'pokémon document', 'pokémon document second', 'second document', 'small document', 'small electric', 'small electric type', 'type pokémon']


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(stop_words='english')
X = tf_idf.fit_transform(corpus)
print(tf_idf.get_feature_names())
tf_idf.idf_

['document', 'electric', 'pikachu', 'pokémon', 'second', 'small', 'type']


array([1.22314355, 1.91629073, 1.91629073, 1.51082562, 1.91629073,
       1.51082562, 1.91629073])

In [17]:
df_tf_idf = pd.DataFrame(X.toarray(), columns=tf_idf.get_feature_names())
df_tf_idf

Unnamed: 0,document,electric,pikachu,pokémon,second,small,type
0,0.629228,0.0,0.0,0.0,0.0,0.777221,0.0
1,0.707981,0.0,0.0,0.437249,0.554595,0.0,0.0
2,0.0,0.485461,0.485461,0.382743,0.0,0.382743,0.485461
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
import numpy as np
from sklearn.preprocessing import normalize

index = ['document', 'pokémon', 'second']
tf = np.array(df_vec.loc[1, index])/df_vec.loc[1, :].sum()
idf = np.array([tf_idf.idf_[tf_idf.vocabulary_[w]]for w in index])
tf_idf_doc1 = tf*idf
normalized = normalize(tf_idf_doc1.reshape(1,-1), norm='l2').ravel()

dct = {'TF': tf,
       'IDF': idf, 
       'TF-IDF': tf_idf_doc1, 
       '正規化': normalized}
df_doc1 = pd.DataFrame(dct, index=index)
df_doc1

Unnamed: 0,TF,IDF,TF-IDF,正規化
document,0.5,1.223144,0.611572,0.707981
pokémon,0.25,1.510826,0.377706,0.437249
second,0.25,1.916291,0.479073,0.554595
