# Chapter 5

### 名目類型特徵編碼(LabelBinarizer)

In [16]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
# A single nominal (unordered) feature: one state name per observation.
feature = np.array([["Texas"], ["California"], ["Texas"], ["Delaware"], ["Texas"]])
# Despite the variable name, the encoder used here is a LabelBinarizer.
one_hot = LabelBinarizer()

In [18]:
# Encode the feature with the binarizer: one indicator column per class.
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [20]:
# Inspect the classes learned during fit.
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [22]:
# Reverse the encoding: map the indicator rows back to the original labels.
# Note that the inner fit_transform call re-fits the binarizer.
one_hot.inverse_transform(one_hot.fit_transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

### 甚至可以使用Pandas做

In [24]:
# Display the raw feature array again before encoding it with pandas.
feature

array([['Texas'],
       ['California'],
       ['Texas'],
       ['Delaware'],
       ['Texas']], dtype='<U10')

In [30]:
import pandas as pd
pd.get_dummies(feature[:, 0]) # select the only column as 1-D; the result comes back directly as a DataFrame

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


### 多重特徵編碼

In [32]:
from sklearn.preprocessing import MultiLabelBinarizer
# Each observation carries several labels at once (a tuple of state names).
# NOTE(review): "Delware" looks like a typo for "Delaware". It is runtime data
# that determines the learned class names, so it is left untouched here.
muticlass_feature = [("Texas", "Florida"),
                    ("California", "Alabama"),
                    ("Texas", "Florida"),
                    ("Delware", "Florida"),
                    ("Texas", "Alabama")]
# Multi-label encoder: one indicator column per distinct label.
one_hot_multicalass = MultiLabelBinarizer()
one_hot_multicalass.fit_transform(muticlass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [33]:
# Classes learned by the multi-label binarizer.
one_hot_multicalass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

###  有序類型特徵編碼

In [37]:
# Ordinal feature: the categories have a natural order (Low < Medium < High).
df = pd.DataFrame({"Score":["Low", "Low", "Medium", "Medium", "High"]})
# Explicit category -> rank mapping that encodes that order.
scale_mapper = {"Low": 1, "Medium":2, "High":3 }
# Series.map is the idiomatic way to apply a lookup table and yields an integer
# column directly. Series.replace relied on implicit downcasting of the object
# column, which newer pandas releases deprecate.
# Caveat: a category missing from scale_mapper becomes NaN instead of being
# left as its original string.
df["Score"].map(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

### 特徵字典編碼 

In [49]:
from sklearn.feature_extraction import DictVectorizer

# Each dict is one observation mapping a feature name to its value.
data_dict = [{"Red":2, "Blue":4}, {"Red":4, "Blue":3}, {"Red":1, "Yellow":2}, {"Red":2, "Yellow":2}]
dictvectorizer = DictVectorizer(sparse=False) # dense output: easy to display, but uses MORE memory than the default sparse matrix
# Learn the feature names and build the feature matrix in one step.
features = dictvectorizer.fit_transform(data_dict)
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [51]:
# Retrieve the feature (column) names learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the replacement when it exists and fall back to
# the old method on older installs. .tolist() keeps the result a plain list.
if hasattr(dictvectorizer, "get_feature_names_out"):
    feature_names = dictvectorizer.get_feature_names_out().tolist()
else:
    feature_names = dictvectorizer.get_feature_names()
feature_names

['Blue', 'Red', 'Yellow']

In [52]:
# Wrap the dense matrix in a DataFrame labelled with the feature names.
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [53]:
# Suppose we have four documents (NLP use case); each dict holds the
# per-document counts keyed by word.
doc_1 = {"Red":2, "Blue":4}
doc_2 = {"Red":4, "Blue":3}
doc_3 = {"Red":1, "Yellow":2}
doc_4 = {"Red":2, "Yellow":2}
doc_word_counts = [doc_1, doc_2, doc_3, doc_4]
# Re-fit the same vectorizer on the word counts to get a document-term matrix.
dictvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

### 缺漏類型值的推算 

In [57]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Complete observations: column 0 is the categorical class, columns 1-2 are
# the numeric features used to predict it.
X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1,-0.21,-1.19]])
# Observations whose class (column 0) is missing.
X_with_nan = np.array([[np.nan, 0.87, 1.31],
                       [np.nan,-0.67,-0.22]])
# 3-nearest-neighbours classifier with distance-weighted votes.
clf = KNeighborsClassifier(3, weights ='distance')
# Train on the complete rows: features are columns 1+, target is column 0.
trained_model = clf.fit(X[:, 1:], X[:, 0])
# Predict the missing class values from the other features.
imputed_values = trained_model.predict(X_with_nan[:, 1:])
# Join the predicted class column with the original feature columns.
X_with_imputed = np.hstack((imputed_values.reshape(-1, 1), X_with_nan[:, 1:]))
# Stack the imputed rows on top of the complete rows.
np.vstack((X_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

#### 用特徵頻率最高的值來補空值 

In [60]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22. SimpleImputer is its replacement; it always imputes column-wise, so
# the old axis=0 argument is no longer needed.
from sklearn.impute import SimpleImputer
X_complete = np.vstack((X_with_nan, X)) # stack the rows with missing values on top of the complete rows
# Fill each missing value with the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

#### Notes:最好使用機器學習演算法來預測及填補資料，最常用KNN

### 處理高度不平衡類型的目標向量 

In [64]:
# 生成資料
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

In [79]:
# Load the iris data set and split it into feature matrix and target vector.
iris = load_iris()
features = iris.data
target = iris.target

In [80]:
# Drop the first 40 observations so that class 0 becomes rare.
# Slice from the pristine iris arrays rather than from `features`/`target`
# themselves: the cell is then idempotent and re-running it does not silently
# discard another 40 rows each time.
features = iris.data[40:, :]
target = iris.target[40:]
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [81]:
# Collapse to a binary target: class 0 stays 0, every other class becomes 1.
target = (target != 0).astype(int)
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [86]:
# Explicit class weights: the rare class 0 is weighted more heavily than class 1.
weights={0: .9, 1: 0.1}
RandomForestClassifier(class_weight=weights)
# Alternatively, have the weights derived automatically, inversely proportional to class frequency:
# RandomForestClassifier(class_weight='balanced')

RandomForestClassifier(bootstrap=True, class_weight={0: 0.9, 1: 0.1},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators='warn', n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

#### Notes:或者我們可以向下取樣，針對多數類型進行隨機取樣以產生相當於少數類型的新子集 

#### 向下取樣

In [103]:
# Compare with the next cell: np.where called with only a condition returns a
# tuple of index arrays, not the index array itself.
i_class0 = np.where(target==0)
i_class0

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64),)

In [104]:
# Take element [0] of the tuple to get the plain index array for each class.
i_class0 = np.where(target==0)[0]
i_class1 = np.where(target==1)[0]
i_class0

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)

In [105]:
# Number of observations in class 0 and class 1 (shows the imbalance).
print(len(i_class0), len(i_class1))

10 100


In [106]:
# Downsample the majority class: draw as many class-1 indices as there are
# class-0 observations, without replacement (replace=False).
# A fixed seed makes the draw reproducible across Restart & Run All.
i_class1_downsampled = np.random.RandomState(0).choice(i_class1, size=len(i_class0), replace=False)

In [107]:
# Balanced target vector: all class-0 labels followed by the sampled class-1 labels.
np.hstack((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [109]:
# Matching feature matrix, stacked in the same order; show the first five rows.
np.vstack((features[i_class0, :], features[i_class1_downsampled, :]))[0:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

#### 向上取樣

In [113]:
# Upsample the minority class: draw class-0 indices WITH replacement
# (replace=True) until there are as many as class-1 observations.
# A fixed seed makes the draw reproducible across Restart & Run All.
i_class0_upsampled = np.random.RandomState(0).choice(i_class0, size=len(i_class1), replace=True)
# Balanced target vector (not displayed: only the cell's last expression is shown).
np.hstack((target[i_class0_upsampled], target[i_class1]))
# np.concatenate works as well:
# np.concatenate((target[i_class0_upsampled], target[i_class1]))
# Matching feature matrix; show the first five rows.
np.vstack((features[i_class0_upsampled, :], features[i_class1, :]))[0:5]

array([[5. , 3.5, 1.6, 0.6],
       [5. , 3.5, 1.6, 0.6],
       [5. , 3.5, 1.6, 0.6],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6]])

### 處理不平衡的策略 

1. 增加數據
2. 使用模型評估指標(Confusion Matrix, F1-score, ROC curves)
3. 使用類型權重參數，(class_weight=?)
4. 向上或是向下取樣(通常兩種都會試)

# Chapter 6

### 文本清理

In [118]:
# Example: raw text with stray leading/trailing whitespace and periods.
text_data = ["   Hello. By Shao Yun   ",
             "Parking And Gaming. By Hong Lee",
             "    Today Is the night. By Zhao Zhoo"]
text_data

['   Hello. By Shao Yun   ',
 'Parking And Gaming. By Hong Lee',
 '    Today Is the night. By Zhao Zhoo']

In [119]:
# Strip leading and trailing whitespace from every string.
strip_whitespace = list(map(str.strip, text_data))
strip_whitespace

['Hello. By Shao Yun',
 'Parking And Gaming. By Hong Lee',
 'Today Is the night. By Zhao Zhoo']

In [120]:
# Delete every period from the stripped strings.
remove_periods = [text.replace(".", "") for text in strip_whitespace]
remove_periods

['Hello By Shao Yun',
 'Parking And Gaming By Hong Lee',
 'Today Is the night By Zhao Zhoo']

In [121]:
def capitalizer(string: str) -> str:
    """Return ``string`` converted entirely to upper case.

    The signature also demonstrates function annotations.
    """
    upper_cased = string.upper()
    return upper_cased

In [122]:
# Upper-case every cleaned string.
[capitalizer(string) for string in remove_periods]

['HELLO BY SHAO YUN',
 'PARKING AND GAMING BY HONG LEE',
 'TODAY IS THE NIGHT BY ZHAO ZHOO']

In [126]:
# Regex example
import re
def replace_letters_with_X(string: str) -> str:
    """Return ``string`` with every ASCII letter replaced by ``"X"``."""
    # A single character class covers both cases. Note that [a-z][A-Z] is not
    # equivalent: it matches a lowercase letter followed by an uppercase one.
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub("X", string)
# Mask every letter in each cleaned string.
[replace_letters_with_X(string) for string in remove_periods]

['XXXXX XX XXXX XXX',
 'XXXXXXX XXX XXXXXX XX XXXX XXX',
 'XXXXX XX XXX XXXXX XX XXXX XXXX']

### HTML 解析與清理 

In [3]:
from bs4 import BeautifulSoup
# Small HTML fragment: a div with class "full_name" wrapping a bold span.
html = """
        <div class ='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>
          """
# Parse the fragment using the 'lxml' parser.
soup = BeautifulSoup(html, 'lxml')

In [5]:
# Find the div whose class is "full_name" and read its text content.
soup.find("div", {"class": "full_name"}).text

'Masego Azra'

### 標點符號的移除

In [16]:
import unicodedata
import sys
text_data = ["Hi!!!! I. Love. This Song....", "10000% Agree!!!! #LoveIT", "Right?!?!"]
# Build a translation table whose keys are the code points of every Unicode
# punctuation character (general category starting with 'P'), each mapped to
# None. range() excludes its stop value, so sys.maxunicode + 1 is needed to
# cover the last code point as well.
punctuation = dict.fromkeys(i for i in range(sys.maxunicode + 1) if unicodedata.category(chr(i)).startswith('P'))
# str.translate deletes every character that the table maps to None.
[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

In [18]:
# The table's keys are Unicode code points and every value is None, so
# str.translate removes each punctuation character it finds in a string.
punctuation

{33: None,
 34: None,
 35: None,
 37: None,
 38: None,
 39: None,
 40: None,
 41: None,
 42: None,
 44: None,
 45: None,
 46: None,
 47: None,
 58: None,
 59: None,
 63: None,
 64: None,
 91: None,
 92: None,
 93: None,
 95: None,
 123: None,
 125: None,
 161: None,
 167: None,
 171: None,
 182: None,
 183: None,
 187: None,
 191: None,
 894: None,
 903: None,
 1370: None,
 1371: None,
 1372: None,
 1373: None,
 1374: None,
 1375: None,
 1417: None,
 1418: None,
 1470: None,
 1472: None,
 1475: None,
 1478: None,
 1523: None,
 1524: None,
 1545: None,
 1546: None,
 1548: None,
 1549: None,
 1563: None,
 1566: None,
 1567: None,
 1642: None,
 1643: None,
 1644: None,
 1645: None,
 1748: None,
 1792: None,
 1793: None,
 1794: None,
 1795: None,
 1796: None,
 1797: None,
 1798: None,
 1799: None,
 1800: None,
 1801: None,
 1802: None,
 1803: None,
 1804: None,
 1805: None,
 2039: None,
 2040: None,
 2041: None,
 2096: None,
 2097: None,
 2098: None,
 2099: None,
 2100: None,
 2101: None,


#### Note:有時候標點符號給的資訊很重要，例如問句跟肯定句

### 文本的記號化(Tokenize)

In [21]:
import nltk
# Download the 'punkt' tokenizer package.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sychen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [22]:
from nltk.tokenize import word_tokenize
# Split a sentence into word tokens.
string = "The science of today is the technology of tomorrow"
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

### 將文本切分為句子(Sentence Tokenize)

In [27]:
from nltk.tokenize import sent_tokenize
# Split a text containing two sentences into one string per sentence.
string = "The science of today is the technology of tomorrow. Tomorrow is today"
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today']

### StopWords 移除

In [30]:
from nltk.corpus import stopwords
# Download the stop-word lists and load the English one.
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Already-tokenized, lower-cased example sentence.
tokenized_words=["i", "am", "going", "to", "go", "to", "the", "store", "and", "park"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sychen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# Keep only the tokens that are not stop words.
[token for token in tokenized_words if token not in stop_words]

['going', 'go', 'store', 'park']

#### Notes: stopwords要求token都為小寫

### 詞幹提取(Stemmer)

In [36]:
from nltk.stem.porter import PorterStemmer
# Example tokens to stem.
tokenizer_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']
porter = PorterStemmer()
# Reduce every token to its stem; stems need not be dictionary words.
[porter.stem(word) for word in tokenizer_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

### 詞類標記

In [38]:
from nltk import pos_tag
from nltk import word_tokenize
import nltk
# Download the 'averaged_perceptron_tagger' tagger package.
nltk.download('averaged_perceptron_tagger')
text_data = "Chris loved outdoor running"
# Tokenize the sentence, then tag each token with its part of speech.
text_tagged = pos_tag(word_tokenize(text_data))
text_tagged

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sychen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

#### Notes: PennTreeBank 

標籤 | 詞類
----|-------
NNP |Proper noun, singular
NN|Noun, singular or mass
RB|Adverb
VBD|Verb, past tense
VBG|Verb, gerund or present participle
JJ|Adjective
PRP|Personal pronoun

In [39]:
# Keep only the words tagged as nouns (common/proper, singular/plural).
[token for token, pos in text_tagged if pos in {'NN', 'NNS', 'NNP', 'NNPS'}]

['Chris']

In [45]:
tweets = ["I am eating a burrito for breakfast", "Political science is an amazing field", "San Francisco is an awesome city"]
# For each tweet: tokenize, POS-tag, and keep only the tags (words dropped).
# A nested comprehension replaces the append loop and its 8-space indentation.
tagged_tweets = [
    [tag for word, tag in nltk.pos_tag(word_tokenize(tweet))]
    for tweet in tweets
]
tagged_tweets

[['PRP', 'VBP', 'VBG', 'DT', 'NN', 'IN', 'NN'],
 ['JJ', 'NN', 'VBZ', 'DT', 'JJ', 'NN'],
 ['NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN']]

In [47]:
from sklearn.preprocessing import MultiLabelBinarizer
# Turn each tweet's list of POS tags into indicator features (one column per tag).
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [48]:
# Tag names corresponding to the indicator columns.
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

#### Notes: 訓練tagger的方法，使用的語料庫是Brown Corpus，運用backoff n-gram tagger，n指的是預測該詞之詞類標籤時所要採計的前詞數目，首先我們用TrigramTagger考慮前兩個詞來進行預測，若沒有則退後用BigramTagger考慮前一個詞，再沒有才用UnigramTagger考慮詞本身

In [51]:
from nltk.corpus import brown
import nltk
nltk.download('brown')
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger
# Get the tagged sentences of the Brown corpus 'news' category.
sentence = brown.tagged_sents(categories='news')
# First 4000 sentences for training, the rest for evaluation.
train = sentence[:4000]
test = sentence[4000:]
# Backoff chain: trigram -> bigram -> unigram.
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)
# Score the tagger on the held-out sentences.
# NOTE(review): newer NLTK releases appear to rename evaluate() to accuracy() - confirm against the installed version.
trigram.evaluate(test)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\sychen\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


0.8174734002697437

### 將文本編碼成詞袋(CountVectorizer)

In [53]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Three short documents used for the bag-of-words examples.
text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])

In [57]:
# Build the bag-of-words feature matrix.
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
bag_of_words # sparse 3x8 matrix of int64 counts (not displayed: only the last expression is shown)
# Convert to a dense array so the counts can be inspected.
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [59]:
# Show the feature names (vocabulary terms in column order).
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the replacement when it exists and fall back to
# the old method on older installs. .tolist() keeps the result a plain list.
(
    count.get_feature_names_out().tolist()
    if hasattr(count, "get_feature_names_out")
    else count.get_feature_names()
)

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [61]:
import pandas as pd
# Show the bag-of-words matrix as a DataFrame with one column per term.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the replacement when it exists and fall back to
# the old method on older installs.
df = pd.DataFrame(
    bag_of_words.toarray(),
    columns=(
        count.get_feature_names_out()
        if hasattr(count, "get_feature_names_out")
        else count.get_feature_names()
    ),
)
df

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0,0,0,2,0,0,1,0
1,0,1,0,0,0,1,0,1
2,1,0,1,0,1,0,0,0


#### Notes: bag-of-words model 會為每一個唯一詞輸出一個特徵，而像是上面的Brazil值為2是因為出現了兩次。
#### 阿其實還是可以將每個特徵設定為兩個詞一個組合(2-gram)，甚至是三個詞(3-gram) 下面展示方法

In [64]:
# ngram_range=(1, 2) requests unigrams and bigrams with English stop words
# removed, but the fixed vocabulary=['brazil'] restricts the output to that
# single term, hence the one-column matrix.
count_2gram = CountVectorizer(ngram_range=(1, 2), stop_words='english', vocabulary=['brazil'])
bag = count_2gram.fit_transform(text_data)
bag.toarray()

array([[2],
       [0],
       [0]], dtype=int64)

In [67]:
count_2gram.vocabulary_ # term -> column-index mapping; here just the fixed vocabulary passed to the constructor

{'brazil': 0}

### 詞的權重(TfidfVectorizer)

In [77]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Same three documents as in the bag-of-words example.
text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])
# Build the tf-idf weighted feature matrix.
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
feature_matrix # not displayed: only the cell's last expression is shown
# Dense view of the tf-idf weights.
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [78]:
# Mapping from each term to its column index in the feature matrix.
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

#### Notes: 一詞在文件中次數出現越多，該詞對該文件的重要性通常會比較高，我們稱為詞頻(term frequency, tf)
#### 相對的，若一詞出現在許多文件中，該詞對個別文件的重要性就會降低，我們稱之為文件頻率(document frequency, df)
#### 結合上述，我們可以為每一個詞指定一個代表該詞在一份文件中的重要性。簡單來說就是 tf * idf (tf乘df的倒數)

$$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$$

$$\text{idf}(t) = \log\frac{1 + n_d}{1 + \text{df}(d, t)} + 1$$

其中 $n_d$ 是文件數量，$\text{df}(d, t)$ 則是詞 $t$ 的文件頻率(即內含該詞的文件數量)。scikit-learn 會使用 L2 norm 來對 tf-idf 向量做 normalize。
