# Ch 7. 機器學習應用
## 7-1. 自然語言處理

[7-1-3. 情感分析](#sec3)
 - [IMDB 影評數據](#sec3_1)
 - [短文情感分析](#sec3_2)
 - [簡易情感分析](#sec3_3)
***

<a id='sec3'></a>
## 7-1-3. 情感分析
<a id='sec3_1'></a>
### IMDB 影評數據

In [1]:
import numpy as np
import pandas as pd

size = 5000 # 只取部分樣本，節省運算時間

df = pd.read_csv('IMDb_dataset.csv')
df = df.sample(n=size, random_state=0)
df.reset_index(inplace=True, drop=True)
print(df['sentiment'].value_counts())
df.head(3)

negative    2553
positive    2447
Name: sentiment, dtype: int64


Unnamed: 0,review,sentiment
0,John Cassavetes is on the run from the law. He...,positive
1,It's not just that the movie is lame. It's mor...,negative
2,"Well, if it weren't for Ethel Waters and a 7-y...",negative


In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(df['sentiment'])
df['sentiment'] = le.transform(df['sentiment'])
df.head(3)

Unnamed: 0,review,sentiment
0,John Cassavetes is on the run from the law. He...,1
1,It's not just that the movie is lame. It's mor...,0
2,"Well, if it weren't for Ethel Waters and a 7-y...",0


In [3]:
# 評論包含有 HTML 標籤、標點符號以及其他非字母字元(e.g., (, [)
df.loc[0, 'review'][-150:-100]

'ch needed.<br /><br />All the three principle char'

#### 移除或取代某些字元

In [4]:
from bs4 import BeautifulSoup
import re

def remove_noise(text):
    # 移除 HTML 標籤
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    
    # 移除中括號內的文字
    text = re.sub('\[[^]]*\]', '', text)
    
    # 將句點取代為空格
    text = text.replace('.', ' ')
    
    # 移除特殊字元、標點符號
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern,'',text)
    
    return text

df['review'] = df['review'].apply(remove_noise)
df.loc[0, 'review'][-150:-100]

' the time was much needed All the three principle '

#### 提取詞幹

In [5]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

# 提取詞幹
def get_stemming(text):
    text= ' '.join([porter.stem(w) for w in text.split()])   
    return text

df['review'] = df['review'].apply(get_stemming)
df.loc[0, 'review'][:50]

'john cassavet is on the run from the law He is at '

#### 移除停止詞

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopword_lst = stopwords.words('english')

# 移除停止詞
def remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_lst]
    filtered_text = ' '.join(filtered_tokens)  
    
    return filtered_text

df['review'] = df['review'].apply(remove_stopwords)
df.loc[0, 'review'][:50]

'john cassavet run law bottom heap see negro sidney'

In [7]:
# 切割訓練集、測試集
train_size = 0.8

X_train = df.loc[:size*train_size-1, 'review'].values
y_train = df.loc[:size*train_size-1, 'sentiment'].values
X_test = df.loc[size*train_size:, 'review'].values
y_test = df.loc[size*train_size:, 'sentiment'].values

dct = {'總筆數': [X_train.shape[0], X_test.shape[0]], 
       '正評論筆數': [y_train.sum(), y_test.sum()], 
       '負評論筆數': [(y_train==0).sum(), (y_test==0).sum()]}
pd.DataFrame(dct, index=['訓練集', '測試集'])

Unnamed: 0,總筆數,正評論筆數,負評論筆數
訓練集,4000,1954,2046
測試集,1000,493,507


#### TF-IDF + 邏輯斯迴歸 + 網格搜尋

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

tf_idf = TfidfVectorizer()

pipe = Pipeline([('tfidf', tf_idf),
                 ('clf', LogisticRegression())])

param_grid = [{'tfidf__ngram_range': [(1, 1)],
               'tfidf__stop_words': ['english', None],
               'tfidf__use_idf':[True],
               'tfidf__norm':['l1', 'l2'],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': np.logspace(-2, 2, 10)},
              {'tfidf__ngram_range': [(1, 1)],
               'tfidf__stop_words': ['english', None],
               'tfidf__use_idf':[False],
               'tfidf__norm':['l1', 'l2'],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': np.logspace(-2, 2, 10)},
              ]

gs = GridSearchCV(pipe, param_grid, scoring='accuracy', 
                  cv=5, verbose=1, n_jobs=-1)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   49.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.5min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

#### 搜尋到的最佳參數組合

In [9]:
print('Best parameters:', gs.best_params_)

Best parameters: {'clf__C': 4.6415888336127775, 'clf__penalty': 'l2', 'tfidf__ngram_range': (1, 1), 'tfidf__norm': 'l2', 'tfidf__stop_words': None, 'tfidf__use_idf': True}


#### 分類準確率

In [10]:
print('Train accuracy:', gs.best_score_)
clf = gs.best_estimator_
print('Test accuracy:', clf.score(X_test, y_test))

Train accuracy: 0.8512500000000001
Test accuracy: 0.866


<a id='sec3_2'></a>
### 短文情感分析
數據取得網址 https://github.com/UDICatNCHU/UdicOpenData

In [11]:
size = 5000 # 只取部分樣本，節省運算時間

df = pd.read_csv('pos_neg.csv')
print('短文數量 =', df.shape[0])
df = df.sample(n=size, random_state=0)
df.reset_index(inplace=True, drop=True)
print(df['sentiment'].value_counts())
df.head(5)

短文數量 = 34880
0    2687
1    2313
Name: sentiment, dtype: int64


Unnamed: 0,review,sentiment
0,1.預裝的LINUX會影響安裝XP或VISTA,0
1,"預裝linux,重新安裝XP裝驅動比較麻煩",0
2,內容空乏無物，語言譁衆取寵，全書不成系統，就象個蹩腳大夫行醫。這是我從噹噹網購的900多元錢...,0
3,鍵盤舒服。,1
4,房間很大，還有海景陽臺，走出酒店就是沙灘，非常不錯。唯一遺憾的就是不能刷銀聯卡，不方便。,1


#### 需先安裝 jieba 套件

In [12]:
import jieba

for idx in df.index:
    segment = jieba.cut(df.loc[idx, 'review'])
    df.loc[idx, 'review'] = ' '.join(segment)
    
df.head()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\yclin\AppData\Local\Temp\jieba.cache
Loading model cost 0.683 seconds.
Prefix dict has been built successfully.


Unnamed: 0,review,sentiment
0,1 . 預裝 的 LINUX 會 影響 安裝 XP 或 VISTA,0
1,"預裝 linux , 重新 安裝 XP 裝驅動 比較 麻煩",0
2,內容 空乏 無物 ， 語 言 譁 衆 取 寵 ， 全書 不成 系統 ， 就 象個 蹩腳 大夫...,0
3,鍵盤 舒服 。,1
4,房間 很大 ， 還有 海景 陽 臺 ， 走出 酒店 就是 沙灘 ， 非常 不錯 。 唯一 遺...,1


#### 移除停止詞

In [13]:
stopword_lst = []

for line in open('stopwords.txt', encoding='utf8'):
    w = line.strip()
    stopword_lst.append(w)

len(stopword_lst)

1241

In [14]:
for idx in df.index:
    text = df.loc[idx, 'review']
    segment = [w for w in text.split(' ') if w not in stopword_lst]
    df.loc[idx, 'review'] = ' '.join(segment)

df.head()

Unnamed: 0,review,sentiment
0,預裝 LINUX 影響 安裝 XP VISTA,0
1,預裝 linux 重新 安裝 XP 裝驅動 比較 麻煩,0
2,內容 空乏 無物 語 言 衆 取 寵 全書 系統 象個 蹩腳 大夫 行醫 這是 噹噹 網購 ...,0
3,鍵盤 舒服,1
4,房間 很大 海景 陽 臺 走出 酒店 沙灘 不錯 唯一 遺憾 刷 銀聯卡 方便,1


In [15]:
# 切割訓練集、測試集
train_size = 0.8

X_train = df.loc[:size*train_size-1, 'review'].values
y_train = df.loc[:size*train_size-1, 'sentiment'].values
X_test = df.loc[size*train_size:, 'review'].values
y_test = df.loc[size*train_size:, 'sentiment'].values

dct = {'總筆數': [X_train.shape[0], X_test.shape[0]], 
       '正評論筆數': [y_train.sum(), y_test.sum()], 
       '負評論筆數': [(y_train==0).sum(), (y_test==0).sum()]}
pd.DataFrame(dct, index=['訓練集', '測試集'])

Unnamed: 0,總筆數,正評論筆數,負評論筆數
訓練集,4000,1825,2175
測試集,1000,488,512


In [16]:
param_grid = [{'tfidf__stop_words': [None],
               'tfidf__use_idf':[True],
               'tfidf__norm':['l1', 'l2'],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': np.logspace(-2, 2, 10)},
              {'tfidf__stop_words': [None],
               'tfidf__use_idf':[False],
               'tfidf__norm':['l1', 'l2'],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': np.logspace(-2, 2, 10)},
              ]

gs = GridSearchCV(pipe, param_grid, scoring='accuracy', 
                  cv=5, verbose=1, n_jobs=-1)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   13.0s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [17]:
print('Best parameters:', gs.best_params_)
print('Train accuracy:', gs.best_score_)
clf = gs.best_estimator_
print('Test accuracy:', clf.score(X_test, y_test))

Best parameters: {'clf__C': 4.6415888336127775, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'tfidf__stop_words': None, 'tfidf__use_idf': True}
Train accuracy: 0.7945
Test accuracy: 0.811


<a id='sec3_3'></a>
### 簡易情感分析
- TextBlob (需先安裝 pip install -U textblob)

In [18]:
from textblob import TextBlob

text = 'Pokémon is a great game. Gigantamax Pikachu is terrible.'

blob = TextBlob(text)
print(blob.sentences[0].sentiment)
print(blob.sentences[1].sentiment)

Sentiment(polarity=0.2, subjectivity=0.575)
Sentiment(polarity=-1.0, subjectivity=1.0)


In [19]:
blob.sentiment

Sentiment(polarity=-0.19999999999999998, subjectivity=0.7166666666666667)

 - SnowNLP (需先安裝 pip install snownlp)

In [20]:
from snownlp import SnowNLP

text = u"訓練家小智屢敗屢戰，總算獲得聯盟冠軍。"
s = SnowNLP(text)
for sen in s.sentences:
    print(sen, '-> 表達正面情感的機率：', SnowNLP(sen).sentiments)

訓練家小智屢敗屢戰 -> 表達正面情感的機率： 0.010584349812285954
總算獲得聯盟冠軍 -> 表達正面情感的機率： 0.2698977405428781
