<a href="https://colab.research.google.com/github/yeee457984/SentimentAnalysis_NLP-and-RNN/blob/main/SentimentAnalysis_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 環境設定

In [None]:
import os
# Fetch the HappyML helper package once; later cells import from it.
if not os.path.isdir("HappyML"):
  # os.system returns the command's exit status. Fail loudly here instead of
  # letting a broken clone surface later as an unrelated import error.
  if os.system("git clone https://github.com/cnchi/HappyML.git") != 0:
    raise RuntimeError("git clone of HappyML failed")

# Review dataset (Excel workbook); downloaded only when not already present.
Dataset_File = "Comments_Hotels.xlsx"
if not os.path.isfile(Dataset_File):
  if os.system("wget https://raw.githubusercontent.com/cnchi/datasets/master/" + Dataset_File) != 0:
    raise RuntimeError("Download failed: " + Dataset_File)

In [None]:
import torch
import torch.nn as nn
import HappyML.preprocessor as pp
from HappyML.pytorch import Sequential

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"device: {device}")

device: cuda


In [None]:
!pip install jieba

# Traditional-Chinese dictionary used by the word segmenter.
Dictionary_file = 'dict.txt.big'
if not os.path.isfile(Dictionary_file):
    # A non-zero exit status means the download did not succeed; stop here
    # rather than failing later when the file is opened.
    if os.system('wget https://raw.githubusercontent.com/cnchi/datasets/master/' + Dictionary_file) != 0:
        raise RuntimeError('Download failed: ' + Dictionary_file)

# Traditional-Chinese stop-word list.
StopWords_File = "stopWords_big5.txt"
if not os.path.isfile(StopWords_File):
    if os.system('wget https://raw.githubusercontent.com/cnchi/datasets/master/' + StopWords_File) != 0:
        raise RuntimeError('Download failed: ' + StopWords_File)

# Load the stop-word list into a set: one entry per line, with surrounding
# whitespace (including the trailing newline) stripped.
# NOTE(review): the file name suggests Big5, but it is read as UTF-8 — confirm.
StopWords_Set = set()
with open(StopWords_File, "rt", encoding="utf-8") as f:
  StopWords_Set.update(entry.strip() for entry in f)

# Characters to filter out: ASCII and full-width punctuation, spaces and digits.
# The backslash is written as "\\" so the literal holds no invalid escape sequence.
Punctuation_Set = set("$!&#%\\()+-*/_,. 　?:;'\"<=>^`|~[]{}’0123456789?_“”、。《》！，：；？「」（）")



# 資料前處理

In [None]:
import pandas as pd
# Read the downloaded Excel workbook into a DataFrame and preview its first rows.
dataset = pd.read_excel(Dataset_File)
dataset.head(n=5)

Unnamed: 0,label,review
0,1,"距離川沙公路較近,但是公車指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較..."
1,1,商務大床房，房間很大，床有2M寬，整體感覺經濟實惠不錯!
2,1,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。
3,1,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...
4,1,"CBD中心,周圍沒什麼店鋪,說5星有點勉強.不知道為什麼洗手間沒有吹風機"


**切分自變數與應變數**

In [None]:
# Split the dataset into features X (the review text) and target Y (the label).
# NOTE(review): the lists are presumably positional column indices
# (0 = label, 1 = review) — confirm against HappyML's decomposition().
feature_columns = [1]
label_columns = [0]
X, Y = pp.decomposition(dataset, x_columns=feature_columns, y_columns=label_columns)

**斷詞、去除標點符號、停止詞**

In [None]:
import jieba
# Segment with the Traditional-Chinese dictionary downloaded earlier.
jieba.set_dictionary(Dictionary_file)

def jieba_tokenizer(text):
  """Segment `text` with jieba and return the kept tokens joined by spaces.

  A token is dropped when it is a stop word, or when every character in it
  belongs to Punctuation_Set. Punctuation_Set holds single characters, so
  testing the whole token for membership can only ever match one-character
  tokens; checking character by character also removes multi-character
  runs such as "..." or "2008".
  """
  kept_tokens = [
      token for token in jieba.cut(text)
      if token not in StopWords_Set
      and not all(char in Punctuation_Set for char in token)
  ]
  return " ".join(kept_tokens)

# Assigning into X['review'] in place triggers pandas' SettingWithCopyWarning
# because X is a slice of another DataFrame. assign() returns a new frame
# with the tokenized column instead of writing into the slice.
X = X.assign(review=X['review'].apply(jieba_tokenizer))
X.head()

Building prefix dict from /content/dict.txt.big ...
DEBUG:jieba:Building prefix dict from /content/dict.txt.big ...
Loading model from cache /tmp/jieba.u501edca284da514cb68b53a20324f4e3.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.u501edca284da514cb68b53a20324f4e3.cache
Loading model cost 1.171 seconds.
DEBUG:jieba:Loading model cost 1.171 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['review'] = X['review'].apply(jieba_tokenizer)


Unnamed: 0,review
0,距離 川沙 公路 較近 公車 指示 蔡陸線 會 非常 麻煩 建議 路線 房間 簡單
1,商務 大床 房 房間 很大 床有 2M 寬 整體 感覺 經濟 實惠 不錯
2,早餐 太差 不加 食品 酒店 應該 重視 一下 問題 房間
3,賓館 街道 不大好 找 還好 北京 熱心 同胞 很多 賓館 設施 介紹 差不多 房間 很小 ...
4,CBD 中心 周圍 沒什麼 店鋪 說 星 有點 勉強 知道 洗手間 沒有 吹風機


**文字數位化**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Upper bound on the vocabulary size used when encoding texts (None = no limit).
MAX_NUM_WORDS = 10000

# Build the text-to-integer tokenizer.
tk = Tokenizer(
        num_words=MAX_NUM_WORDS,
        # The backslash is written as "\\" so the literal holds no invalid
        # escape sequence; the resulting filter characters are unchanged.
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True, # lowercase any Latin text
        split=' ', # tokens are separated by a space
        char_level=False, # word-level tokens, not character-level
        oov_token='NiD' # placeholder token for out-of-vocabulary words
    )

In [None]:
# Learn the vocabulary (word -> integer id) from the tokenized reviews.
# NOTE(review): the vocabulary is fitted on every review, before any
# train/test split — confirm this is acceptable for the evaluation.
tk.fit_on_texts(X['review'])

# Show only a sample of each lookup table; printing the full dictionaries
# floods the cell output with thousands of entries.
PREVIEW_SIZE = 20
print(f"vocabulary size: {len(tk.word_index)}")
print(list(tk.word_index.items())[:PREVIEW_SIZE])
print(list(tk.index_word.items())[:PREVIEW_SIZE])

# Convert each review into its sequence of word ids.
seq = tk.texts_to_sequences(X['review'])
print(seq[:10])

# Map the ids back to words as a sanity check.
text = tk.sequences_to_texts(seq)
print(text[:10])

{'NiD': 1, '酒店': 2, '房間': 3, '不錯': 4, '服務': 5, '沒有': 6, '住': 7, '入住': 8, '比較': 9, '早餐': 10, '感覺': 11, '說': 12, '非常': 13, '前台': 14, '一個': 15, '方便': 16, '攜程': 17, '設施': 18, '服務員': 19, '賓館': 20, '月': 21, '價格': 22, '日': 23, '環境': 24, '客人': 25, '元': 26, '晚上': 27, '年': 28, '房': 29, '沒': 30, '乾淨': 31, '下次': 32, '會': 33, '有點': 34, '洗手間': 35, '評論': 36, '時': 37, '2008': 38, '免費': 39, '差': 40, '吃': 41, '裝修': 42, '這家': 43, '地方': 44, '位置': 45, '不好': 46, '選擇': 47, '餐廳': 48, '問題': 49, '知道': 50, '大堂': 51, '交通': 52, '很多': 53, '總體': 54, '態度': 55, '不能': 56, '太': 57, '已經': 58, '10': 59, '一點': 60, '性價比': 61, '居然': 62, '補充': 63, '大床': 64, '特別': 65, '發現': 66, '應該': 67, '建議': 68, '算': 69, '退房': 70, '高': 71, '舒服': 72, '實在': 73, '一下': 74, '床': 75, '朋友': 76, '很大': 77, '樓': 78, '空調': 79, '可能': 80, '滿意': 81, '以後': 82, '推薦': 83, '希望': 84, '人員': 85, '隔音': 86, '分鐘': 87, '裡': 88, '時間': 89, '回饋': 90, '提供': 91, '預定': 92, '看到': 93, '要求': 94, '訂': 95, '不會': 96, '標準': 97, '電話': 98, '覺得': 99, '挺': 100, '走': 101, '豪華': 102, 

**序列對齊**

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Target length of every sequence. A review averages about 42.7 tokens;
# a somewhat smaller power of two was chosen, hence 32.
MAX_SEQUENCE_LENGTH = 32

# Left-pad short sequences with 0 and cut long ones at the end so that
# every row holds exactly MAX_SEQUENCE_LENGTH ids.
pad_options = dict(
    maxlen=MAX_SEQUENCE_LENGTH,
    dtype="int32",
    padding="pre",
    truncating="post",
    value=0,
)
padded_seq = pad_sequences(seq, **pad_options)

# Print the first few rows as a check.
print(padded_seq[:10])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0  283    1 3691 2405  677 2520    1   33   13  678
    68  598    3  456]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0  128   64   29    3   77 9890 6436 1730  193
    11  642  441    4]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   10  156 7757 1331    2   67
  1300   74   49    3]
 [  20 2521 3436  104  198  340 1731    1   53   20   18  375  379    3
   244  186  100 1098 9891 2522    1 7758   24    4 1627  132  326  813
     1 1569  704  859]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 6437  428  194  199 6438   12  106   34  968
    50   35    6  898]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    2 2120   22   69   84 2653   42
   215   25    1  333

In [None]:
# Rebuild the feature frame X with one column per sequence position.
column_names = ["word_" + str(position) for position in range(MAX_SEQUENCE_LENGTH)]
X = pd.DataFrame(data=padded_seq, columns=column_names)

# Preview the first rows as a check.
X.head()

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,...,word_22,word_23,word_24,word_25,word_26,word_27,word_28,word_29,word_30,word_31
0,0,0,0,0,0,0,0,0,0,0,...,677,2520,1,33,13,678,68,598,3,456
1,0,0,0,0,0,0,0,0,0,0,...,3,77,9890,6436,1730,193,11,642,441,4
2,0,0,0,0,0,0,0,0,0,0,...,10,156,7757,1331,2,67,1300,74,49,3
3,20,2521,3436,104,198,340,1731,1,53,20,...,24,4,1627,132,326,813,1,1569,704,859
4,0,0,0,0,0,0,0,0,0,0,...,199,6438,12,106,34,968,50,35,6,898


**切分訓練集、測試集 & 轉換成tensor**

In [None]:
# Split the feature and target DataFrames into training and test portions.
X_train, X_test, Y_train, Y_test = pp.split_train_test(x_ary=X, y_ary=Y)

# DataFrame -> ndarray -> tensor on `device`.
# Word ids become integer (long) tensors; labels become float tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(frame.values, dtype=torch.long).to(device)
    for frame in (X_train, X_test)
)
Y_train_tensor, Y_test_tensor = (
    torch.tensor(frame.values, dtype=torch.float).to(device)
    for frame in (Y_train, Y_test)
)

# 模型建造

In [None]:
class RNN(nn.Module):
    """Embedding -> single-layer LSTM -> linear -> sigmoid classifier.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding dimension, which is also the LSTM input size.
        output_size: Number of output units (1 for a binary decision).
        padding_idx: Token id reserved for padding. Its embedding row is
            initialised to zeros and receives no gradient, so padded
            positions are not learned as if they were a word. Defaults to 0.
            Pass None to treat every id as an ordinary token.
    """

    def __init__(self, input_size, hidden_size, output_size, padding_idx=0):
        super().__init__()  # initialise the nn.Module base class first
        # Kept on the instance for later reference.
        self.hidden_size = hidden_size
        # Width of the LSTM state: midway between embedding and output sizes.
        lstm_size = (hidden_size + output_size) // 2

        # Embedding layer: token id -> dense vector of length hidden_size.
        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=padding_idx)
        # Recurrent layer; batch_first=True means inputs are (batch, seq, feature).
        self.rnn = nn.LSTM(hidden_size, lstm_size, batch_first=True)
        # Fully connected layer mapping the LSTM output to output_size units.
        self.fc = nn.Linear(lstm_size, output_size)
        # Sigmoid squashes each output into (0, 1).
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (batch, output_size) for id tensor `x`."""
        embedded = self.embedding(x)
        # The LSTM also returns its final (hidden, cell) state; only the
        # per-step outputs are needed here.
        output, _ = self.rnn(embedded)
        # Classify from the output at the last time step.
        last_step = output[:, -1, :]
        return self.sigmoid(self.fc(last_step))

In [None]:
# Model: MAX_NUM_WORDS-entry vocabulary, 128-dimensional embeddings, one output unit.
model = RNN(input_size=MAX_NUM_WORDS, hidden_size=128, output_size=1)
model = model.to(device)
# Binary cross-entropy loss.
criterion = nn.BCELoss()
# Adam optimizer over all model parameters, learning rate 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# 模型訓練

In [None]:
num_epochs = 500
log_every = 1  # print metrics every `log_every` epochs

# Full-batch training: each epoch passes the whole training tensor through the model once.
for epoch in range(num_epochs):
    model.train()    # switch the model to training mode
    optimizer.zero_grad()    # clear gradients left over from the previous step
    outputs = model(X_train_tensor)    # forward pass: predicted values for Y
    loss = criterion(outputs, Y_train_tensor)  # compare predictions with the true labels
    loss.backward()     # back-propagate to compute this step's gradients
    optimizer.step()     # update the weights from those gradients

    # Training accuracy for this epoch, based on the pre-update predictions.
    predicted = (outputs > 0.5).float()
    correct = (predicted == Y_train_tensor).float().sum()
    accuracy = correct / Y_train_tensor.shape[0]

    # The loss is a cross-entropy value, not a ratio, so it is printed as a
    # plain decimal; only the accuracy is formatted as a percentage.
    if (epoch + 1) % log_every == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy.item():.4%}')

# 模型評估

In [None]:
# Switch the model to evaluation mode.
model.eval()

# Gradient tracking is not needed for evaluation.
with torch.no_grad():
    # Forward pass on the test inputs, thresholded at 0.5 into 0.0 / 1.0.
    outputs = model(X_test_tensor).gt(0.5).float()
    # Accuracy = fraction of thresholded predictions equal to the labels.
    accuracy = outputs.eq(Y_test_tensor).float().mean()

print(f'Accuracy: {accuracy.item():.4%}')

# 模型預測

In [None]:
def predict_sentiment(sentence):
    """Classify one raw review sentence with the trained model.

    The sentence is segmented with jieba_tokenizer, encoded with the fitted
    tokenizer `tk`, then left-padded / end-truncated to MAX_SEQUENCE_LENGTH
    ids before being passed to the model.

    Returns:
        1.0 when the model's output exceeds 0.5, otherwise 0.0.
    """
    model.eval()  # evaluation mode
    with torch.no_grad():  # inference only: no gradient tracking
        tokenized = jieba_tokenizer(sentence)
        id_sequences = tk.texts_to_sequences([tokenized])
        model_input = torch.tensor(
            pad_sequences(
                sequences=id_sequences,
                maxlen=MAX_SEQUENCE_LENGTH,
                dtype="int32",
                padding="pre",
                truncating="post",
                value=0
            ),
            dtype=torch.long,
        ).to(device)
        score = model(model_input)
        # Threshold at 0.5, then unwrap the single-element tensor to a float.
        return (score > 0.5).float().item()

In [None]:
# Two sample reviews used to demonstrate prediction.
sentence1 = "這家飯店的服務很好，房間也很乾淨，下次還會再來。"
sentence2 = "很糟糕！房間髒，早餐也難吃。下次不會再光臨。"

# Predict each sentence in order.
sentiment1, sentiment2 = (predict_sentiment(s) for s in (sentence1, sentence2))

# Print each sentence with its predicted value.
for sentence, sentiment in ((sentence1, sentiment1), (sentence2, sentiment2)):
    print(f'Sentence: {sentence}, Sentiment: {sentiment}')