In [None]:
! pip install nltk numpy pandas pathlib scikit-learn sys zipfile

# 載入數據

In [2]:
import pandas as pd
from pathlib import Path
import zipfile

# Resolve the current working directory; the archive and the CSV both live here.
current_dir = Path().resolve()

# Unpack the dataset archive next to the notebook.
zip_file_path = current_dir / "Sentiment-Analysis-Dataset.zip"
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=current_dir)

# Load the extracted CSV. Rows that pandas cannot parse are skipped
# (on_bad_lines='skip'), so the frame may hold fewer rows than the file.
file_path = current_dir / "Sentiment Analysis Dataset.csv"
df = pd.read_csv(
    file_path,
    encoding="UTF-8-SIG",
    on_bad_lines='skip',
)

# Preview the first 5 rows.
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


# 文本預處理

In [3]:
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: the punkt tokenizer data (both the
# "punkt" and "punkt_tab" packages) and the stop-word corpus.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# English stop words, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Normalise one raw tweet into a space-joined string of content tokens.

    Processing order: lowercase, strip URLs, strip @mentions and '#'
    characters, drop every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, and discard tokens found in
    the module-level ``stop_words`` set.

    Args:
        text: Raw text of a single sample. Any non-string value (for example
            the float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        str: The remaining tokens joined by single spaces; ``""`` when the
        input is not a string or nothing survives the filtering.
    """
    # Guard against missing values: calling .lower() on a float NaN would
    # raise AttributeError and abort the whole DataFrame.apply().
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # "http\S+" already covers "https...", and no ^/$ anchors are used, so
    # neither a separate https alternative nor re.MULTILINE is needed.
    text = re.sub(r"http\S+|www\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+|\#", "", text)  # remove @mentions and the '#' sign
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # keep letters and whitespace only
    tokens = word_tokenize(text)
    # NOTE(review): apostrophes are stripped before this filter, so
    # contractions such as "i'm" become "im" and are not matched by the
    # stop-word list. Left unchanged to keep the existing features stable.
    return " ".join(token for token in tokens if token not in stop_words)


# Run the cleaner over every raw tweet and store the result in a new column.
raw_tweets = df["SentimentText"]
df["CleanText"] = raw_tweets.apply(clean_text)

# Preview the raw text next to its cleaned version.
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText,CleanText
0,1,0,Sentiment140,is so sad for my APL frie...,sad apl friend
1,2,0,Sentiment140,I missed the New Moon trail...,missed new moon trailer
2,3,1,Sentiment140,omg its already 7:30 :O,omg already
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...,omgaga im sooo im gunna cry ive dentist since ...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...,think mi bf cheating tt


# 特徵提取

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Target labels.
y = df["Sentiment"]

# Bag-of-words features: the vocabulary is capped at 5000 terms and each
# cleaned text becomes a row of term counts.
# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before any
# train/test split happens.
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["CleanText"])

# 實作 Naïve Bayes

In [5]:
import numpy as np

class NaiveBayesClassifier:
    """Multinomial Naive Bayes classifier over count features.

    ``train`` estimates the class priors P(Y) and the Laplace-smoothed
    per-feature likelihoods P(x_j | Y); ``predict`` returns, for every row,
    the class with the highest log-posterior.

    Attributes:
        word_probs: dict mapping each class label to a 1-D float array of
            length ``n_features`` holding the smoothed P(x_j | class).
        class_probs: dict mapping each class label to its prior P(class).
    """

    def __init__(self):
        self.word_probs = {}  # class label -> P(feature | class), 1-D array
        self.class_probs = {}  # class label -> P(class)

    def train(self, X, y):
        """Fit the priors and the smoothed feature likelihoods.

        Args:
            X: 2-D count matrix of shape (n_samples, n_features); a dense
                NumPy array or a SciPy sparse matrix.
            y: 1-D sequence of class labels, one per row of ``X`` (list,
                NumPy array or pandas Series).

        Raises:
            ValueError: if ``X`` and ``y`` do not have the same number of
                samples.
        """
        # Work on a plain array so the boolean masks below are positional
        # regardless of any pandas index carried by ``y``.
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {y.shape[0]} labels"
            )

        # Prior P(Y): relative frequency of each class.
        classes, class_counts = np.unique(y, return_counts=True)
        self.class_probs = {
            c: class_counts[i] / len(y) for i, c in enumerate(classes)
        }

        # Likelihood P(X | Y) with Laplace (add-one) smoothing.
        vocab_size = X.shape[1]
        self.word_probs = {}  # drop anything left over from an earlier fit
        for c in classes:
            # Sparse matrices return an np.matrix of shape (1, n_features)
            # from .sum(axis=0); flatten so the stored arrays are always 1-D.
            word_counts = np.asarray(X[y == c].sum(axis=0)).ravel()
            total_count = word_counts.sum()
            self.word_probs[c] = (word_counts + 1) / (total_count + vocab_size)

    def predict(self, X):
        """Predict the most probable class for every row of ``X``.

        The score of class c for a row x is
        ``log P(c) + sum_j x_j * log P(x_j | c)``. All rows are scored with a
        single matrix product instead of a per-row Python loop, and the
        logarithms are computed once rather than once per row.

        Args:
            X: 2-D count matrix of shape (n_samples, n_features) using the
                same feature columns as the training data; dense or sparse.

        Returns:
            numpy.ndarray: predicted class label for each row. Ties go to the
            class that comes first in ``class_probs``.

        Raises:
            ValueError: if the classifier has not been trained.
        """
        if not self.class_probs:
            raise ValueError("classifier has not been trained; call train() first")

        classes = list(self.class_probs)
        log_priors = np.log([self.class_probs[c] for c in classes])
        # Shape (n_classes, n_features). asarray/ravel also accepts the
        # (1, n_features) matrices that earlier versions of train() stored.
        log_likelihoods = np.vstack(
            [np.log(np.asarray(self.word_probs[c]).ravel()) for c in classes]
        )

        # (n_samples, n_features) @ (n_features, n_classes) -> (n_samples, n_classes)
        scores = np.asarray(X @ log_likelihoods.T) + log_priors
        # argmax returns the first maximum, matching the dict-order tie-break.
        return np.asarray(classes)[np.argmax(scores, axis=1)]

# 訓練 & 測試

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Create the classifier and fit it on the training split.
nb = NaiveBayesClassifier()
nb.train(X_train, y_train)

# Predict labels for the held-out split.
y_pred = nb.predict(X_test)

# Report the evaluation metrics, one per line.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred))

Accuracy: 0.7617468477114433
Precision: 0.760221168043731
Recall: 0.7655216920906278
F1 Score: 0.7628622228947568
