# 自然語言處理與文件探勘 HW2

# 工作分配

| 人員             | 任務                      | 佔比 |
| ---------------- | ------------------------- | ---- |
| ChatGPT          | 撰寫程式碼                | 1%   |
| 111590012 林品緯 | 複製貼上、debug、撰寫文件 | 99%  |

# 安裝必要模組

**gensim 和 numpy 可能會有版本問題，我的 gensim 為 (4.3.3) numpy 為 (1.24.0)，如果發生衝突，請自行解決。**

在開始之前，請先安裝以下必要的 Python 模組。你可以使用以下指令安裝：

In [1]:
! pip install gensim pandas scipy nltk scikit-learn




[notice] A new release of pip is available: 23.3 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# 載入數據

In [2]:
import pandas as pd
from pathlib import Path
import zipfile
import os

# 設定當前工作目錄
current_dir = Path().resolve()

# 解壓縮文件
zip_file_path = current_dir / "wordsim353.zip"
extract_to = current_dir / "wordsim353" 
os.makedirs(extract_to, exist_ok=True)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

# 讀取 CSV 文件
wordsim_df = pd.read_csv(extract_to / "combined.csv")
wordsim_df.columns = ['Word1', 'Word2', 'Similarity']

zip_file_path = current_dir / "BATS_3.0.zip"
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(current_dir)

bats_dir = current_dir / "BATS_3.0"

wordsim_df.head()

Unnamed: 0,Word1,Word2,Similarity
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


# 使用 gensim 訓練 Word2Vec

In [3]:
from gensim.models import Word2Vec

from gensim.downloader import load

corpus = load("text8")

model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=5, workers=4)
model.save("word2vec_text8.model")


# WordSim-353 相似度評估

In [4]:
from scipy.stats import spearmanr

def get_similarity(w1, w2, model):
    try:
        return model.wv.similarity(w1.lower(), w2.lower())
    except KeyError:
        return None

pred_scores = []
true_scores = []

for _, row in wordsim_df.iterrows():
    sim = get_similarity(row['Word1'], row['Word2'], model)
    if sim is not None:
        pred_scores.append(sim)
        true_scores.append(row['Similarity'])

correlation, _ = spearmanr(pred_scores, true_scores)
print("Spearman correlation:", correlation)

Spearman correlation: 0.6249317685509049


# BATS 類比預測

In [5]:
import os

model = Word2Vec.load("word2vec_text8.model")

def analogy_accuracy_pairwise(file_path, model):
    correct = 0
    total = 0
    pairs = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            if len(tokens) != 2:
                continue
            pairs.append(tokens)

    for i in range(len(pairs)):
        for j in range(len(pairs)):
            if i == j:
                continue
            a, b = pairs[i]
            c, d = pairs[j]
            try:
                predicted = model.wv.most_similar(positive=[b, c], negative=[a], topn=1)[0][0]
                if predicted == d:
                    correct += 1
                total += 1
            except KeyError:
                continue

    return correct, total


total_correct = 0
total_questions = 0

for subfolder in os.listdir(bats_dir):
    subfolder_path = os.path.join(bats_dir, subfolder)
    if os.path.isdir(subfolder_path):
        for file in os.listdir(subfolder_path):
            if file.endswith(".txt"):
                file_path = os.path.join(subfolder_path, file)
                c, t = analogy_accuracy_pairwise(file_path, model)
                total_correct += c
                total_questions += t
                print(f"{subfolder}/{file}: {c}/{t} = {c/t:.2%}" if t > 0 else f"{subfolder}/{file}: skipped")

print(f"\nOverall Accuracy: {total_correct}/{total_questions} = {total_correct/total_questions:.2%}")


1_Inflectional_morphology/I01 [noun - plural_reg].txt: 1514/2450 = 61.80%
1_Inflectional_morphology/I02 [noun - plural_irreg].txt: 931/2450 = 38.00%
1_Inflectional_morphology/I03 [adj - comparative].txt: 52/931 = 5.59%
1_Inflectional_morphology/I04 [adj - superlative].txt: 4/931 = 0.43%
1_Inflectional_morphology/I05 [verb_inf - 3pSg].txt: 1471/2450 = 60.04%
1_Inflectional_morphology/I06 [verb_inf - Ving].txt: 1164/2450 = 47.51%
1_Inflectional_morphology/I07 [verb_inf - Ved].txt: 930/2450 = 37.96%
1_Inflectional_morphology/I08 [verb_Ving - 3pSg].txt: 1050/2401 = 43.73%
1_Inflectional_morphology/I09 [verb_Ving - Ved].txt: 866/2450 = 35.35%
1_Inflectional_morphology/I10 [verb_3pSg - Ved].txt: 837/2450 = 34.16%
2_Derivational_morphology/D01 [noun+less_reg].txt: 0/784 = 0.00%
2_Derivational_morphology/D02 [un+adj_reg].txt: 181/2401 = 7.54%
2_Derivational_morphology/D03 [adj+ly_reg].txt: 168/2450 = 6.86%
2_Derivational_morphology/D04 [over+adj_reg].txt: skipped
2_Derivational_morphology/D05 

# 和 TF-IDF + SVD 比較

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

new_corpus = [" ".join(doc) for doc in corpus]

# TF-IDF 建模
vectorizer = TfidfVectorizer(min_df=5)
tfidf_matrix = vectorizer.fit_transform(new_corpus)

# SVD 降維
svd = TruncatedSVD(n_components=100)
reduced = svd.fit_transform(tfidf_matrix)

# 使用 TF-IDF.T × SVD 結果取得詞向量
word_embeddings = tfidf_matrix.T @ reduced  # shape: (num_terms, 100)
index_to_word = {v: k for k, v in vectorizer.vocabulary_.items()}
word_vectors = {
    index_to_word[i]: word_embeddings[i]
    for i in range(word_embeddings.shape[0])
}

actual, predicted = [], []

for _, row in wordsim_df.iterrows():
    w1, w2, score = row["Word1"], row["Word2"], row["Similarity"]
    if w1 in word_vectors and w2 in word_vectors:
        sim = cosine_similarity(
            [word_vectors[w1]], [word_vectors[w2]]
        )[0][0]
        actual.append(score)
        predicted.append(sim)
    
corr, _ = spearmanr(actual, predicted)
print(f"[TF-IDF + SVD from text8] Spearman correlation: {corr:.4f}")

[TF-IDF + SVD from text8] Spearman correlation: 0.0236


# 將詞向量應用於其他任務 (HW1 的情感分類)

## 資料載入和預處理

In [7]:
import pandas as pd
import zipfile

# 解壓縮文件
zip_file_path = current_dir / "Sentiment-Analysis-Dataset.zip"
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(current_dir)

# 讀取 CSV 文件
file_path = current_dir / "Sentiment Analysis Dataset.csv"
df = pd.read_csv(file_path, encoding="UTF-8-SIG", on_bad_lines='skip')
df = df[["Sentiment", "SentimentText"]].dropna()

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("stopwords")

# 停用詞列表
stop_words = set(stopwords.words('english'))


def clean_text(text):
    text = text.lower()  # 轉小寫
    text = re.sub(r"http\S+|www\S+|https\S+", "",
                  text, flags=re.MULTILINE)  # 移除 URL
    text = re.sub(r"@\w+|\#", "", text)  # 移除 @標記 和 #標籤
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # 移除非字母字符
    tokens = word_tokenize(text)    
    tokens = [word for word in tokens if word not in stop_words]  # 去除停用詞
    return " ".join(tokens)


df["CleanText"] = df["SentimentText"].apply(clean_text)

# 過濾長度過短的句子並抽樣 50000 筆
df = df[df["CleanText"].str.len() > 2].sample(n=50000, random_state=42)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

w2v_model = Word2Vec.load("word2vec_text8.model")
# 將句子轉為平均詞向量
def average_vector(tokens):
    vecs = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    if vecs:
        return np.mean(vecs, axis=0)
    return np.zeros(w2v_model.vector_size)

X = np.array([average_vector(tokens) for tokens in df["CleanText"]])
y = df["Sentiment"].values

# 分割資料並訓練分類器
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# 顯示分類結果
print(classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))

              precision    recall  f1-score   support

    Negative       0.56      0.58      0.57      5069
    Positive       0.55      0.54      0.55      4931

    accuracy                           0.56     10000
   macro avg       0.56      0.56      0.56     10000
weighted avg       0.56      0.56      0.56     10000

