## Part I: Data Pre-processing

In [1]:
import pandas as pd

In [None]:
# Download the Google Analogy dataset
!wget http://download.tensorflow.org/data/questions-words.txt

In [3]:
# Preprocess the dataset
# Read the downloaded text file and split it into a list of lines
# (splitlines() drops the line terminators).
file_name = "questions-words"  # base name; also reused for the CSV written at the end of Part I
with open(f"{file_name}.txt", "r") as f:
    data = f.read().splitlines()

In [None]:
# Check the data: print the first 10 raw lines of the file
for entry in data[:10]:
    print(entry)

In [None]:
# TODO1: Write your code here for processing data to pd.DataFrame
# Please note that the first five mentions of ": " indicate `semantic`,
# and the remaining nine belong to the `syntactic` category.
df = pd.DataFrame(columns=["Question", "Category", "SubCategory"])
dataArray = []

# Number of ": <name>" header lines seen so far; questions that follow one of
# the first five headers are semantic, all later ones are syntactic.
count = 0
for line in data:
    if line.startswith(": "):
        # Header line: remember it (including the leading ": ") as the
        # sub-category of the questions that follow.
        count += 1
        subcategory = line
        continue

    dataArray.append(
        {
            "Question": line,
            "Category": "Syntactic" if count > 5 else "Semantic",
            "SubCategory": subcategory,
        }
    )

print(dataArray[:10])
        

In [6]:
# Build the DataFrame in one go from the list of row dictionaries
df = pd.DataFrame(dataArray)

In [None]:
# Preview the first rows of the processed table
df.head()

In [8]:
# Write the table to "<file_name>.csv" without the index column
df.to_csv(f"{file_name}.csv", index=False)

## Part II: Use pre-trained word embeddings
- Once Part I is finished, you can run the Part II code blocks on their own.

In [9]:
import pandas as pd
import numpy as np
import gensim.downloader
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Load the question table from the CSV file and print its first rows
data = pd.read_csv("questions-words.csv")
print(data.head())

In [None]:
# Name of the pre-trained embedding to load through gensim's downloader API
MODEL_NAME = "glove-wiki-gigaword-100"
# You can try other models.
# https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models
# Print the names of all models listed by the downloader
print(list(gensim.downloader.info()['models'].keys()))

# Load the pre-trained model (using GloVe vectors here)
model = gensim.downloader.load(MODEL_NAME)
print("The Gensim model loaded successfully!")

In [12]:
def word_analogy(word_a, word_b, word_c, model):
    """Solve the analogy ``word_a : word_b :: word_c : ?``.

    Args:
        word_a: First word of the known pair (used as the negative term).
        word_b: Second word of the known pair (used as a positive term).
        word_c: First word of the pair to complete (used as a positive term).
        model: Either an object that exposes ``most_similar`` directly (such as
            the vectors loaded through ``gensim.downloader``) or a trained
            model that exposes its vectors through a ``wv`` attribute (such as
            ``Word2Vec``).

    Returns:
        The top-ranked word, or ``None`` when the lookup raises ``KeyError``
        (an input word is out of vocabulary) or yields no candidates.
    """
    # A trained Word2Vec model keeps its vectors under `.wv`; plain keyed
    # vectors are used as they are.
    keyed_vectors = getattr(model, "wv", model)
    try:
        result = keyed_vectors.most_similar(
            positive=[word_b, word_c], negative=[word_a], topn=1
        )
    except KeyError:
        return None
    # Guard against an empty candidate list instead of raising IndexError.
    return result[0][0] if result else None

In [None]:
# Do predictions and preserve the gold answers (word_D)
# Both lists keep exactly one entry per row of `data`, in row order, because
# the evaluation cell indexes them with boolean masks built from `data`.
preds = []
golds = []

for analogy in tqdm(data["Question"]):
    # TODO2: Write your code here to use pre-trained word embeddings for getting predictions of the analogy task.
    # You should also preserve the gold answers during iterations for evaluations later.
    # Hints
    # Unpack the analogy (e.g., "man", "woman", "king", "queen")
    # Perform vector arithmetic: word_b + word_c - word_a should be close to word_d
    # Source: https://github.com/piskvorky/gensim/blob/develop/gensim/models/keyedvectors.py#L776
    # Mikolov et al., 2013: big - biggest and small - smallest
    # Mikolov et al., 2013: X = vector(”biggest”) − vector(”big”) + vector(”small”).
    word_a, word_b, word_c, word_d = analogy.lower().split()
    pred = word_analogy(word_a, word_b, word_c, model)

    if pred is None:
        # Do not skip the row: dropping it would shift every later entry and
        # break the row alignment. A None prediction never equals the gold
        # answer, so the question is simply counted as incorrect.
        print(f"OOV words, counting analogy as incorrect: {analogy}")

    golds.append(word_d)
    preds.append(pred)
      

In [None]:
# Show the first five predictions next to the first five gold answers
print(f"Predictions: {preds[:5]}")
print(f"Gold answers: {golds[:5]}")

In [None]:
# Perform evaluations. You do not need to modify this block!!

def calculate_accuracy(gold: np.ndarray, pred: np.ndarray) -> float:
    """Return the mean of the element-wise comparison ``gold == pred``."""
    return np.mean(gold == pred)

golds_np, preds_np = np.array(golds), np.array(preds)
# Re-read the question table; the masks below are built from its rows.
data = pd.read_csv("questions-words.csv")

# Evaluation: categories
for category in data["Category"].unique():
    # Boolean mask with one entry per CSV row. It is applied directly to
    # golds_np / preds_np, so those arrays are expected to hold one entry per
    # row in the same order.
    mask = data["Category"] == category
    golds_cat, preds_cat = golds_np[mask], preds_np[mask]
    acc_cat = calculate_accuracy(golds_cat, preds_cat)
    print(f"Category: {category}, Accuracy: {acc_cat * 100}%")

# Evaluation: sub-categories
for sub_category in data["SubCategory"].unique():
    mask = data["SubCategory"] == sub_category
    golds_subcat, preds_subcat = golds_np[mask], preds_np[mask]
    acc_subcat = calculate_accuracy(golds_subcat, preds_subcat)
    # No separator after "Sub-Category": the values written in Part I keep the
    # leading ": " of their header line.
    print(f"Sub-Category{sub_category}, Accuracy: {acc_subcat * 100}%")

In [None]:
# Collect words from Google Analogy dataset
SUB_CATEGORY = ": family"

# TODO3: Plot t-SNE for the words in the SUB_CATEGORY `: family`
# Filter the data for the specific sub-category
family_data = data[data["SubCategory"] == SUB_CATEGORY]

# Collect unique words from the family category
family_words = set()
for question in family_data["Question"]:
    family_words.update(question.lower().split())

# Keep only in-vocabulary words. Sorting fixes the row order fed to t-SNE;
# iterating the set directly gives an order that can change between kernel
# sessions, which would make the plot non-reproducible despite random_state.
words = sorted(word for word in family_words if word in model)

# Stack the word vectors into a (n_words, dim) array, one row per word
word_vectors = np.array([model[word] for word in words])

# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
tsne_results = tsne.fit_transform(word_vectors)

# Create the plot
plt.figure(figsize=(12, 8))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], alpha=0.6)

# Annotate points with words
for i, word in enumerate(words):
    plt.annotate(word, (tsne_results[i, 0], tsne_results[i, 1]), fontsize=9)

# Only one title is set: this is the one that was effectively displayed before
# (an earlier plt.title call was always overridden by it).
plt.title("Word Relationships from Google Analogy Task")
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.tight_layout()

# Save BEFORE showing: with the notebook inline backend the figure is closed
# once it has been shown, so saving afterwards writes an empty image.
plt.savefig("word_relationships.png", bbox_inches="tight")
plt.show()

## Part III: Train your own word embeddings

### Get the latest English Wikipedia articles and do sampling.
- Usually, we would start from the Wikipedia dump (https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2). However, downloading it takes a very long time, and cleaning the Wikipedia corpus with [`gensim.corpora.wikicorpus.WikiCorpus`](https://radimrehurek.com/gensim/corpora/wikicorpus.html#gensim.corpora.wikicorpus.WikiCorpus) is also time-consuming. Therefore, we provide cleaned files for you.

In [None]:
!pip install gdown

In [None]:
# Download the split Wikipedia files
# Each file contains 562365 lines (articles).
!gdown --id 1J0os1846PQ129t720aI0wMm-5GepEwSl -O wiki_texts_part_0.txt.gz
!gdown --id 1tsI3RSKPN3b2-1IZ0N7bmjgVRf-THIkW -O wiki_texts_part_1.txt.gz
!gdown --id 1koiw6RFNzDe6pe2zMTfVhsEKmpmnYyu5 -O wiki_texts_part_2.txt.gz
!gdown --id 1YSGbDqhbg2xJsWD_hYQ5z9URl0dCTC2m -O wiki_texts_part_3.txt.gz
!gdown --id 1PA3C99C8CcLFjkenT0a9iU07XEQmXyG_ -O wiki_texts_part_4.txt.gz

In [None]:
# Download the split Wikipedia files
# Each file contains 562365 lines (articles), except the last file.
!gdown --id 1sSLea4hq6Z7oT6noOU_II1ahWjNOKcDX -O wiki_texts_part_5.txt.gz
!gdown --id 1i6kXTDtZkRiivJ0mj-5GkVbE4gMFlmSb -O wiki_texts_part_6.txt.gz
!gdown --id 1ain2DN1nxXfsmJ2Aj9TFZlLVJSPsu9Jb -O wiki_texts_part_7.txt.gz
!gdown --id 1UKhvielQDqQz5pMZ7J3SHv9m8_8gO-dE -O wiki_texts_part_8.txt.gz
!gdown --id 1q1zMA4hbMS7tID2GTQx-c94UPB8YQaaa -O wiki_texts_part_9.txt.gz
!gdown --id 1-kkGxwMxPsoGg5_2pdaOeE3Way6njLpH -O wiki_texts_part_10.txt.gz

In [10]:
# Extract the downloaded wiki_texts_parts files.
!gunzip -k wiki_texts_part_*.gz

In [11]:
# Combine the extracted wiki_texts_parts files.
!cat wiki_texts_part_*.txt > wiki_texts_combined.txt

In [None]:
# Check the first ten lines of the combined file
!head -n 10 wiki_texts_combined.txt

Please note that we used the default parameters of [`gensim.corpora.wikicorpus.WikiCorpus`](https://radimrehurek.com/gensim/corpora/wikicorpus.html#gensim.corpora.wikicorpus.WikiCorpus) for cleaning the Wiki raw file. Thus, words with one character were discarded.

In [13]:
# Now you need to do sampling because the corpus is too big.
# You can further perform analysis with a greater sampling ratio.

import random

wiki_txt_path = "wiki_texts_combined.txt"
output_path = "wiki_texts_sampled_20percent.txt"
sampling_ratio = 0.20
sampling_seed = 42  # fixed seed so that re-running the cell selects the same articles

# wiki_texts_combined.txt is a text file separated by linebreaks (\n).
# Each row in wiki_texts_combined.txt indicates a Wikipedia article.

# First pass: count the articles without loading the file into memory.
with open(wiki_txt_path, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)

lines_to_sample = int(total_lines * sampling_ratio)

# Draw the line indices to keep (without replacement). A dedicated Random
# instance makes the draw reproducible without touching the global RNG state.
rng = random.Random(sampling_seed)
lines_to_keep = set(rng.sample(range(total_lines), lines_to_sample))

# Second pass: copy only the selected lines to the output file.
with open(wiki_txt_path, "r", encoding="utf-8") as f, \
        open(output_path, "w", encoding="utf-8") as output_file:
    # TODO4: Sample `20%` Wikipedia articles
    for i, line in enumerate(f):
        if i in lines_to_keep:
            output_file.write(line)
        

In [None]:
# TODO5: Train your own word embeddings with the sampled articles
# https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
# Hint: You should perform some pre-processing before training.

In [1]:
import gensim
import time
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

In [2]:
def calculate_accuracy(gold: np.ndarray, pred: np.ndarray) -> float:
    """Return the fraction of positions at which `gold` and `pred` agree.

    Args:
        gold: Array of reference answers.
        pred: Array of predicted answers, compared with `gold` via ``==``.

    Returns:
        The mean of the comparison result.
    """
    is_correct = gold == pred
    return np.mean(is_correct)

def predict(model, data):
    """Predict the fourth word of every analogy question in `data`.

    Args:
        model: Either a trained model exposing its vectors through ``.wv``
            (such as ``Word2Vec``) or an object that offers ``most_similar``
            directly.
        data: Table with a "Question" column; each question consists of four
            whitespace-separated words ``a b c d``.

    Returns:
        A ``(golds, preds)`` tuple of lists with exactly one entry per row of
        `data`, in row order. ``golds`` holds the lower-cased fourth word of
        each question. ``preds`` holds the top-ranked prediction, or ``None``
        when a question word is out of vocabulary, so such questions count as
        incorrect instead of shifting the alignment with `data`.
    """
    # Word2Vec keeps its vectors (and most_similar) under `.wv`; plain keyed
    # vectors are used directly.
    keyed_vectors = getattr(model, "wv", model)

    preds = []
    golds = []

    for analogy in tqdm(data["Question"]):
        word_a, word_b, word_c, word_d = analogy.lower().split()
        try:
            # word_b + word_c - word_a should land close to word_d
            candidates = keyed_vectors.most_similar(
                positive=[word_b, word_c], negative=[word_a], topn=1
            )
            pred = candidates[0][0] if candidates else None
        except KeyError:
            print(f"OOV words, counting analogy as incorrect: {analogy}")
            pred = None

        golds.append(word_d)
        preds.append(pred)

    return golds, preds

def analysis_score(model, data):
    """Print analogy accuracy per category and per sub-category.

    Args:
        model: Model passed on to ``predict``.
        data: Table with "Question", "Category" and "SubCategory" columns. The
            same table is used for prediction and for building the per-group
            masks, so both are guaranteed to refer to the same rows.

    Raises:
        ValueError: If ``predict`` does not return one gold answer and one
            prediction per row of `data`.
    """
    golds, preds = predict(model, data)
    if len(golds) != len(data) or len(preds) != len(data):
        raise ValueError(
            "predict() must return one gold answer and one prediction per row "
            f"of data (got {len(golds)} golds / {len(preds)} preds for "
            f"{len(data)} rows)"
        )

    # Object dtype keeps None placeholders (out-of-vocabulary questions) as
    # they are and still supports element-wise comparison.
    golds_np = np.array(golds, dtype=object)
    preds_np = np.array(preds, dtype=object)

    # Evaluation: categories
    for category in data["Category"].unique():
        mask = (data["Category"] == category).to_numpy()
        acc_cat = calculate_accuracy(golds_np[mask], preds_np[mask])
        print(f"Category: {category}, Accuracy: {acc_cat * 100}%")

    # Evaluation: sub-categories
    for sub_category in data["SubCategory"].unique():
        mask = (data["SubCategory"] == sub_category).to_numpy()
        acc_subcat = calculate_accuracy(golds_np[mask], preds_np[mask])
        print(f"Sub-Category{sub_category}, Accuracy: {acc_subcat * 100}%")
    
def plot_TSNE(data, model, sub_category=": family", output_path="word_relationships.png"):
    """Plot a 2-D t-SNE projection of the words of one analogy sub-category.

    Args:
        data: Table with "Question" and "SubCategory" columns.
        model: Either a trained model exposing its vectors through ``.wv``
            (such as ``Word2Vec``) or an object that supports ``word in obj``
            and ``obj[word]`` directly.
        sub_category: Value of the "SubCategory" column to plot.
        output_path: File the figure is saved to.

    Raises:
        ValueError: If fewer than two words of the sub-category are in the
            model's vocabulary.
    """
    keyed_vectors = getattr(model, "wv", model)

    # Collect the unique, lower-cased words of the selected sub-category
    selected = data[data["SubCategory"] == sub_category]
    unique_words = set()
    for question in selected["Question"]:
        unique_words.update(question.lower().split())

    # Keep only in-vocabulary words. Sorting fixes the row order fed to t-SNE;
    # set iteration order can change between interpreter sessions.
    words = sorted(word for word in unique_words if word in keyed_vectors)
    if len(words) < 2:
        raise ValueError(
            f"Need at least two in-vocabulary words to plot, found {len(words)} "
            f"for sub-category {sub_category!r}"
        )
    word_vectors = np.array([keyed_vectors[word] for word in words])

    # t-SNE needs a perplexity below the number of samples
    perplexity = min(5, len(words) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_results = tsne.fit_transform(word_vectors)

    # Create the plot and label every point with its word
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(tsne_results[:, 0], tsne_results[:, 1], alpha=0.6)
    for (x, y), word in zip(tsne_results, words):
        ax.annotate(word, (x, y), fontsize=9)
    ax.set_title("Word Relationships from Google Analogy Task")
    fig.tight_layout()

    # Save BEFORE showing: with the notebook inline backend the figure is
    # closed once it has been shown, so saving afterwards writes an empty image.
    fig.savefig(output_path, bbox_inches="tight")
    plt.show()

In [None]:
# Read the sampled articles (one line of the file per list element)
with open('wiki_texts_sampled_20percent.txt', 'r', encoding='utf-8') as f:
    articles = list(f)

# Pre-process the articles: run every article through gensim's simple_preprocess
processed_articles = [
    simple_preprocess(article)
    for article in tqdm(articles, desc="Preprocessing articles")
]

Preprocessing articles:   7%|▋         | 81945/1124731 [01:17<10:43, 1620.29it/s]

In [None]:
# Train the Word2Vec model
w2v_settings = {
    "vector_size": 100,  # dimensionality of the word vectors
    "window": 5,         # context window size
    "min_count": 5,      # words occurring fewer times than this are ignored
    "workers": 4,        # number of worker threads
}
model = Word2Vec(sentences=processed_articles, **w2v_settings)

# Save the model under a timestamped file name
model_path = f"word2vec_model-{time.time()}.model"
model.save(model_path)

print("Model training completed and saved.")

In [17]:
# The Part III import cell does not import pandas, so import it here; this
# keeps the cell working when Part III is run in a fresh kernel.
import pandas as pd

# Load the analogy questions produced in Part I
data = pd.read_csv("questions-words.csv")

In [None]:
# Evaluation: print the per-category and per-sub-category accuracy of the trained model
analysis_score(model, data)

In [None]:
# Plot the t-SNE projection of the ": family" words using the trained model
plot_TSNE(data, model)

In [None]:
# Addition, Larger Version of the Word Relationship Plot

# Train the Word2Vec model
w2v_settings = {
    "vector_size": 300,  # dimensionality of the word vectors
    "window": 20,        # context window size
    "min_count": 3,      # words occurring fewer times than this are ignored
    "workers": 20,       # number of worker threads
}
model = Word2Vec(sentences=processed_articles, **w2v_settings)

# Save the model under a timestamped file name
model_name = f'word2vec_model_{time.time()}.model'
model.save(model_name)

print(f"Model training completed and saved to {model_name}.")

# Compare the predictions with the gold answers
# (the model is loaded back from the file that was just written)
model = Word2Vec.load(model_name)
analysis_score(model, data)
plot_TSNE(data, model)

In [None]:
# Addition, Larger Larger Version of the Word Relationship Plot

# Train the Word2Vec model
w2v_settings = {
    "vector_size": 500,  # dimensionality of the word vectors
    "window": 100,       # context window size
    "min_count": 3,      # words occurring fewer times than this are ignored
    "workers": 8,        # number of worker threads
}
model = Word2Vec(sentences=processed_articles, **w2v_settings)

# Save the model under a timestamped file name
model_name = f'word2vec_model_{time.time()}.model'
model.save(model_name)

print(f"Model training completed and saved to {model_name}.")

# Compare the predictions with the gold answers
# (the model is loaded back from the file that was just written)
model = Word2Vec.load(model_name)
analysis_score(model, data)
plot_TSNE(data, model)