### Data Preprocessing

In [72]:
import pandas as pd

# 讀取資料
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# 將標題和簡短描述結合成一個新的欄位
data['text'] = data['headline'] + ' ' + data['short_description']
test_data['text'] = test_data['headline'] + ' ' + test_data['short_description']
# 檢視前五筆資料
# print(data.head())

In [73]:
import numpy as np
#有些資料沒有short_description
print(data['text'].apply(type).value_counts())
# 找到值為浮點數的索引
nan_indices = np.where(data['text'].apply(type) == float)[0]
print(nan_indices)

#將 data['text'] 中的所有非字串類型都轉換為空字串
data['text'] = data['text'].fillna('')    #空值的位置填充為空字符串
test_data['text'] = test_data['text'].fillna('')

<class 'str'>      1982
<class 'float'>      18
Name: text, dtype: int64
[ 867 1616 1617 1618 1619 1631 1656 1657 1658 1659 1662 1670 1671 1677
 1678 1681 1682 1683]


In [74]:
# 使用NLTK將文本轉換為整數列表
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# nltk.download('punkt') #NLTK中提供的句子分割器和單詞分割器
# nltk.download('stopwords') #下載停用詞

# 獲取英文停用詞集合
stop_words = set(stopwords.words('english'))

# 將文本轉換為小寫並去除停用詞
def preprocess_text(text):
    # 分詞
    tokens = word_tokenize(text)
    # 轉換為小寫並去除停用詞
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    return tokens

# 對數據進行處理
data['text_tokenized'] = data['text'].apply(preprocess_text)
test_data['text_tokenized'] = test_data['text'].apply(preprocess_text)

print(data.iloc[0]['text_tokenized'])

['jets', 'chairman', 'christopher', 'johnson', 'wo', "n't", 'fine', 'players', 'anthem', 'protests', '“', 'never', 'want', 'put', 'restrictions', 'speech', 'players', "''"]


In [75]:
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, random_split

# 定義一個函數，將文本列表轉換為整數列表
def yield_tokens(data_list):
    for tokens in data_list:
        yield tokens

# 創建一個詞彙表(build_vocab_from_iterator預期接收一個可迭代的tokens序列)
vocab = build_vocab_from_iterator(yield_tokens(data['text_tokenized']), specials=["<unk>", "<pad>"]) 
# vocab.set_default_index(vocab["<unk>"])

#將單詞轉換為整數
def text_transform(tokenized_text):
    return torch.tensor([vocab[token] for token in tokenized_text])

# Pad sequences to maximum length and stack
padded_text = pad_sequence(list(map(text_transform, data['text_tokenized'].values)), batch_first=True)
data_set = torch.utils.data.TensorDataset(padded_text, torch.tensor(data['category'].values))
    
# 將資料集分割為訓練集和驗證集
train_size = int(0.8 * len(data_set))
val_size = len(data_set) - train_size
train_dataset, val_dataset = random_split(data_set, [train_size, val_size])

# 創建訓練集和驗證集的 DataLoader
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [76]:
# 將測試數據文本轉換為整數序列
test_data['text_transformed'] = test_data['text_tokenized'].apply(text_transform)

# 將整數序列的文本進行填充
padded_test_text = pad_sequence(list(test_data['text_transformed'].values), batch_first=True)

# 創建測試集的 DataLoader
test_labels = torch.zeros(padded_test_text.shape[0], dtype=torch.int64)
test_dataset = torch.utils.data.TensorDataset(padded_test_text, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=None)

1. How do you choose the tokenizer for this task? Could we use the white space to tokenize the text? What about using the complicated tokenizer instead? (5%)

NLTK（Natural Language Toolkit）是一個廣泛使用的Python自然語言處理庫，具有豐富的功能和工具。選擇NLTK作為分詞器的原因是因為它是一個成熟的自然語言處理庫，它提供了各種文本處理任務所需的功能，包括分詞、詞性標註、句法分析等。
是的，我們可以使用空格來對文本進行分詞。空格分詞是一種簡單且直接的分詞方法，特別適用於某些語言和特定的文本數據集。
使用複雜的分詞器相對於簡單的空格分詞器來說，可以提供更高級的分詞功能和更準確的分詞結果。使用複雜的分詞器可能有以下優點：處理複雜文本結構、支持多語言處理、提供上下文信息等。

2. Why we need the special tokens like ⟨pad⟩, ⟨unk⟩? (2%)


Padding Token (⟨pad⟩): 用於將序列填充到固定長度，確保輸入大小一致，以便在神經網絡中進行高效的處理。
Unknown Token (⟨unk⟩)： 當遇到未知詞時，它會被替換為 ⟨unk⟩ 標記，從而使模型能夠處理未見過的詞並更好地泛化到新的或未見過的數據。

3. Briefly explain how your procedure is run to handle the text data. (3%) (e.g. Which
tokenizer do you choose? Why? What is your min count setting? etc.)

先將headline以及short_description的資料合併，接著利用NLTK作為tokenizer將句子切成token的型態，接著在建立詞彙表，將token轉換為整數。


### Transformer

In [77]:
import torch
import torch.nn as nn
import math

# Transformer 模型
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, num_heads, hidden_size, num_layers, num_classes, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.positional_encoding = PositionalEncoding(embedding_size)
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_size,
            dropout=dropout
        )
        self.fc = nn.Linear(embedding_size, num_classes)
        self.softmax = nn.Softmax(dim=1)  # 加上 softmax 函數

    def forward(self, src):
        # src shape: (batch_size, seq_len)
        src = self.embedding(src) # shape: (batch_size, seq_len, embedding_size)
        src = self.positional_encoding(src) # 添加 positional encoding
        src = src.transpose(0, 1) # shape: (seq_len, batch_size, embedding_size)
        output = self.transformer(src, src) # shape: (seq_len, batch_size, embedding_size)
        output = output.transpose(0, 1) # shape: (batch_size, seq_len, embedding_size)
        output = self.fc(output.mean(dim=1)) # shape: (batch_size, num_classes)
        output = self.softmax(output)  # 加上 softmax 函數
        return output

# 模型參數
vocab_size = len(vocab)
embedding_size = 200
num_heads = 2
hidden_size = 200
num_layers = 2
num_classes = len(set(data['category'].values))
dropout = 0.2

# 模型、損失函數、優化器
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Positional Encoding 模塊
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(0.1)
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        pe = self.pe[:, :seq_len, :]
        x = x + pe
        x = self.dropout(x)
        return x


model = TransformerModel(vocab_size, embedding_size, num_heads, hidden_size, num_layers, num_classes, dropout).to(device)
criterion = nn.CrossEntropyLoss()

lr = 0.01 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) #每經過1epoch，lr*0.95
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

In [78]:
from torchtext.vocab import GloVe

# 下載並加載GloVe預訓練的詞向量模型
glove = GloVe(name='6B', dim=embedding_size)

# 創建一個與詞彙表大小和嵌入維度相匹配的嵌入矩陣
embedding_matrix = torch.zeros(vocab_size, embedding_size)

# 將GloVe詞向量加載到嵌入矩陣中
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove.vectors[glove.stoi[token]]

# 將嵌入矩陣加載到模型的嵌入層中
model.embedding.weight.data.copy_(embedding_matrix)

# 將嵌入層的requires_grad屬性設置為False，以固定詞向量
model.embedding.weight.requires_grad = False

### 訓練

In [79]:
from tqdm import tqdm

num_epochs = 20
best_val_loss = float('inf')  # 初始設定為無窮大
best_model_weights = None

for epoch in range(num_epochs):
    # 訓練
    running_loss = 0.0
    correct_train = 0
    total_train = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    for inputs, labels in progress_bar:
        inputs, labels = inputs.to(device), (labels-1).to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

        progress_bar.set_postfix({"Train Loss": running_loss / total_train, "Train Acc": correct_train / total_train})

    epoch_loss = running_loss / len(train_dataset)
    train_acc = correct_train / total_train
    progress_bar.close()
    
    # 驗證
    model.eval()
    running_val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs, labels = inputs.to(device), (labels-1).to(device)
            outputs = model(inputs)
            val_loss = criterion(outputs, labels)
            running_val_loss += val_loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_loss = running_val_loss / len(val_dataset)
    val_acc = correct_val / total_val

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    
          
    # 保存驗證損失最低的模型參數
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_weights = model.state_dict()

# 儲存最佳模型参数
torch.save(best_model_weights, "model.pth")
# torch.save(model.state_dict(), "model.pth")

                                                                                             

Epoch [1/20], Train Loss: 1.2018, Train Acc: 0.5831, Val Loss: 0.9577, Val Acc: 0.7850


                                                                                              

Epoch [2/20], Train Loss: 0.9216, Train Acc: 0.8250, Val Loss: 0.9172, Val Acc: 0.8275


                                                                                              

Epoch [3/20], Train Loss: 0.8895, Train Acc: 0.8569, Val Loss: 0.9281, Val Acc: 0.8125


                                                                                              

Epoch [4/20], Train Loss: 0.8535, Train Acc: 0.8969, Val Loss: 0.9005, Val Acc: 0.8400


                                                                                              

Epoch [5/20], Train Loss: 0.8427, Train Acc: 0.9044, Val Loss: 0.9205, Val Acc: 0.8300


                                                                                              

Epoch [6/20], Train Loss: 0.8351, Train Acc: 0.9119, Val Loss: 0.9079, Val Acc: 0.8300


                                                                                              

Epoch [7/20], Train Loss: 0.8221, Train Acc: 0.9231, Val Loss: 0.9076, Val Acc: 0.8325


                                                                                              

Epoch [8/20], Train Loss: 0.8130, Train Acc: 0.9325, Val Loss: 0.8955, Val Acc: 0.8525


                                                                                              

Epoch [9/20], Train Loss: 0.8069, Train Acc: 0.9387, Val Loss: 0.9025, Val Acc: 0.8375


                                                                                               

Epoch [10/20], Train Loss: 0.8031, Train Acc: 0.9406, Val Loss: 0.9035, Val Acc: 0.8425


                                                                                               

Epoch [11/20], Train Loss: 0.8039, Train Acc: 0.9406, Val Loss: 0.8965, Val Acc: 0.8500


                                                                                               

Epoch [12/20], Train Loss: 0.8055, Train Acc: 0.9400, Val Loss: 0.9014, Val Acc: 0.8425


                                                                                               

Epoch [13/20], Train Loss: 0.8018, Train Acc: 0.9437, Val Loss: 0.9025, Val Acc: 0.8325


                                                                                               

Epoch [14/20], Train Loss: 0.7997, Train Acc: 0.9437, Val Loss: 0.9034, Val Acc: 0.8350


                                                                                               

Epoch [15/20], Train Loss: 0.7987, Train Acc: 0.9469, Val Loss: 0.9057, Val Acc: 0.8275


                                                                                               

Epoch [16/20], Train Loss: 0.7972, Train Acc: 0.9463, Val Loss: 0.9031, Val Acc: 0.8400


                                                                                               

Epoch [17/20], Train Loss: 0.7975, Train Acc: 0.9463, Val Loss: 0.9025, Val Acc: 0.8375


                                                                                               

Epoch [18/20], Train Loss: 0.7976, Train Acc: 0.9456, Val Loss: 0.9083, Val Acc: 0.8275


                                                                                               

Epoch [19/20], Train Loss: 0.7950, Train Acc: 0.9494, Val Loss: 0.9080, Val Acc: 0.8350


                                                                                               

Epoch [20/20], Train Loss: 0.7927, Train Acc: 0.9513, Val Loss: 0.9010, Val Acc: 0.8425


In [80]:
del model
model = TransformerModel(vocab_size, embedding_size, num_heads, hidden_size, num_layers, num_classes, dropout).to(device)
model.load_state_dict(torch.load("model.pth"))

<All keys matched successfully>

In [81]:
import csv

def test(dataloader, model, device):
    model.eval()                               
    y_hats = []
    for x, _ in dataloader:                         
        x = x.to(device)                       
        with torch.no_grad():                   
            y_hat = model(x)                     
            y_hats.append(y_hat.detach().cpu())
    y_hats = torch.cat(y_hats, dim=0)
    _, predicted = torch.max(y_hats, 1)  
    predicted = (predicted+1).numpy()     
    return predicted

def save_y_hat(y_hats, file):
    """ Save predictions to specified file """
    print("Saving results to {}".format(file))
    with open("submission.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["id", "category"])
        for i, y in enumerate(y_hats, start=1):
            writer.writerow([i, y])

In [82]:
y_hats = test(test_dataloader, model, device) 
save_y_hat(y_hats, "submission.csv") 

Saving results to submission.csv
