# 1.Import libraries

In [1]:
# import libraries and dataset
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


# 2.Preprocessing


### 2.1  Load dataset

In [3]:
data= fetch_20newsgroups(subset='all',  shuffle=True, random_state=46)

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space'
]
data_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
    random_state=42

)
data_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42

)
type(data_train)


# print(data.train)
# print(data.target_names[0])

In [4]:
print(data_train.keys()) # check attribute

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [5]:
print(data_train.target_names)

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']


In [6]:
print(data_train.data[2])

From: Mark.Perew@p201.f208.n103.z1.fidonet.org
Subject: Re: Comet in Temporary Orbit Around Jupiter?
X-Sender: newtout 0.08 Feb 23 1993
Lines: 15

In a message of <Apr 19 04:55>, jgarland@kean.ucs.mun.ca writes:

 >In article <1993Apr19.020359.26996@sq.sq.com>, msb@sq.sq.com (Mark Brader) 
 >writes:

MB>                                                             So the
MB> 1970 figure seems unlikely to actually be anything but a perijove.

JG>Sorry, _perijoves_...I'm not used to talking this language.

Couldn't we just say periapsis or apoapsis?

 

--- msged 2.07



In [8]:
print(data_train.target)

[1 3 2 ... 1 0 1]


In [None]:
print(type(data_train))  # check type

<class 'sklearn.utils._bunch.Bunch'>


### 2.2 text preporcessing

In [9]:
data_train_category = []
for i in data_train.target:
    data_train_category.append(data_train.target_names[i])
print(data_train_category[:5])


data_test_category=[]
for i in data_test.target:
    data_test_category.append(data_test.target_names[i])
print(data_test_category[:5])

['comp.graphics', 'talk.religion.misc', 'sci.space', 'alt.atheism', 'sci.space']
['sci.space', 'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics']


In [10]:
# turn it into a dataframe

data_df=pd.DataFrame(data.target,columns=['Category_No'])

data_train_df = pd.DataFrame(data_train.target, columns=["label"])
data_test_df = pd.DataFrame(data_test.target, columns=["label"])


# data_df['Category_No'].value_counts()    # 了解每個類別的數量
data_df['Text'] = data.data    # 將文字放入dataframe
data_train_df['Text']=data_train.data
data_test_df['Text'] = data_test.data


# data_df['Category_Name']=data_category  #將類別名稱放入dataframe

data_train_df["category"] = data_train_category
data_test_df["category"] = data_test_category

# data_train_df=data_train_df[:16000]
data_train_df.shape[1]  # the number of columns
data_train_df.shape[0] # the number of rows
# data_test_df.shape[0]
# data_df.shape[0]

# data_train_df['Text'].value_counts()  # counting the text

# print(data_train)


2034

In [54]:
# text preprocessing
class TextPreprocessor:
    def __init__(self):
        self.vocab = {"<PAD>": 0, "<UNK>": 1}
        self.word_to_idx = self.vocab
        self.idx_to_word = {0: "<PAD>", 1: "<UNK>"}

    def tokenize(self, text):
        return text.lower().split()

    # **** 關鍵改進1: min_freq=5 (從3改為5，大幅減少詞彙) ****
    def build_vocab(self, texts, min_freq=4):
        word_counts = Counter()
        for text in texts:
            words = self.tokenize(text)
            word_counts.update(words)

        idx = 2
        for word, count in word_counts.items():
            if count >= min_freq:
                self.vocab[word] = idx
                self.idx_to_word[idx] = word
                idx += 1

        print(f"Vocabulary size: {len(self.vocab)}")
        return self.vocab

    def text_to_sequence(self, text, max_len):
        words = self.tokenize(text)
        sequence = [self.vocab.get(word, 1) for word in words]
        if len(sequence) < max_len:
            sequence += [0] * (max_len - len(sequence))
        else:
            sequence = sequence[:max_len]
        return sequence


In [55]:
# Dataset using DataFrame
class Dataset(Dataset):
    def __init__(self, df, preprocessor, max_len=30):
        self.df = df
        self.preprocessor = preprocessor
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["Text"]
        label = row["label"]
        sequence = self.preprocessor.text_to_sequence(text, self.max_len)
        return torch.LongTensor(sequence), torch.LongTensor([label])


In [None]:
# TextCNN Model - 

class TextCNN(nn.Module):
    def __init__(
        self, vocab_size, embed_dim, num_filters, filter_sizes, num_classes, dropout=0.5
    ):
        super(TextCNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

       
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, kernel_size=f, padding=f-1)
            for f in filter_sizes
        ])

       
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(num_filters)
            for _ in filter_sizes
        ])

        self.dropout = nn.Dropout(dropout)

      
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

        self.relu = nn.ReLU()

    def forward(self, x):
        # Embedding
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)
        embedded = self.dropout(embedded)
        embedded = embedded.transpose(1, 2)  # (batch, embed_dim, seq_len)

        # ****Conv1d + BatchNorm + ReLU ****
        conv_results = []
        for conv, bn in zip(self.convs, self.bns):
            c = conv(embedded)  # (batch, num_filters, seq_len)
            c = bn(c)  # Batch Norm
            c = self.relu(c)
            p = torch.max_pool1d(c, c.size(2)).squeeze(2)  # (batch, num_filters)
            conv_results.append(p)

  
        cat = torch.cat(conv_results, 1)  # (batch, num_filters * len(filter_sizes))
        cat = self.dropout(cat)

    
        out = self.fc(cat)

        return out




In [None]:
# train model

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, num_classes):
    model.train()
    best_val_acc = 0
    patience = 5
    patience_counter = 0

    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0

        for batch_texts, batch_labels in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels.squeeze())

            loss.backward()

          
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels.squeeze()).sum().item()

        train_acc = 100 * correct / total
        train_loss = total_loss / len(train_loader)

        # Validation phase
        val_loss, val_acc = evaluate_model(model, val_loader, criterion)

        print(
            f"Epoch [{epoch+1}/{num_epochs}] "
            f"Train Loss: {train_loss:.4f} Train Acc: {train_acc:.2f}% | "
            f"Val Loss: {val_loss:.4f} Val Acc: {val_acc:.2f}%"
        )

    
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\n✓ early stoppping at Epoch {epoch+1}")
                model.load_state_dict(torch.load('best_model.pt'))
                break


def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_texts, batch_labels in dataloader:
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels.squeeze())
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels.squeeze()).sum().item()
    avg_loss = total_loss / len(dataloader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy


def predict_news(text, model, preprocessor, max_len=30):
    model.eval()
    with torch.no_grad():
        seq = preprocessor.text_to_sequence(text, max_len)
        x = torch.LongTensor(seq).unsqueeze(0)
        out = model(x)
        prob = torch.softmax(out, dim=1)
        cls = torch.argmax(prob, dim=1).item()
        return cls, prob[0][cls].item()




In [None]:

label_test = data_test.target
label_train = data_train.target
category_names_test = data_test.target_names
list_category = sorted(zip(label_test, category_names_test))
category_names = {i: category_names_test for i, (_, category_names_test) in enumerate(list_category)}

print("Building vocabulary from training data...")
preprocessor = TextPreprocessor()

vocab = preprocessor.build_vocab(data_train_df["Text"], min_freq=4)

# Create datasets
train_dataset = Dataset(data_train_df, preprocessor, max_len=30)
test_dataset = Dataset(data_test_df, preprocessor, max_len=30)


train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

model = TextCNN(len(vocab), embed_dim=100, num_filters=100, filter_sizes=[2, 3, 4], num_classes=4, dropout=0.5)

criterion = nn.CrossEntropyLoss()


optimizer = optim.Adam(model.parameters(), lr=0.00025, weight_decay=0.0001)

# Train model
print("\nTraining model...")
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=30, num_classes=4)

# Evaluate on test set
print("\n" + "=" * 60)
print("Final Evaluation on Test Set:")
test_loss, test_acc = evaluate_model(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} Test Accuracy: {test_acc:.2f}%")
print("=" * 60)



Building vocabulary from training data...
Vocabulary size: 14108

Training model...
Epoch [1/30] Train Loss: 2.0883 Train Acc: 27.53% | Val Loss: 1.3100 Val Acc: 40.80%
Epoch [2/30] Train Loss: 1.1569 Train Acc: 56.15% | Val Loss: 1.1187 Val Acc: 54.18%
Epoch [3/30] Train Loss: 0.8844 Train Acc: 72.91% | Val Loss: 0.9851 Val Acc: 63.86%
Epoch [4/30] Train Loss: 0.7055 Train Acc: 82.69% | Val Loss: 0.9117 Val Acc: 65.78%
Epoch [5/30] Train Loss: 0.5666 Train Acc: 88.00% | Val Loss: 0.8237 Val Acc: 70.88%
Epoch [6/30] Train Loss: 0.4590 Train Acc: 91.20% | Val Loss: 0.7555 Val Acc: 74.28%
Epoch [7/30] Train Loss: 0.3719 Train Acc: 94.20% | Val Loss: 0.7420 Val Acc: 72.88%
Epoch [8/30] Train Loss: 0.3012 Train Acc: 95.38% | Val Loss: 0.6975 Val Acc: 74.35%
Epoch [9/30] Train Loss: 0.2437 Train Acc: 96.85% | Val Loss: 0.6805 Val Acc: 74.87%
Epoch [10/30] Train Loss: 0.1976 Train Acc: 97.69% | Val Loss: 0.6198 Val Acc: 78.57%
Epoch [11/30] Train Loss: 0.1587 Train Acc: 98.72% | Val Loss: 0.