In [1]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Seed PyTorch and NumPy with the same value so reruns give consistent results.
for _seed_fn in (torch.manual_seed, np.random.seed):
    _seed_fn(42)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [2]:
%pip install transformers==4.40.0


Note: you may need to restart the kernel to use updated packages.


# Đọc và xử lý dữ liệu

In [3]:
print("Loading data...")
df = pd.read_csv('clean_news.csv')

print(f"Original data shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Drop rows that are missing the cleaned text or the label.
# .copy() makes the result an independent frame so the column assignments
# below never write into a view of the raw data.
df = df.dropna(subset=["clean_text", "label"]).copy()
print(f"After dropping NaN: {df.shape}")

# Normalise the text column to stripped strings and remove rows that end up empty.
df["clean_text"] = df["clean_text"].astype(str).str.strip()
df = df[df["clean_text"] != ""].copy()
print(f"After removing empty text: {df.shape}")

# Labels are used as integer class ids downstream.
df["label"] = df["label"].astype(int)

# Summary statistics.
print(f"Label distribution:\n{df['label'].value_counts()}")
if df["label"].nunique() < 2:
    # A classifier cannot learn anything from a single class; make the problem
    # visible here instead of letting training report a meaningless 100% accuracy.
    print(
        "WARNING: fewer than two distinct labels remain after cleaning. "
        "Check which rows were dropped for missing 'clean_text' or 'label'."
    )
print(f"Sample texts:")
for label, text in zip(df["label"].head(3), df["clean_text"].head(3)):
    print(f"Label {label}: {text[:100]}...")

Loading data...
Original data shape: (8697, 10)
Columns: ['text', 'label', 'id', 'user_name', 'post_message', 'timestamp_post', 'num_like_post', 'num_comment_post', 'num_share_post', 'clean_text']
After dropping NaN: (4225, 10)
After removing empty text: (4225, 10)
Label distribution:
label
0    4225
Name: count, dtype: int64
Sample texts:
Label 0: B√°c_sƒ© TP HCM th·∫Øng gi·∫£i nhi·∫øp_·∫£nh th·∫ø_gi·ªõi ƒê·∫ßu th√°ng 7 b√°c_sƒ© Ho√†i_Anh h√°o_h·ª©c ch·ªù ƒë√≥n chi·∫øc c√∫p Nh...
Label 0: 11 c√°n_b·ªô ph·∫£i quay l·∫°i l√†m_vi·ªác sau n·ª≠a nƒÉm ngh·ªâ h∆∞u Ng√†y 276 l√£nh_ƒë·∫°o huy·ªán Qu·ª≥nh_L∆∞u cho bi·∫øt ƒë√£ ...
Label 0: Th√™m h∆°n 210000 ng∆∞·ªùi Nga ƒëƒÉng_k√Ω tham_chi·∫øn ·ªü Ukraine_ƒê√¢y l√† th√†nh_qu·∫£ ph·ªëi_h·ª£p c·ªßa t·∫•t_c·∫£ c√°c c∆°_q...


# Tách dữ liệu train/validation

In [4]:
# Plain Python lists are what the tokenizer and the split helper expect.
labels = df["label"].tolist()
texts = df["clean_text"].tolist()

print(f"Total samples: {len(texts)}")
print(f"Unique labels: {set(labels)}")

# 80/20 split, stratified on the label so both parts keep the same class mix.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

print(f"Train samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
train_distribution = pd.Series(train_labels).value_counts().to_dict()
val_distribution = pd.Series(val_labels).value_counts().to_dict()
print(f"Train label distribution: {train_distribution}")
print(f"Validation label distribution: {val_distribution}")

Total samples: 4225
Unique labels: {0}
Train samples: 3380
Validation samples: 845
Train label distribution: {0: 3380}
Validation label distribution: {0: 845}


# Tokenization với PhoBERT

In [5]:
print("Loading PhoBERT tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

# Both splits are encoded with exactly the same settings: pad to the longest
# sequence in the call, truncate at 256 tokens, and return PyTorch tensors.
encode_options = dict(
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="pt",
)

print("Tokenizing training data...")
train_encodings = tokenizer(train_texts, **encode_options)

print("Tokenizing validation data...")
val_encodings = tokenizer(val_texts, **encode_options)

print(f"Training encodings shape: {train_encodings['input_ids'].shape}")
print(f"Validation encodings shape: {val_encodings['input_ids'].shape}")

Loading PhoBERT tokenizer...
Tokenizing training data...
Tokenizing validation data...
Training encodings shape: torch.Size([3380, 256])
Validation encodings shape: torch.Size([845, 256])


# Tạo Dataset

In [6]:

class NewsDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with integer labels.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to a tensor
            whose first dimension indexes the samples.
        labels: sequence of class ids, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors plus a scalar ``labels`` tensor."""
        item = {}
        for field, values in self.encodings.items():
            # Copy the row so callers cannot modify the stored encodings.
            item[field] = values[idx].detach().clone()
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each split's encodings and labels in a NewsDataset.
train_dataset, val_dataset = (
    NewsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# Inspect one sample to confirm its fields, shapes and label dtype.
sample = train_dataset[0]
print(
    f"Sample keys: {sample.keys()}",
    f"Sample input_ids shape: {sample['input_ids'].shape}",
    f"Sample label: {sample['labels']}",
    f"Sample label dtype: {sample['labels'].dtype}",
    f"Sample label shape: {sample['labels'].shape}",
    sep="\n",
)

Train dataset size: 3380
Validation dataset size: 845
Sample keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
Sample input_ids shape: torch.Size([256])
Sample label: 0
Sample label dtype: torch.int64
Sample label shape: torch.Size([])


# Tạo và cấu hình mô hình

In [7]:
# The size of the classification head is taken from the distinct labels in the data.
# NOTE(review): when the data contains a single label this builds a 1-logit head
# (the recorded output shows "Number of labels: 1" and logits of shape [1, 1]);
# confirm the dataset really contains every class before training.
num_labels = len(set(labels))
print(f"Number of labels: {num_labels}")
print(f"Unique labels: {sorted(set(labels))}")

print("Loading PhoBERT model...")
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/phobert-base",
    num_labels=num_labels,
    problem_type="single_label_classification"
)
# Keep the model on the same device as the batches built below; without this
# the forward pass fails with a device mismatch whenever `device` is a GPU.
model.to(device)

print(f"Model loaded successfully!")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Smoke test: push one sample (with a batch dimension added) through the model.
print("Testing forward pass...")
test_batch = next(iter(train_dataset))
test_batch = {k: v.unsqueeze(0).to(device) for k, v in test_batch.items()}
with torch.no_grad():
    outputs = model(**test_batch)
    print(f"Output shape: {outputs.logits.shape}")
    print(f"Expected shape: [1, {num_labels}]")

Number of labels: 1
Unique labels: [0]
Loading PhoBERT model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!
Model parameters: 134,999,041
Testing forward pass...
Output shape: torch.Size([1, 1])
Expected shape: [1, 1]


# Cấu hình training arguments

In [8]:
from transformers import TrainingArguments

# Hyper-parameters for the Trainer run.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Save and evaluate on the same step schedule, as required by
    # load_best_model_at_end.
    save_strategy="steps",
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # The string "none" disables every logging integration. Passing None does
    # not: in this transformers version it falls back to all installed ones.
    report_to="none",
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    fp16=False,
    dataloader_num_workers=0,
    # Effective batch size = 4 (per device) x 4 (accumulation steps).
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
)

print("‚úÖ TrainingArguments ƒë√£ ƒë∆∞·ª£c t·∫°o th√†nh c√¥ng!")


‚úÖ TrainingArguments ƒë√£ ƒë∆∞·ª£c t·∫°o th√†nh c√¥ng!


In [9]:

# Print the installed transformers version to confirm which release is active in this kernel.
import transformers
print(transformers.__version__)


4.40.0


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
import torch

# Reload the tokenizer and model from the directory they were saved to.
# A relative path is used (the same "saved_model/" directory this cell writes
# to at the end) so the notebook does not depend on one machine's absolute path.
model_dir = "saved_model/"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Training device: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training configuration used when continuing training.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="steps",
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # "none" (the string) disables logging integrations; None would not.
    report_to="none",
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    fp16=False,
    dataloader_num_workers=0,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
)

# Build the evaluation dataset here in case it has not been created yet.
eval_dataset = NewsDataset(val_encodings, val_labels)

# Create the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Continue training.
print("=== Ti·∫øp t·ª•c hu·∫•n luy·ªán b·∫±ng Trainer ===")
try:
    trainer.train()

except KeyboardInterrupt:
    # Manual interruption: keep a partial checkpoint before moving on.
    print("‚õî ƒê√£ d·ª´ng gi·ªØa ch·ª´ng. L∆∞u model t·∫°m th·ªùi...")
    model.save_pretrained("saved_model_partial/")
    print("‚úÖ Hu·∫•n luy·ªán ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°m th·ªùi!")

except Exception as e:
    # Best-effort fallback: report the Trainer failure, then run a plain
    # PyTorch training loop on the same datasets.
    print(f"‚ùå L·ªói Trainer: {e}")
    print("‚è≥ ƒêang chuy·ªÉn sang hu·∫•n luy·ªán th·ªß c√¥ng...")

    from torch.utils.data import DataLoader
    from torch.optim import AdamW
    from tqdm import tqdm

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

    for epoch in range(3):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/3"):
            batch = {k: v.to(device) for k, v in batch.items()}
            batch["labels"] = batch["labels"].long()

            # The model computes the loss itself because "labels" is in the batch.
            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"üîÅ Epoch {epoch+1}: Avg Loss = {avg_loss:.4f}")

# Save the final model and tokenizer.
model.save_pretrained("saved_model/")
tokenizer.save_pretrained("saved_model/")
print("üíæ M√¥ h√¨nh ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i 'saved_model/'")


In [10]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Restore the classifier and its tokenizer from the saved directory.
model = AutoModelForSequenceClassification.from_pretrained("saved_model/")
tokenizer = AutoTokenizer.from_pretrained("saved_model/")

# Place the model on the GPU when one is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# Switch to inference mode; as the cell's last expression this also displays the model.
model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "./saved_model"

# Tokenizer saved alongside the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Classifier with an explicit two-class head (binary classification).
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)


In [None]:
# Sanity check: compare the largest token id in the first training batch with
# the tokenizer's vocabulary size.
from torch.utils.data import DataLoader

# The loader has to be built here; no earlier cell defines `train_dataloader`.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False)

for batch in train_dataloader:
    input_ids = batch['input_ids']
    print("Max input_id:", input_ids.max())
    print("Vocab size:", tokenizer.vocab_size)
    break  # only the first batch is inspected


# Tạo Trainer và huấn luyện

In [12]:
# Training with progress tracking, sized for a CPU-only run.
import time
from tqdm import tqdm
import os
import torch

print("Starting CPU-optimized training...")
print(f"Training on {len(train_dataset)} samples")
print(f"Validation on {len(val_dataset)} samples")

# Rough time estimate. It assumes 0.1s per batch of 4 and 3 epochs, which is
# only a placeholder: the recorded run took far longer than this predicts.
estimated_time_per_epoch = len(train_dataset) / 4 * 0.1
total_estimated_time = estimated_time_per_epoch * 3 / 60
print(f"Estimated training time: {total_estimated_time:.1f} minutes")

# Train with the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

start_time = time.time()
print("Starting training...")

try:
    trainer.train()
    training_time = (time.time() - start_time) / 60
    print(f"Training completed in {training_time:.1f} minutes!")

    # Save the model and tokenizer after a successful run.
    trainer.save_model("models/final_model")
    tokenizer.save_pretrained("models/final_model")
    print("Model saved to models/final_model")

except Exception as e:
    # Best-effort fallback: report the Trainer failure, then run a plain
    # PyTorch loop so the notebook still produces a trained model.
    print(f"Training error: {e}")
    print("Falling back to manual training...")

    from torch.utils.data import DataLoader
    from torch.optim import AdamW

    # Batches are moved to `device` below, so the model must be there as well.
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

    for epoch in range(3):
        model.train()
        total_loss = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/3")

        for batch_idx, batch in enumerate(progress_bar):
            batch = {k: v.to(device) for k, v in batch.items()}
            batch['labels'] = batch['labels'].long()

            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Show the current and running-average loss on the progress bar.
            progress_bar.set_postfix({
                'Loss': f'{loss.item():.4f}',
                'Avg Loss': f'{total_loss/(batch_idx+1):.4f}'
            })

        print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader):.4f}")

    # Save the manually trained model. save_pretrained writes the config next
    # to the weights, so the directory can be reloaded with from_pretrained;
    # a bare state_dict file could not be.
    model_save_path = "models/manual_trained_model"
    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Manual training model saved to {model_save_path}")


Starting CPU-optimized training...
Training on 3380 samples
Validation on 845 samples
Estimated training time: 4.2 minutes
Starting training...


  8%|‚ñä         | 50/633 [12:59<2:36:56, 16.15s/it]

{'loss': 0.415, 'grad_norm': 1.2937830686569214, 'learning_rate': 5e-06, 'epoch': 0.24}


 16%|‚ñà‚ñå        | 100/633 [24:13<1:55:28, 13.00s/it]

{'loss': 0.0285, 'grad_norm': 0.13045376539230347, 'learning_rate': 1e-05, 'epoch': 0.47}


 24%|‚ñà‚ñà‚ñé       | 150/633 [35:02<1:42:11, 12.70s/it]

{'loss': 0.0062, 'grad_norm': 0.07929304242134094, 'learning_rate': 9.061913696060039e-06, 'epoch': 0.71}


 32%|‚ñà‚ñà‚ñà‚ñè      | 200/633 [45:54<1:35:56, 13.29s/it]

{'loss': 0.0038, 'grad_norm': 0.05073023587465286, 'learning_rate': 8.123827392120077e-06, 'epoch': 0.95}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 250/633 [56:33<1:18:27, 12.29s/it]

{'loss': 0.0028, 'grad_norm': 0.03435263782739639, 'learning_rate': 7.185741088180113e-06, 'epoch': 1.18}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 300/633 [1:06:49<1:08:25, 12.33s/it]

{'loss': 0.0022, 'grad_norm': 0.02988443896174431, 'learning_rate': 6.2476547842401506e-06, 'epoch': 1.42}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 350/633 [1:17:06<57:45, 12.25s/it]  

{'loss': 0.0019, 'grad_norm': 0.0271089356392622, 'learning_rate': 5.309568480300188e-06, 'epoch': 1.66}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 400/633 [1:27:19<47:39, 12.27s/it]

{'loss': 0.0016, 'grad_norm': 0.023642314597964287, 'learning_rate': 4.3714821763602255e-06, 'epoch': 1.89}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 450/633 [1:37:30<37:11, 12.19s/it]

{'loss': 0.0014, 'grad_norm': 0.020636925473809242, 'learning_rate': 3.4333958724202633e-06, 'epoch': 2.13}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 500/633 [1:47:42<27:21, 12.34s/it]

{'loss': 0.0013, 'grad_norm': 0.019087564200162888, 'learning_rate': 2.4953095684803003e-06, 'epoch': 2.37}


                                                   
 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 500/633 [1:50:23<27:21, 12.34s/it]

{'eval_loss': 0.0008740455377846956, 'eval_runtime': 161.2582, 'eval_samples_per_second': 5.24, 'eval_steps_per_second': 1.315, 'epoch': 2.37}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 550/633 [2:00:41<16:54, 12.23s/it]  

{'loss': 0.0012, 'grad_norm': 0.018626516684889793, 'learning_rate': 1.557223264540338e-06, 'epoch': 2.6}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 600/633 [2:10:52<06:42, 12.19s/it]

{'loss': 0.0012, 'grad_norm': 0.018596813082695007, 'learning_rate': 6.191369606003752e-07, 'epoch': 2.84}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 633/633 [2:17:37<00:00, 13.05s/it]

{'train_runtime': 8257.6918, 'train_samples_per_second': 1.228, 'train_steps_per_second': 0.077, 'train_loss': 0.036960163028648865, 'epoch': 3.0}
Training completed in 137.6 minutes!





In [13]:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
print("Vocab size:", tokenizer.vocab_size)

# Check whether any training sample contains a token id outside the vocabulary.
# The per-sample maximum is computed in one vectorized pass over the stored
# encodings instead of calling Python's max() on every tensor element.
# NOTE(review): the model repr in this notebook shows an embedding table with
# 64001 rows while vocab_size prints 64000, so this check is slightly stricter
# than what the model itself can index.
max_ids = train_dataset.encodings["input_ids"].max(dim=1).values
for max_id in max_ids[max_ids >= tokenizer.vocab_size]:
    print("Found invalid token id:", max_id)


Vocab size: 64000


In [None]:
# Report once if any training sample holds a token id that the model's
# vocabulary (model.config.vocab_size) cannot represent; stops at the first hit.
if any(
    sample["input_ids"].max() >= model.config.vocab_size
    for sample in train_dataset
):
    print("C√≥ token v∆∞·ª£t qu√° vocab_size!")


# Đánh giá mô hình

In [None]:
print(type(trainer))  # The result should be <class 'transformers.trainer.Trainer'>


In [14]:
print("Evaluating model...")

# Loss and runtime statistics on the evaluation dataset.
eval_results = trainer.evaluate()
print(eval_results)


# Class predictions for the validation set: index of the largest logit per row.
print("Making predictions...")
predictions = trainer.predict(val_dataset)
pred_labels = np.asarray(predictions.predictions).argmax(axis=1)

# Overall accuracy.
accuracy = accuracy_score(val_labels, pred_labels)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
report = classification_report(val_labels, pred_labels)
print(report)

# Confusion matrix (true labels vs. predicted labels).
print("\nConfusion Matrix:")
cm = confusion_matrix(val_labels, pred_labels)
print(cm)

# Persist the model together with its tokenizer.
print("\nSaving model...")
trainer.save_model("./best_model")
tokenizer.save_pretrained("./best_model")
print("Model saved to ./best_model/")

Evaluating model...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 212/212 [02:40<00:00,  1.32it/s]


{'eval_loss': 0.0008740455377846956, 'eval_runtime': 161.6071, 'eval_samples_per_second': 5.229, 'eval_steps_per_second': 1.312, 'epoch': 2.996449704142012}
Making predictions...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 212/212 [02:38<00:00,  1.33it/s]


Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       845

    accuracy                           1.00       845
   macro avg       1.00      1.00      1.00       845
weighted avg       1.00      1.00      1.00       845


Confusion Matrix:
[[845]]

Saving model...
Model saved to ./best_model/


In [15]:

# Write the trained model and its tokenizer to ./best_model.
print("Saving model...")
trainer.save_model("./best_model")
tokenizer.save_pretrained("./best_model")
print("Model saved to ./best_model/")

# Round-trip check: the saved directory must be loadable again.
print("Testing model loading...")
loaded_model = AutoModelForSequenceClassification.from_pretrained(
    "./best_model"
)
loaded_tokenizer = AutoTokenizer.from_pretrained(
    "./best_model"
)
print("Model loaded successfully!")

Saving model...
Model saved to ./best_model/
Testing model loading...
Model loaded successfully!


In [16]:
# Check the trained model that was saved to disk.
print("Testing saved model...")

# Load the trained model and tokenizer.
loaded_model = AutoModelForSequenceClassification.from_pretrained("./best_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("./best_model")

# Run a single validation sample through the model, if there is one.
if val_texts:
    test_text = val_texts[0]
    print(f"Test text: {test_text[:100]}...")

    # Encode with the same settings that were used for training.
    inputs = loaded_tokenizer(
        test_text,
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    # Forward pass without gradients; softmax turns logits into probabilities.
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        predictions = outputs.logits.softmax(dim=-1)
        predicted_label = predictions.argmax(dim=-1).item()
        confidence = predictions[0, predicted_label].item()

    expected_label = val_labels[0]
    print(f"Predicted label: {predicted_label}")
    print(f"Confidence: {confidence:.4f}")
    print(f"True label: {expected_label}")
    print(f"Prediction correct: {predicted_label == expected_label}")

Testing saved model...
Test text: Al Nassr l·∫ßn th·ª© t∆∞ thay HLV t·ª´ khi c√≥ Ronaldo_Al Nassr th√¥ng_b√°o Pioli v√† ƒë·ªôi_ng≈© tr·ª£_l√Ω c·ªßa √¥ng s·∫Ω...
Predicted label: 0
Confidence: 0.9992
True label: 0
Prediction correct: True


In [17]:
def predict_news(text, model, tokenizer, device=None):
    """Predict the class label for a single piece of text.

    Args:
        text: the text to classify (one string; the result is read with
            ``.item()``, which requires exactly one prediction).
        model: sequence-classification model whose output exposes ``.logits``.
        tokenizer: callable that turns ``text`` into a mapping of tensors.
        device: device to run on. When given, the model is moved there so it
            always matches the inputs; when omitted, the model's current
            device is used.

    Returns:
        A ``(predicted_label, confidence)`` tuple: the index of the most
        probable class and its softmax probability.
    """
    if device is None:
        # Follow the model instead of forcing a device on it.
        device = next(model.parameters()).device
    else:
        # Inputs are sent to `device` below; the model has to be on the same
        # device or the forward pass fails with a device-mismatch error.
        model.to(device)
    model.eval()

    # Tokenize
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    # Move every input tensor to the target device.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict without tracking gradients.
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs.logits, dim=-1)
        predicted_label = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_label].item()

    return predicted_label, confidence

# Try the prediction helper on a few hand-written examples.
test_texts = [
    "Tin t·ª©c m·ªõi nh·∫•t v·ªÅ t√¨nh h√¨nh kinh t·∫ø Vi·ªát Nam",
    "C·∫≠p nh·∫≠t v·ªÅ d·ªãch b·ªánh COVID-19 t·∫°i H√† N·ªôi",
    "K·∫øt qu·∫£ tr·∫≠n ƒë·∫•u b√≥ng ƒë√° t·ªëi qua",
]

print("Testing prediction function:")
for i in range(len(test_texts)):
    text = test_texts[i]
    result = predict_news(text, loaded_model, loaded_tokenizer, device)
    predicted_label, confidence = result
    print(f"Text {i+1}: {text[:50]}...")
    print(f"  Predicted: {predicted_label}, Confidence: {confidence:.4f}")
    print()

Testing prediction function:
Text 1: Tin t·ª©c m·ªõi nh·∫•t v·ªÅ t√¨nh h√¨nh kinh t·∫ø Vi·ªát Nam...
  Predicted: 0, Confidence: 0.9982

Text 2: C·∫≠p nh·∫≠t v·ªÅ d·ªãch b·ªánh COVID-19 t·∫°i H√† N·ªôi...
  Predicted: 0, Confidence: 0.9986

Text 3: K·∫øt qu·∫£ tr·∫≠n ƒë·∫•u b√≥ng ƒë√° t·ªëi qua...
  Predicted: 0, Confidence: 0.9983

