# *Vietnamese News Industry Classification with XLM-RoBERTa*
## This notebook fine-tunes XLM-RoBERTa for classifying the industry of Vietnamese news summaries.
* Dataset: 17002 samples

In [1]:
# Install required packages
!pip install -q transformers datasets evaluate accelerate scikit-learn pandas matplotlib seaborn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import shutil
import zipfile
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import logging

2025-07-24 14:51:54.131906: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753368714.415410      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753368714.500708      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Configuration
class Config:
    MODEL_NAME = "xlm-roberta-base"
    SEED = 42
    BATCH_SIZE = 8  # Reduced to avoid GPU memory issues
    GRADIENT_ACCUMULATION_STEPS = 2  # Adjusted for effective batch size of 16
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 10
    MAX_LENGTH = 256
    WEIGHT_DECAY = 0.01
    OUTPUT_DIR = "./xlm-roberta-industry-complete"
    LOGGING_STEPS = 10  # Increased frequency for better monitoring
    SAVE_TOTAL_LIMIT = 2
    INDUSTRY_MAP = {'Finance': 0, 'Technology': 1, 'Healthcare': 2, 'Energy': 3, 'Other': 4}
    REVERSE_INDUSTRY_MAP = {0: 'Finance', 1: 'Technology', 2: 'Healthcare', 3: 'Energy', 4: 'Other'}
    EARLY_STOPPING_PATIENCE = 3
    LR_SCHEDULER_TYPE = "cosine"
    WARMUP_RATIO = 0.1
    USE_CLASS_WEIGHTS = True
    DATA_PATH = "/kaggle/input/data-news-v2/data_news_v2.xlsx"

config = Config()

# Create output directory
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

In [4]:
# Set up logging with error handling
try:
    logging.basicConfig(
        filename=os.path.join(config.OUTPUT_DIR, 'training.log'),
        level=logging.DEBUG,  # Increased verbosity
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)
    logger.info("Logging initialized successfully")
except PermissionError:
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)
    logger.info("Fallback to console logging due to permission error for file: %s", os.path.join(config.OUTPUT_DIR, 'training.log'))
except Exception as e:
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)
    logger.info("Logging setup failed with error: %s. Fallback to console logging.", str(e))

# Set random seed
torch.manual_seed(config.SEED)
np.random.seed(config.SEED)

In [5]:
# Custom Trainer with class weights
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        labels = labels.long()
        
        if self.class_weights is not None:
            weights = torch.tensor(self.class_weights, device=logits.device, dtype=torch.float32)
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
            
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [6]:
# Load and explore dataset
def load_and_explore_data(file_path):
    logger.info("Loading dataset...")
    df = pd.read_excel(file_path)
    
    df['label'] = df['industry'].map(config.INDUSTRY_MAP)
    
    with open(os.path.join(config.OUTPUT_DIR, 'dataset_info.txt'), 'w') as f:
        f.write(f"Total samples: {len(df)}\n")
        f.write("\nIndustry distribution:\n")
        f.write(df['industry'].value_counts().to_string())
    
    plt.figure(figsize=(8, 5))
    class_dist = df['industry'].value_counts()
    sns.barplot(x=class_dist.index, y=class_dist.values)
    plt.title('Industry Distribution')
    plt.ylabel('Count')
    plt.savefig(os.path.join(config.OUTPUT_DIR, 'industry_distribution.png'))
    plt.close()
    
    df['text_length'] = df['summary'].apply(lambda x: len(x.split()))
    
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    sns.histplot(df['text_length'], bins=30)
    plt.title('Text Length Distribution')
    
    plt.subplot(1, 2, 2)
    sns.boxplot(x='industry', y='text_length', data=df)
    plt.title('Text Length by Industry')
    plt.savefig(os.path.join(config.OUTPUT_DIR, 'text_length_distribution.png'))
    plt.close()
    
    return df

df = load_and_explore_data(config.DATA_PATH)
df = df.dropna()

if config.USE_CLASS_WEIGHTS:
    class_weights = compute_class_weight(
        'balanced', 
        classes=np.unique(df['label']),
        y=df['label']
    )
    config.CLASS_WEIGHTS = class_weights.tolist()
    logger.info(f"Class weights: {config.CLASS_WEIGHTS}")
else:
    config.CLASS_WEIGHTS = None

tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

  with pd.option_context('mode.use_inf_as_na', True):


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
# Analyze token lengths
def analyze_token_lengths(texts, tokenizer, max_length):
    lengths = []
    for text in texts:
        tokens = tokenizer(text, truncation=True, max_length=max_length)["input_ids"]
        lengths.append(len(tokens))
    return lengths

token_lengths = analyze_token_lengths(df['summary'], tokenizer, config.MAX_LENGTH)

plt.figure(figsize=(10, 5))
sns.histplot(token_lengths, bins=30)
plt.title('Token Length Distribution')
plt.axvline(x=config.MAX_LENGTH, color='r', linestyle='--', label='Max Length')
plt.legend()
plt.savefig(os.path.join(config.OUTPUT_DIR, 'token_length_distribution.png'))
plt.close()

logger.info(f"Percentage of texts within max length: {sum(np.array(token_lengths) <= config.MAX_LENGTH) / len(token_lengths):.2%}")

  with pd.option_context('mode.use_inf_as_na', True):


In [8]:
# Preprocess function
def preprocess_function(examples):
    return tokenizer(
        examples["summary"],
        truncation=True,
        max_length=config.MAX_LENGTH,
        padding="max_length"
    )

# Split data
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=config.SEED,
    stratify=df['label']
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=config.SEED,
    stratify=temp_df['label']
)

train_dataset = Dataset.from_pandas(train_df[['summary', 'industry', 'label']])
val_dataset = Dataset.from_pandas(val_df[['summary', 'industry', 'label']])
test_dataset = Dataset.from_pandas(test_df[['summary', 'industry', 'label']])

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["summary", "industry"]
)

with open(os.path.join(config.OUTPUT_DIR, 'data_splits.txt'), 'w') as f:
    f.write(f"Train samples: {len(train_df)}\n")
    f.write(f"Validation samples: {len(val_df)}\n")
    f.write(f"Test samples: {len(test_df)}\n")

model = AutoModelForSequenceClassification.from_pretrained(
    config.MODEL_NAME,
    num_labels=5
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/7477 [00:00<?, ? examples/s]

Map:   0%|          | 0/935 [00:00<?, ? examples/s]

Map:   0%|          | 0/935 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    f1_micro = f1_score(labels, predictions, average='micro')
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')
    
    report = classification_report(
        labels,
        predictions,
        target_names=['Finance', 'Technology', 'Healthcare', 'Energy', 'Other'],
        output_dict=True
    )
    
    metrics = {
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'finance_precision': report['Finance']['precision'],
        'finance_recall': report['Finance']['recall'],
        'finance_f1': report['Finance']['f1-score'],
        'technology_precision': report['Technology']['precision'],
        'technology_recall': report['Technology']['recall'],
        'technology_f1': report['Technology']['f1-score'],
        'healthcare_precision': report['Healthcare']['precision'],
        'healthcare_recall': report['Healthcare']['recall'],
        'healthcare_f1': report['Healthcare']['f1-score'],
        'energy_precision': report['Energy']['precision'],
        'energy_recall': report['Energy']['recall'],
        'energy_f1': report['Energy']['f1-score'],
        'other_precision': report['Other']['precision'],
        'other_recall': report['Other']['recall'],
        'other_f1': report['Other']['f1-score']
    }
    
    logger.info(f"Evaluation metrics: {metrics}")
    
    return metrics

In [10]:
# Training arguments
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    run_name=f"xlm-roberta-industry-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=config.LOGGING_STEPS,
    save_steps=100,
    save_total_limit=config.SAVE_TOTAL_LIMIT,
    learning_rate=config.LEARNING_RATE,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=config.NUM_EPOCHS,
    weight_decay=config.WEIGHT_DECAY,
    lr_scheduler_type=config.LR_SCHEDULER_TYPE,
    warmup_ratio=config.WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    fp16=True,
    logging_dir="./logs",
    seed=config.SEED,
    report_to="none",  # Disable wandb logging
    log_level="debug"  # Increase logging verbosity
)

# Initialize Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=config.CLASS_WEIGHTS,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=config.EARLY_STOPPING_PATIENCE)]
)

  super().__init__(**kwargs)
Using auto half precision backend


In [11]:
# Check GPU memory before training
logger.info(f"GPU available: {torch.cuda.is_available()}")
logger.info(f"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB allocated, {torch.cuda.memory_reserved() / 1024**3:.2f} GB reserved")
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB allocated, {torch.cuda.memory_reserved() / 1024**3:.2f} GB reserved")

# Start training with error handling
try:
    logger.info("Starting training...")
    print("Starting training...")
    train_result = trainer.train()
except Exception as e:
    logger.error(f"Training failed with error: {str(e)}")
    print(f"Training failed with error: {str(e)}")
    raise e

# Save training metrics
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
logger.info(f"Training metrics: {metrics}")

# Save the final model
trainer.save_model(config.OUTPUT_DIR)
tokenizer.save_pretrained(config.OUTPUT_DIR)
logger.info(f"Model saved to {config.OUTPUT_DIR}")

# Save training arguments
trainer.save_state()

Currently training with a batch size of: 8
The following columns in the Training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7,477
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 4,680
  Number of trainable parameters = 278,047,493


GPU available: False
GPU memory: 0.00 GB allocated, 0.00 GB reserved
Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Finance Precision,Finance Recall,Finance F1,Technology Precision,Technology Recall,Technology F1,Healthcare Precision,Healthcare Recall,Healthcare F1,Energy Precision,Energy Recall,Energy F1,Other Precision,Other Recall,Other F1
100,1.5165,1.489688,0.595722,0.595722,0.344259,0.551558,0.573311,0.881013,0.694611,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.428571,0.461538,0.684426,0.481268,0.565144
200,1.0123,0.869489,0.73369,0.73369,0.722837,0.738137,0.826211,0.734177,0.77748,0.464286,0.764706,0.577778,0.904762,0.703704,0.791667,0.666667,0.857143,0.75,0.741538,0.694524,0.717262
300,0.8484,0.78264,0.779679,0.779679,0.75728,0.774521,0.754274,0.893671,0.818076,0.804878,0.485294,0.605505,0.741935,0.851852,0.793103,0.82,0.836735,0.828283,0.80678,0.685879,0.741433
400,0.7305,0.581327,0.760428,0.760428,0.736755,0.762479,0.863222,0.718987,0.78453,0.571429,0.705882,0.631579,0.54,1.0,0.701299,0.735537,0.908163,0.812785,0.749288,0.757925,0.753582
500,0.6616,0.648718,0.744385,0.744385,0.701548,0.750733,0.858859,0.724051,0.785714,0.568421,0.794118,0.662577,0.376812,0.962963,0.541667,0.692913,0.897959,0.782222,0.778135,0.697406,0.735562
600,0.7035,0.651904,0.742246,0.742246,0.742839,0.740769,0.904215,0.597468,0.719512,0.666667,0.647059,0.656716,0.577778,0.962963,0.722222,0.892473,0.846939,0.86911,0.648936,0.878963,0.746634


The following columns in the Evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 935
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./xlm-roberta-industry-complete/checkpoint-100
Configuration saved in ./xlm-roberta-industry-complete/checkpoint-100/config.json
Model weights saved in ./xlm-roberta-industry-complete/checkpoint-100/model.safetensors
tokenizer config file saved in ./xlm-roberta-industry-complete/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./xlm-roberta-industry-complete/checkpoint-100/special_tokens_map.json
The following columns in 

In [12]:
# Evaluate on test set
logger.info("Evaluating on test set...")
print("Evaluating on test set...")
test_results = trainer.evaluate(
    tokenized_datasets["test"],
    metric_key_prefix="test"
)

# Save evaluation results
with open(os.path.join(config.OUTPUT_DIR, 'test_results.txt'), 'w') as f:
    for key, value in test_results.items():
        f.write(f"{key}: {value}\n")

logger.info("\n=== Test Results ===")
print("\n=== Test Results ===")
for key, value in test_results.items():
    if key.startswith("test_"):
        logger.info(f"{key[5:]}: {value}")
        print(f"{key[5:]}: {value}")

The following columns in the Evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 935
  Batch size = 8


Evaluating on test set...


early stopping required metric_for_best_model, but did not find eval_f1_macro so early stopping is disabled



=== Test Results ===
loss: 0.75778728723526
accuracy: 0.7893048128342246
f1_micro: 0.7893048128342246
f1_macro: 0.7590246165978148
f1_weighted: 0.7862022694356815
finance_precision: 0.795045045045045
finance_recall: 0.8914141414141414
finance_f1: 0.8404761904761905
technology_precision: 0.8235294117647058
technology_recall: 0.6176470588235294
technology_f1: 0.7058823529411765
healthcare_precision: 0.6451612903225806
healthcare_recall: 0.7407407407407407
healthcare_f1: 0.689655172413793
energy_precision: 0.7980769230769231
energy_recall: 0.8469387755102041
energy_f1: 0.821782178217822
other_precision: 0.7868852459016393
other_recall: 0.6936416184971098
other_f1: 0.7373271889400921
runtime: 378.1333
samples_per_second: 2.473
steps_per_second: 0.309


In [13]:
# Sample predictions function
def predict_industry(text):
    inputs = tokenizer(
        text,
        max_length=config.MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(trainer.model.device)
    
    with torch.no_grad():
        outputs = trainer.model(**inputs)
    
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_class = torch.argmax(probs).item()
    
    return {
        "industry": config.REVERSE_INDUSTRY_MAP[pred_class],
        "confidence": probs[0][pred_class].item(),
        "probabilities": {
            "Finance": probs[0][0].item(),
            "Technology": probs[0][1].item(),
            "Healthcare": probs[0][2].item(),
            "Energy": probs[0][3].item(),
            "Other": probs[0][4].item()
        }
    }

# Test on some samples and save predictions
sample_texts = df.sample(5, random_state=config.SEED)["summary"].tolist()
with open(os.path.join(config.OUTPUT_DIR, 'sample_predictions.txt'), 'w') as f:
    for i, text in enumerate(sample_texts):
        result = predict_industry(text)
        actual = df[df['summary'] == text]['industry'].values[0]
        
        f.write(f"\n=== Sample {i+1} ===\n")
        f.write(f"\nText: {text}\n")
        f.write(f"\nPredicted Industry: {result['industry']} (Confidence: {result['confidence']:.2f})\n")
        f.write(f"Probabilities: {result['probabilities']}\n")
        f.write(f"Actual Industry: {actual}\n")
        
        logger.info(f"Sample {i+1} - Predicted: {result['industry']}, Actual: {actual}")
        print(f"\n=== Sample {i+1} ===")
        print(f"\nText: {text}")
        print(f"\nPredicted Industry: {result['industry']} (Confidence: {result['confidence']:.2f})")
        print(f"Probabilities: {result['probabilities']}")
        print(f"Actual Industry: {actual}")


=== Sample 1 ===

Text: Chủ tịch Hội đồng thành viên EVN tiếp và làm việc với lãnh đạo KEPCO về ứng dụng công nghệ thông minh và phát triển nguồn điện hạt nhân. Các bên muốn tăng cường hợp tác, chia sẻ kinh nghiệm để phát triển dự án điện. EVN đề xuất thành lập tổ công tác chuyên môn. KEPCO đang quản lý 83GW và tham gia đầu tư dự án BOT tại Việt Nam. Nhiều đối tác nước ngoài muốn hợp tác với Việt Nam trong dự án điện hạt nhân Ninh Thuận, bao gồm Mỹ, Hàn Quốc, Nga, Nhật, Trung Quốc, Pháp.

Predicted Industry: Energy (Confidence: 0.94)
Probabilities: {'Finance': 0.013467533513903618, 'Technology': 0.01810125820338726, 'Healthcare': 0.009299561381340027, 'Energy': 0.9440290331840515, 'Other': 0.015102625824511051}
Actual Industry: Energy

=== Sample 2 ===

Text: Từ ngày 1/7, Việt Nam chỉ còn 34 tỉnh, thành phố sau khi sắp xếp lại hành chính từ 63 đơn vị trước đây. Việc này nhằm tạo điều kiện thuận lợi hơn cho các địa phương thu hút vốn đầu tư trực tiếp nước ngoài (FDI). Các “thủ phủ” mới

In [14]:
# Create zip file of all outputs
def zip_output_folder(output_dir):
    zip_path = os.path.join(output_dir, 'output.zip')
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(output_dir):
            for file in files:
                if file != 'output.zip':
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, output_dir)
                    zipf.write(file_path, arcname)
    return zip_path

output_zip = zip_output_folder(config.OUTPUT_DIR)
logger.info(f"Created zip file at: {output_zip}")

print("Training complete! Download the results:")
from IPython.display import FileLink
FileLink(output_zip)

logger.info("Training process completed successfully")

Training complete! Download the results:
