# *Vietnamese News Industry Classification with XLM-RoBERTa*
## This notebook fine-tunes XLM-RoBERTa for classifying the industry of Vietnamese news summaries.
* Dataset: 5179 samples

In [1]:
# Install required packages
!pip install -q transformers datasets evaluate accelerate scikit-learn pandas matplotlib seaborn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5

In [2]:
import os
import shutil
import zipfile
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import logging

2025-07-24 14:08:06.378109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753366086.569347      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753366086.623265      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Configuration
class Config:
    MODEL_NAME = "xlm-roberta-base"
    SEED = 42
    BATCH_SIZE = 8  # Reduced to avoid GPU memory issues
    GRADIENT_ACCUMULATION_STEPS = 2  # Adjusted for effective batch size of 16
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 10
    MAX_LENGTH = 256
    WEIGHT_DECAY = 0.01
    OUTPUT_DIR = "./xlm-roberta-industry-complete"
    LOGGING_STEPS = 10  # Increased frequency for better monitoring
    SAVE_TOTAL_LIMIT = 2
    INDUSTRY_MAP = {'Finance': 0, 'Technology': 1, 'Healthcare': 2, 'Energy': 3, 'Other': 4}
    REVERSE_INDUSTRY_MAP = {0: 'Finance', 1: 'Technology', 2: 'Healthcare', 3: 'Energy', 4: 'Other'}
    EARLY_STOPPING_PATIENCE = 3
    LR_SCHEDULER_TYPE = "cosine"
    WARMUP_RATIO = 0.1
    USE_CLASS_WEIGHTS = True
    DATA_PATH = "/kaggle/input/data-news-v1/data_news_v1.xlsx"

config = Config()

# Create output directory
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

In [4]:
# Set up logging with error handling
try:
    logging.basicConfig(
        filename=os.path.join(config.OUTPUT_DIR, 'training.log'),
        level=logging.DEBUG,  # Increased verbosity
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)
    logger.info("Logging initialized successfully")
except PermissionError:
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)
    logger.info("Fallback to console logging due to permission error for file: %s", os.path.join(config.OUTPUT_DIR, 'training.log'))
except Exception as e:
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)
    logger.info("Logging setup failed with error: %s. Fallback to console logging.", str(e))

# Set random seed
torch.manual_seed(config.SEED)
np.random.seed(config.SEED)

In [5]:
# Custom Trainer with class weights
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        labels = labels.long()
        
        if self.class_weights is not None:
            weights = torch.tensor(self.class_weights, device=logits.device, dtype=torch.float32)
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
            
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [6]:
# Load and explore dataset
def load_and_explore_data(file_path):
    logger.info("Loading dataset...")
    df = pd.read_excel(file_path)
    
    df['label'] = df['industry'].map(config.INDUSTRY_MAP)
    
    with open(os.path.join(config.OUTPUT_DIR, 'dataset_info.txt'), 'w') as f:
        f.write(f"Total samples: {len(df)}\n")
        f.write("\nIndustry distribution:\n")
        f.write(df['industry'].value_counts().to_string())
    
    plt.figure(figsize=(8, 5))
    class_dist = df['industry'].value_counts()
    sns.barplot(x=class_dist.index, y=class_dist.values)
    plt.title('Industry Distribution')
    plt.ylabel('Count')
    plt.savefig(os.path.join(config.OUTPUT_DIR, 'industry_distribution.png'))
    plt.close()
    
    df['text_length'] = df['summary'].apply(lambda x: len(x.split()))
    
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    sns.histplot(df['text_length'], bins=30)
    plt.title('Text Length Distribution')
    
    plt.subplot(1, 2, 2)
    sns.boxplot(x='industry', y='text_length', data=df)
    plt.title('Text Length by Industry')
    plt.savefig(os.path.join(config.OUTPUT_DIR, 'text_length_distribution.png'))
    plt.close()
    
    return df

df = load_and_explore_data(config.DATA_PATH)
df = df.dropna()

if config.USE_CLASS_WEIGHTS:
    class_weights = compute_class_weight(
        'balanced', 
        classes=np.unique(df['label']),
        y=df['label']
    )
    config.CLASS_WEIGHTS = class_weights.tolist()
    logger.info(f"Class weights: {config.CLASS_WEIGHTS}")
else:
    config.CLASS_WEIGHTS = None

tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

  with pd.option_context('mode.use_inf_as_na', True):


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
# Analyze token lengths
def analyze_token_lengths(texts, tokenizer, max_length):
    lengths = []
    for text in texts:
        tokens = tokenizer(text, truncation=True, max_length=max_length)["input_ids"]
        lengths.append(len(tokens))
    return lengths

token_lengths = analyze_token_lengths(df['summary'], tokenizer, config.MAX_LENGTH)

plt.figure(figsize=(10, 5))
sns.histplot(token_lengths, bins=30)
plt.title('Token Length Distribution')
plt.axvline(x=config.MAX_LENGTH, color='r', linestyle='--', label='Max Length')
plt.legend()
plt.savefig(os.path.join(config.OUTPUT_DIR, 'token_length_distribution.png'))
plt.close()

logger.info(f"Percentage of texts within max length: {sum(np.array(token_lengths) <= config.MAX_LENGTH) / len(token_lengths):.2%}")

  with pd.option_context('mode.use_inf_as_na', True):


In [8]:
# Preprocess function
def preprocess_function(examples):
    return tokenizer(
        examples["summary"],
        truncation=True,
        max_length=config.MAX_LENGTH,
        padding="max_length"
    )

# Split data
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=config.SEED,
    stratify=df['label']
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=config.SEED,
    stratify=temp_df['label']
)

train_dataset = Dataset.from_pandas(train_df[['summary', 'industry', 'label']])
val_dataset = Dataset.from_pandas(val_df[['summary', 'industry', 'label']])
test_dataset = Dataset.from_pandas(test_df[['summary', 'industry', 'label']])

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["summary", "industry"]
)

with open(os.path.join(config.OUTPUT_DIR, 'data_splits.txt'), 'w') as f:
    f.write(f"Train samples: {len(train_df)}\n")
    f.write(f"Validation samples: {len(val_df)}\n")
    f.write(f"Test samples: {len(test_df)}\n")

model = AutoModelForSequenceClassification.from_pretrained(
    config.MODEL_NAME,
    num_labels=5
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/2133 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    f1_micro = f1_score(labels, predictions, average='micro')
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')
    
    report = classification_report(
        labels,
        predictions,
        target_names=['Finance', 'Technology', 'Healthcare', 'Energy', 'Other'],
        output_dict=True
    )
    
    metrics = {
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'finance_precision': report['Finance']['precision'],
        'finance_recall': report['Finance']['recall'],
        'finance_f1': report['Finance']['f1-score'],
        'technology_precision': report['Technology']['precision'],
        'technology_recall': report['Technology']['recall'],
        'technology_f1': report['Technology']['f1-score'],
        'healthcare_precision': report['Healthcare']['precision'],
        'healthcare_recall': report['Healthcare']['recall'],
        'healthcare_f1': report['Healthcare']['f1-score'],
        'energy_precision': report['Energy']['precision'],
        'energy_recall': report['Energy']['recall'],
        'energy_f1': report['Energy']['f1-score'],
        'other_precision': report['Other']['precision'],
        'other_recall': report['Other']['recall'],
        'other_f1': report['Other']['f1-score']
    }
    
    logger.info(f"Evaluation metrics: {metrics}")
    
    return metrics

In [10]:
# Training arguments
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    run_name=f"xlm-roberta-industry-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=config.LOGGING_STEPS,
    save_steps=100,
    save_total_limit=config.SAVE_TOTAL_LIMIT,
    learning_rate=config.LEARNING_RATE,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=config.NUM_EPOCHS,
    weight_decay=config.WEIGHT_DECAY,
    lr_scheduler_type=config.LR_SCHEDULER_TYPE,
    warmup_ratio=config.WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    fp16=True,
    logging_dir="./logs",
    seed=config.SEED,
    report_to="none",  # Disable wandb logging
    log_level="debug"  # Increase logging verbosity
)

# Initialize Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=config.CLASS_WEIGHTS,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=config.EARLY_STOPPING_PATIENCE)]
)

  super().__init__(**kwargs)
Using auto half precision backend


In [11]:
# Check GPU memory before training
logger.info(f"GPU available: {torch.cuda.is_available()}")
logger.info(f"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB allocated, {torch.cuda.memory_reserved() / 1024**3:.2f} GB reserved")
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB allocated, {torch.cuda.memory_reserved() / 1024**3:.2f} GB reserved")

# Start training with error handling
try:
    logger.info("Starting training...")
    print("Starting training...")
    train_result = trainer.train()
except Exception as e:
    logger.error(f"Training failed with error: {str(e)}")
    print(f"Training failed with error: {str(e)}")
    raise e

# Save training metrics
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
logger.info(f"Training metrics: {metrics}")

# Save the final model
trainer.save_model(config.OUTPUT_DIR)
tokenizer.save_pretrained(config.OUTPUT_DIR)
logger.info(f"Model saved to {config.OUTPUT_DIR}")

# Save training arguments
trainer.save_state()

Currently training with a batch size of: 16
The following columns in the Training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2,133
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 670
  Number of trainable parameters = 278,047,493


GPU available: True
GPU memory: 1.04 GB allocated, 1.09 GB reserved
Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Finance Precision,Finance Recall,Finance F1,Technology Precision,Technology Recall,Technology F1,Healthcare Precision,Healthcare Recall,Healthcare F1,Energy Precision,Energy Recall,Energy F1,Other Precision,Other Recall,Other F1
100,0.7745,0.650076,0.726592,0.726592,0.687299,0.73904,0.888889,0.711111,0.790123,0.404762,0.85,0.548387,0.393939,1.0,0.565217,0.786885,0.888889,0.834783,0.881356,0.577778,0.697987
200,0.5079,0.564391,0.797753,0.797753,0.773005,0.800481,0.831461,0.822222,0.826816,0.6,0.75,0.666667,0.565217,1.0,0.722222,0.92,0.851852,0.884615,0.8125,0.722222,0.764706
300,0.2454,0.654852,0.808989,0.808989,0.782815,0.811345,0.829545,0.811111,0.820225,0.652174,0.75,0.697674,0.611111,0.846154,0.709677,0.938776,0.851852,0.893204,0.797753,0.788889,0.793296
400,0.1581,0.694694,0.805243,0.805243,0.786909,0.806089,0.789474,0.833333,0.810811,0.681818,0.75,0.714286,0.714286,0.769231,0.740741,0.92,0.851852,0.884615,0.802326,0.766667,0.784091
500,0.1251,0.762163,0.797753,0.797753,0.776154,0.798529,0.793478,0.811111,0.802198,0.75,0.75,0.75,0.642857,0.692308,0.666667,0.921569,0.87037,0.895238,0.766667,0.766667,0.766667
600,0.078,0.840022,0.808989,0.808989,0.807021,0.809465,0.77551,0.844444,0.808511,0.833333,0.75,0.789474,0.769231,0.769231,0.769231,0.94,0.87037,0.903846,0.772727,0.755556,0.764045


The following columns in the Evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 267
  Batch size = 16
Saving model checkpoint to ./xlm-roberta-industry-complete/checkpoint-100
Configuration saved in ./xlm-roberta-industry-complete/checkpoint-100/config.json
Model weights saved in ./xlm-roberta-industry-complete/checkpoint-100/model.safetensors
tokenizer config file saved in ./xlm-roberta-industry-complete/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./xlm-roberta-industry-complete/checkpoint-100/special_tokens_map.json
The following columns in the Evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0

In [12]:
# Evaluate on test set
logger.info("Evaluating on test set...")
print("Evaluating on test set...")
test_results = trainer.evaluate(
    tokenized_datasets["test"],
    metric_key_prefix="test"
)

# Save evaluation results
with open(os.path.join(config.OUTPUT_DIR, 'test_results.txt'), 'w') as f:
    for key, value in test_results.items():
        f.write(f"{key}: {value}\n")

logger.info("\n=== Test Results ===")
print("\n=== Test Results ===")
for key, value in test_results.items():
    if key.startswith("test_"):
        logger.info(f"{key[5:]}: {value}")
        print(f"{key[5:]}: {value}")

The following columns in the Evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 267
  Batch size = 16


Evaluating on test set...


early stopping required metric_for_best_model, but did not find eval_f1_macro so early stopping is disabled



=== Test Results ===
loss: 1.0174436569213867
accuracy: 0.8089887640449438
f1_micro: 0.8089887640449437
f1_macro: 0.7739784481623146
f1_weighted: 0.8092312359790224
finance_precision: 0.8444444444444444
finance_recall: 0.8539325842696629
finance_f1: 0.8491620111731844
technology_precision: 0.5
technology_recall: 0.5238095238095238
technology_f1: 0.5116279069767442
healthcare_precision: 0.7857142857142857
healthcare_recall: 0.8461538461538461
healthcare_f1: 0.8148148148148148
energy_precision: 0.9259259259259259
energy_recall: 0.9259259259259259
energy_f1: 0.9259259259259259
other_precision: 0.7816091954022989
other_recall: 0.7555555555555555
other_f1: 0.768361581920904
runtime: 3.829
samples_per_second: 69.732
steps_per_second: 4.44


In [13]:
# Sample predictions function
def predict_industry(text):
    inputs = tokenizer(
        text,
        max_length=config.MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(trainer.model.device)
    
    with torch.no_grad():
        outputs = trainer.model(**inputs)
    
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_class = torch.argmax(probs).item()
    
    return {
        "industry": config.REVERSE_INDUSTRY_MAP[pred_class],
        "confidence": probs[0][pred_class].item(),
        "probabilities": {
            "Finance": probs[0][0].item(),
            "Technology": probs[0][1].item(),
            "Healthcare": probs[0][2].item(),
            "Energy": probs[0][3].item(),
            "Other": probs[0][4].item()
        }
    }

# Test on some samples and save predictions
sample_texts = df.sample(5, random_state=config.SEED)["summary"].tolist()
with open(os.path.join(config.OUTPUT_DIR, 'sample_predictions.txt'), 'w') as f:
    for i, text in enumerate(sample_texts):
        result = predict_industry(text)
        actual = df[df['summary'] == text]['industry'].values[0]
        
        f.write(f"\n=== Sample {i+1} ===\n")
        f.write(f"\nText: {text}\n")
        f.write(f"\nPredicted Industry: {result['industry']} (Confidence: {result['confidence']:.2f})\n")
        f.write(f"Probabilities: {result['probabilities']}\n")
        f.write(f"Actual Industry: {actual}\n")
        
        logger.info(f"Sample {i+1} - Predicted: {result['industry']}, Actual: {actual}")
        print(f"\n=== Sample {i+1} ===")
        print(f"\nText: {text}")
        print(f"\nPredicted Industry: {result['industry']} (Confidence: {result['confidence']:.2f})")
        print(f"Probabilities: {result['probabilities']}")
        print(f"Actual Industry: {actual}")


=== Sample 1 ===

Text: Giá điện sinh hoạt đang bù chéo cho sản xuất, gây "méo mó" thị trường, đi ngược Nghị quyết 55. Người dân đang phải trả giá cao hơn để bù cho doanh nghiệp, đặc biệt FDI. Luật Điện lực sửa đổi mới chỉ quy định giảm dần bù chéo. Chuyên gia kiến nghị xóa bỏ bù chéo, cần cải cách giá điện theo cơ chế thị trường, minh bạch chi phí. Cần đẩy nhanh giá điện 2 thành phần, khuyến khích đầu tư tư nhân, nước ngoài vào ngành điện.


Predicted Industry: Energy (Confidence: 1.00)
Probabilities: {'Finance': 0.0009795400546863675, 'Technology': 0.0008036752115003765, 'Healthcare': 0.0008965007727965713, 'Energy': 0.9966979026794434, 'Other': 0.0006224566022865474}
Actual Industry: Energy

=== Sample 2 ===

Text: HoSE xem xét hủy niêm yết bắt buộc cổ phiếu KPF do vi phạm công bố thông tin và chậm nộp BCTC. KPF đang bị đình chỉ giao dịch, cảnh báo, kiểm soát vì chậm nộp BCTC soát xét, có ý kiến ngoại trừ và chậm nộp BCTC kiểm toán. KPF xin gia hạn công bố BCTC do khó khăn đối chiế

In [14]:
# Create zip file of all outputs
def zip_output_folder(output_dir):
    zip_path = os.path.join(output_dir, 'output.zip')
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(output_dir):
            for file in files:
                if file != 'output.zip':
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, output_dir)
                    zipf.write(file_path, arcname)
    return zip_path

output_zip = zip_output_folder(config.OUTPUT_DIR)
logger.info(f"Created zip file at: {output_zip}")

print("Training complete! Download the results:")
from IPython.display import FileLink
FileLink(output_zip)

logger.info("Training process completed successfully")

Training complete! Download the results:
