In [None]:
# Install required packages
!pip install scikit-learn joblib

# VNTC Text Classification Training

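This notebook trains several text classification models on the VNTC dataset. Each model is a TF-IDF pipeline (word n-gram counts followed by TF-IDF weighting) combined with either Logistic Regression or a linear-kernel SVM, evaluated over a grid of vocabulary sizes and n-gram ranges. Every trained pipeline is saved, and the one with the highest test accuracy is also saved as the main model.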

In [None]:
import json
import os
import time

import joblib
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

## Setup Paths

In [None]:
# Locations used throughout the notebook: where trained artifacts are written
# and where the VNTC split files are read from (both under ~/.underthesea).
output_folder = os.path.expanduser("~/.underthesea/models")
dataset_folder = os.path.expanduser("~/.underthesea/VNTC")

# Both split files sit directly inside the dataset folder.
train_file, test_file = (
    os.path.join(dataset_folder, split_name)
    for split_name in ("train.txt", "test.txt")
)

## Load Training Data

In [None]:
# Read and parse training data.
# Each usable line holds a label token, whitespace, then the document text;
# the label token may carry a '__label__' marker, which is removed.
print("Reading train.txt...")
label_prefix = '__label__'
X_train_raw = []
y_train = []
skipped_train_lines = 0
with open(train_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Split once on the first whitespace run: label token, then the text.
        parts = line.strip().split(maxsplit=1)
        if not parts:
            # Blank line: nothing to report, nothing to keep.
            continue
        if len(parts) == 1:
            # A label with no text (or text with no label) cannot be a sample;
            # count it so the drop is visible instead of silent.
            skipped_train_lines += 1
            continue
        label, text = parts
        # Remove the marker only where it is a prefix, so a label that happens
        # to contain '__label__' elsewhere is left intact.
        if label.startswith(label_prefix):
            label = label[len(label_prefix):]
        y_train.append(label)
        X_train_raw.append(text)

if skipped_train_lines:
    print(f"Skipped {skipped_train_lines} malformed line(s) in train.txt")

# Fail here with a clear message rather than later inside model fitting.
assert X_train_raw, f"No training samples could be parsed from {train_file}"

print(f"Train samples: {len(X_train_raw)}")
print(f"Unique labels: {len(set(y_train))}")
print(f"Labels: {sorted(set(y_train))[:10]}...")  # Show first 10 labels

## Load Test Data

In [None]:
# Read and parse test data.
# Each usable line holds a label token, whitespace, then the document text;
# the label token may carry a '__label__' marker, which is removed.
print("Reading test.txt...")
label_prefix = '__label__'
X_test_raw = []
y_test = []
skipped_test_lines = 0
with open(test_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Split once on the first whitespace run: label token, then the text.
        parts = line.strip().split(maxsplit=1)
        if not parts:
            # Blank line: nothing to report, nothing to keep.
            continue
        if len(parts) == 1:
            # Incomplete line; count it so the drop is visible instead of silent.
            skipped_test_lines += 1
            continue
        label, text = parts
        # Remove the marker only where it is a prefix.
        if label.startswith(label_prefix):
            label = label[len(label_prefix):]
        y_test.append(label)
        X_test_raw.append(text)

if skipped_test_lines:
    print(f"Skipped {skipped_test_lines} malformed line(s) in test.txt")

# Fail here with a clear message rather than later during evaluation.
assert X_test_raw, f"No test samples could be parsed from {test_file}"

# Labels absent from the training data can never be predicted, which lowers
# test accuracy for reasons unrelated to the model; surface them early.
unseen_test_labels = sorted(set(y_test) - set(y_train))
if unseen_test_labels:
    print(f"Warning: {len(unseen_test_labels)} test label(s) not present in train.txt: {unseen_test_labels[:10]}")

print(f"Test samples: {len(X_test_raw)}")

## Configure Experiments

In [None]:
# Search space for the experiment grid: every combination of vocabulary cap,
# n-gram span and classifier listed here is trained by the experiment loop.
model_version = "UTS-C1"
ngram_options = [(1, 2), (1, 3)]
max_features_options = [10000, 20000, 30000]

# Both estimators use the same fixed seed.
# NOTE(review): probability=True is documented by scikit-learn as adding
# training cost to SVC; confirm probability outputs are actually needed.
logistic_regression = LogisticRegression(random_state=42, max_iter=1000)
linear_svc = SVC(probability=True, random_state=42, kernel='linear')
classifier_options = [
    ('LogisticRegression', logistic_regression),
    ('SVC', linear_svc),
]

# Accumulator for the experiment loop: one metrics dict per configuration.
results = list()

## Run Experiments

In [None]:
# Run experiments with different configurations.
# For every (max_features, ngram_range, classifier) combination this cell fits
# a CountVectorizer -> TfidfTransformer -> classifier pipeline, scores it on
# the train and test sets, appends the metrics to `results`, prints a short
# classification report, and saves the fitted pipeline to `output_folder`
# as <config_name>.pkl.

# Empty the list in place so re-running this cell does not append duplicate
# rows to results left over from a previous run.
results.clear()

# Loop-invariant setup: neither the output folder nor the report labels
# depend on the configuration being trained.
os.makedirs(output_folder, exist_ok=True)
unique_labels = sorted(set(y_train))[:5]

for max_features in max_features_options:
    for ngram_range in ngram_options:
        for clf_name, classifier in classifier_options:
            config_name = f"{model_version}_feat{max_features//1000}k_ngram{ngram_range[0]}-{ngram_range[1]}_{clf_name}"
            print("\n" + "="*60)
            print(f"Training: {config_name}")
            print("="*60)

            # Create TF-IDF pipeline
            print(f"Creating pipeline with max_features={max_features}, ngram_range={ngram_range}, classifier={clf_name}")
            text_clf = Pipeline([
                ('vect', CountVectorizer(max_features=max_features, ngram_range=ngram_range)),
                ('tfidf', TfidfTransformer(use_idf=True)),
                # clone() gives this pipeline its own unfitted estimator;
                # otherwise every pipeline would share, and refit, the single
                # instance stored in classifier_options.
                ('clf', clone(classifier))
            ])

            # Train the model
            print("Training model...")
            start_time = time.perf_counter()
            text_clf.fit(X_train_raw, y_train)
            train_time = time.perf_counter() - start_time
            print(f"Training completed in {train_time:.2f} seconds")

            # Evaluate on training set
            print("Evaluating on training set...")
            train_predictions = text_clf.predict(X_train_raw)
            train_accuracy = accuracy_score(y_train, train_predictions)
            print(f"Training accuracy: {train_accuracy:.4f}")

            # Evaluate on test set. The timer is stopped right after predict()
            # so prediction_time does not include the metric computation.
            print("Evaluating on test set...")
            start_time = time.perf_counter()
            test_predictions = text_clf.predict(X_test_raw)
            prediction_time = time.perf_counter() - start_time
            test_accuracy = accuracy_score(y_test, test_predictions)
            print(f"Test accuracy: {test_accuracy:.4f}")
            print(f"Prediction time: {prediction_time:.2f} seconds")

            # Store results
            result = {
                'model_version': model_version,
                'config_name': config_name,
                'max_features': max_features,
                'ngram_range': ngram_range,
                'classifier': clf_name,
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy,
                'train_time': train_time,
                'prediction_time': prediction_time
            }
            results.append(result)

            # Show classification report for first 5 classes (alphabetical
            # order of the training labels). The text form is requested so it
            # can be printed under the heading.
            print("\nClassification Report (first 5 classes):")
            report = classification_report(y_test, test_predictions, labels=unique_labels, zero_division=0)
            print(report)

            # Save the model with configuration name
            model_filename = os.path.join(output_folder, f'{config_name}.pkl')
            joblib.dump(text_clf, model_filename)
            print(f"Model saved to {model_filename}")

## Display Experiment Summary

In [None]:
# Print summary of all experiments, best test accuracy first.
# The header and the rows use the same column widths so the table stays
# aligned, and the rules are sized to the header rather than a fixed width.
summary_header = f"{'Config':<50} {'Train Acc':<10} {'Test Acc':<10} {'Train Time':<12} {'Pred Time':<10}"
rule_width = len(summary_header)
print("\n" + "=" * rule_width)
print("EXPERIMENT SUMMARY")
print("=" * rule_width)
print(summary_header)
print("-" * rule_width)
for result in sorted(results, key=lambda x: x['test_accuracy'], reverse=True):
    # Format the timings first so the trailing "s" is padded with the number.
    train_time_text = f"{result['train_time']:.2f}s"
    prediction_time_text = f"{result['prediction_time']:.2f}s"
    print(
        f"{result['config_name']:<50} "
        f"{result['train_accuracy']:<10.4f} "
        f"{result['test_accuracy']:<10.4f} "
        f"{train_time_text:<12} "
        f"{prediction_time_text:<10}"
    )

## Save Best Model

In [None]:
# Promote the configuration with the highest test accuracy to the main model.
# NOTE(review): the winner is selected on test accuracy, so that same score is
# an optimistic estimate of its performance on unseen data.
def _test_accuracy_of(entry):
    """Return the test accuracy recorded for one experiment entry."""
    return entry['test_accuracy']


best_result = max(results, key=_test_accuracy_of)
best_config_name = best_result['config_name']
best_test_accuracy = best_result['test_accuracy']
print(f"\nBest configuration: {best_config_name} with test accuracy: {best_test_accuracy:.4f}")

# Reload the winning pipeline from its per-configuration pickle and write it
# again under the fixed main-model filename.
main_model_path = os.path.join(output_folder, 'vntc_classifier.pkl')
best_model_path = os.path.join(output_folder, f"{best_config_name}.pkl")
best_model = joblib.load(best_model_path)
joblib.dump(best_model, main_model_path)
print(f"Best model saved as main model to {main_model_path}")

## Save Results and Metadata

In [None]:
# Persist the per-configuration metrics as JSON, named after the model version.
results_file = os.path.join(output_folder, f'{model_version}_results.json')
with open(results_file, 'w') as results_out:
    results_out.write(json.dumps(results, indent=2))
print(f"Results saved to {results_file}")

# Write the distinct training labels, sorted, one per line, for reference.
label_mapping_filename = os.path.join(output_folder, 'label_mapping.txt')
label_lines = [f"{label}\n" for label in sorted(set(y_train))]
with open(label_mapping_filename, 'w', encoding='utf-8') as mapping_out:
    mapping_out.writelines(label_lines)
print(f"Label mapping saved to {label_mapping_filename}")