## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import torch
import warnings
warnings.filterwarnings('ignore')

## Check Device (CPU/GPU)

In [2]:
# Pick the CUDA device when one is visible to torch, otherwise fall back to CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"Using device: {device}")

# Report the name of the first GPU only when CUDA is actually usable.
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cpu


## Load Transcript Data

Load the transcripts generated by Whisper from the previous notebook.

In [3]:
def load_transcripts(folder_path, label):
    """
    Read every ``*.txt`` transcript in a folder and tag it with a class label.

    Files are visited in sorted path order. Each file is decoded as UTF-8 and
    stripped of surrounding whitespace; files that are empty after stripping
    are skipped. A file that cannot be read is reported on stdout and skipped,
    so one bad file does not abort the whole load.

    Args:
        folder_path: Path to folder containing transcript .txt files
        label: 0 for bad, 1 for good

    Returns:
        List of dicts with 'text' and 'label' keys. Empty list if the folder
        does not exist (a warning is printed in that case).
    """
    source_dir = Path(folder_path)

    # Guard clause: nothing to load when the folder is missing.
    if not source_dir.exists():
        print(f"‚ö†Ô∏è  Folder not found: {folder_path}")
        return []

    records = []
    for transcript_path in sorted(source_dir.glob("*.txt")):
        try:
            content = transcript_path.read_text(encoding='utf-8').strip()
        except Exception as err:
            # Best-effort load: report the failure and move on to the next file.
            print(f"‚ùå Error reading {transcript_path.name}: {err}")
            continue

        # Whitespace-only transcripts carry no signal for the classifier.
        if not content:
            continue

        records.append(dict(text=content, label=label))

    return records

# Good answers are the positive class (label=1)
good_data = load_transcripts("../data/good_answer_transcripts", label=1)
good_count = len(good_data)
print(f"‚úÖ Loaded {good_count} good answers")

# Bad answers are the negative class (label=0)
bad_data = load_transcripts("../data/bad_answer_transcripts", label=0)
bad_count = len(bad_data)
print(f"‚ùå Loaded {bad_count} bad answers")

# Single list of labelled samples: good answers first, then bad answers
all_data = [*good_data, *bad_data]
total_count = len(all_data)
print(f"\nüìä Total samples: {total_count}")

‚úÖ Loaded 33 good answers
‚ùå Loaded 13 bad answers

üìä Total samples: 46


## Prepare Dataset

Split data into training and validation sets.

In [4]:
# One row per transcript, with 'text' and 'label' columns
df = pd.DataFrame(all_data)

# Show how many samples each class has
print("Class distribution:")
label_counts = df['label'].value_counts()
print(label_counts)
good_total = (df['label'] == 1).sum()
print(f"\nGood answers: {good_total}")
bad_total = (df['label'] == 0).sum()
print(f"Bad answers: {bad_total}")

# 80/20 train/validation split with a fixed seed; stratifying on the label
# keeps the good/bad ratio the same in both splits.
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
split_parts = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
train_texts, val_texts, train_labels, val_labels = split_parts

train_size = len(train_texts)
print(f"\nTrain set: {train_size} samples")
val_size = len(val_texts)
print(f"Validation set: {val_size} samples")

Class distribution:
label
1    33
0    13
Name: count, dtype: int64

Good answers: 33
Bad answers: 13

Train set: 36 samples
Validation set: 10 samples


## Load Hebrew BERT Model (AlephBERT)

In [5]:
# AlephBERT: a Hebrew BERT-base checkpoint from Hugging Face.
# (It is a full BERT model, not a DistilBERT variant.)
model_name = "onlplab/alephbert-base"
# Alternative: "avichr/heBERT" or "imvladikon/alephbertgimmel-base-512"

print(f"Loading model: {model_name}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The checkpoint has no classification head, so the pooler/classifier weights
# are freshly (randomly) initialized on load. Seed torch's RNG first so that
# this initialization, and therefore the fine-tuning run, is reproducible
# under Restart & Run All. 42 matches the random_state used for the split.
torch.manual_seed(42)

# Load model for binary classification (2 labels: bad=0, good=1)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

print("‚úÖ Model and tokenizer loaded successfully!")

Loading model: onlplab/alephbert-base


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model and tokenizer loaded successfully!


## Tokenize Data

In [6]:
# Wrap the Python lists in HuggingFace Dataset objects, one per split.
# Both share the same two columns: raw transcript text and its integer label.
train_columns = dict(text=train_texts, label=train_labels)
train_dataset = Dataset.from_dict(train_columns)

val_columns = dict(text=val_texts, label=val_labels)
val_dataset = Dataset.from_dict(val_columns)

# Tokenize function
def tokenize_function(examples):
    """
    Tokenize a batch of examples for the classifier.

    Sequences are truncated to at most 512 tokens but deliberately NOT padded
    here: padding every sample to 512 tokens up front makes each batch as
    expensive as the longest possible input and defeats the per-batch
    (dynamic) padding performed by DataCollatorWithPadding at training time.

    Args:
        examples: Batch mapping with a 'text' key holding a list of strings
            (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=512
    )

# Run the tokenizer over both splits in batched mode (train first, then val)
print("Tokenizing datasets...")
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Expose only the listed columns, as PyTorch tensors
tensor_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_dataset, val_dataset):
    tokenized_split.set_format('torch', columns=tensor_columns)

print("‚úÖ Tokenization complete!")

Tokenizing datasets...


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

‚úÖ Tokenization complete!


## Configure Training

In [8]:
# Hyperparameters and bookkeeping options for the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir="../models/customer-support-classifier",
    logging_dir="../models/logs",
    # Optimization settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best 'accuracy' when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,  # Cap the number of checkpoints kept on disk
    # Logging
    logging_steps=10,
    report_to="none",  # Disable wandb/tensorboard
)

# Echo the key settings so the run is self-describing
config_summary = [
    "Training configuration:",
    f"  Epochs: {training_args.num_train_epochs}",
    f"  Batch size: {training_args.per_device_train_batch_size}",
    f"  Learning rate: {training_args.learning_rate}",
    f"  Output: {training_args.output_dir}",
]
print("\n".join(config_summary))

Training configuration:
  Epochs: 3
  Batch size: 8
  Learning rate: 2e-05
  Output: ../models/customer-support-classifier


## Define Evaluation Metrics

In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Turn a Trainer prediction object into a dict of validation metrics.

    The predicted class is the argmax over the last axis of
    ``pred.predictions``; it is compared against ``pred.label_ids``.
    Precision, recall and F1 use ``average='binary'``.

    Returns:
        Dict with 'accuracy', 'f1', 'precision' and 'recall' keys.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf_scores[0], prf_scores[1], prf_scores[2]
    accuracy = accuracy_score(y_true, y_pred)

    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)

## Initialize Trainer

In [10]:
# Collator that pads the examples of each batch to a common length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Gather everything the Trainer needs, then build it in one call
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

print("‚úÖ Trainer initialized!")

‚úÖ Trainer initialized!


## Train Model

This may take several minutes depending on hardware.

In [11]:
print("Starting training...\n")
train_result = trainer.train()

# Summary banner followed by the final loss and the wall-clock runtime
banner = "=" * 50
print("\n" + banner)
print("TRAINING COMPLETE!")
print(banner)
final_loss = train_result.training_loss
print(f"Train loss: {final_loss:.4f}")
runtime_seconds = train_result.metrics['train_runtime']
print(f"Training time: {runtime_seconds:.2f}s")

Starting training...



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Evaluate Model

In [None]:
print("Evaluating on validation set...\n")
eval_result = trainer.evaluate()

separator = "=" * 50
print(separator)
print("EVALUATION RESULTS")
print(separator)

# (prefix, metric key) pairs; each prefix carries its own padding so the
# printed values line up in a column.
report_rows = [
    ("Accuracy:  ", 'eval_accuracy'),
    ("Precision: ", 'eval_precision'),
    ("Recall:    ", 'eval_recall'),
    ("F1 Score:  ", 'eval_f1'),
    ("Loss:      ", 'eval_loss'),
]
for prefix, metric_key in report_rows:
    print(f"{prefix}{eval_result[metric_key]:.4f}")

## Save Fine-tuned Model

In [None]:
# Destination for the final model; created (with parents) if it is missing
output_dir = "../models/customer-support-classifier-final"
final_model_path = Path(output_dir)
final_model_path.mkdir(parents=True, exist_ok=True)

# Write the model via the trainer and the tokenizer next to it, so the
# folder can be loaded on its own later
print(f"Saving model to: {output_dir}")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("\n‚úÖ Model saved successfully!")

# Print copy-pasteable reload instructions
reload_hints = (
    "\nTo load this model later:",
    f"  tokenizer = AutoTokenizer.from_pretrained('{output_dir}')",
    f"  model = AutoModelForSequenceClassification.from_pretrained('{output_dir}')",
)
for hint in reload_hints:
    print(hint)

## Test Prediction

Try the model on a sample text.

In [None]:
def predict_quality(text):
    """
    Predict if customer support answer is good (1) or bad (0).

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Transcript of a single answer. Input longer than 512 tokens is
            truncated.

    Returns:
        Dict with:
            'prediction': 'Good Answer' or 'Bad Answer'
            'confidence': softmax probability of the predicted class
            'class': predicted class index (1 = good, otherwise bad)
    """
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # The tokenizer returns CPU tensors, while the model may have been moved
    # to an accelerator (e.g. during training). Put the inputs on the model's
    # device to avoid a device-mismatch error in the forward pass.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Get prediction (eval mode disables dropout; no gradients are needed)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # A single text was passed in, so row 0 holds its class probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
    predicted_class = probabilities.argmax(dim=-1).item()
    confidence = probabilities[predicted_class].item()

    return {
        'prediction': 'Good Answer' if predicted_class == 1 else 'Bad Answer',
        'confidence': confidence,
        'class': predicted_class
    }

# Smoke-test the classifier on the first validation transcript, if there is one
if val_texts:
    test_text = val_texts[0]
    result = predict_quality(test_text)

    # Show a 100-character preview of the text next to the model's verdict
    preview = test_text[:100]
    print("Test Prediction:")
    print(f"Text: {preview}...")
    print(f"Prediction: {result['prediction']}")
    print(f"Confidence: {result['confidence']:.2%}")

    # Ground-truth label for the same example, for comparison
    actual_name = 'Good' if val_labels[0] == 1 else 'Bad'
    print(f"Actual label: {actual_name}")