# **Classification of app reviews for requirements engineering using deep learning models**

In [None]:
# Install the Hugging Face `datasets` package (IPython shell escape; runs pip
# in the notebook's environment). It provides the `Dataset` class imported
# by this notebook.
!pip install datasets

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): later cells use relative 'drive/MyDrive/...' paths, which
# presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import datetime
import gc
import os
import sys
import time

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch import amp
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader

from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup
)


### **Data Preparation**

In [None]:
# Load dataset
# Path (relative to the working directory) of the Excel workbook holding the
# reviews; later cells read its 'label', 'review' and 'new_label' columns.
file_path = 'drive/MyDrive/ColabNotebooks/3yr_project/multi-class/unbalanced/gpt_unbalanced_16000.xlsx'
# Read the workbook into a pandas DataFrame.
dataframe = pd.read_excel(file_path)


In [None]:
# Dataset splitting
# Stratified 80/10/10 partition: first hold out 20% of the rows, then cut that
# hold-out in half to obtain the validation and test sets. Both cuts stratify
# on the 'label' column and use a fixed seed so the split is reproducible.
train_df, temp_df = train_test_split(
    dataframe,
    test_size=0.2,
    random_state=42,
    stratify=dataframe['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

# Wrap each pandas frame in a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)


### **Model Configuration**

In [None]:
# Model Configuration
# One entry per model key; run_model() looks its hyper-parameters up here.
MODEL_CONFIGS = {
    'llama': dict(
        # Alternative checkpoint: 'meta-llama/Meta-Llama-3-8B'
        MODEL_NAME='meta-llama/Llama-3.2-1B',
        NUM_LABELS=4,         # size of the classification head
        BATCH_SIZE=16,
        LEARNING_RATE=5e-6,
        WEIGHT_DECAY=0.01,
        EPOCHS=3,             # maximum number of training epochs
        PATIENCE=3,           # non-improving epochs tolerated before early stop
    ),
}

### **Data Preprocessing**

In [None]:
# Data preprocessing
def preprocess_data(example, tokenizer):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        example: Batch mapping with a 'review' sequence and a 'new_label'
            sequence (as passed by ``Dataset.map(..., batched=True)``).
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=..., truncation=..., max_length=...)``.

    Returns:
        The tokenizer's output with an extra 'labels' entry copied from
        ``example['new_label']``.
    """
    # Anything that is not a string (e.g. a missing cell) becomes an empty text.
    cleaned_reviews = []
    for review in example['review']:
        cleaned_reviews.append(review if isinstance(review, str) else "")

    # Pad/truncate every review to exactly 256 tokens.
    encoded = tokenizer(
        cleaned_reviews,
        padding='max_length',
        truncation=True,
        max_length=256,
    )
    encoded['labels'] = example['new_label']
    return encoded

### **Model Loading and Saving**

In [None]:
# Load tokenizer and model
def load_model_and_tokenizer(model_name, num_labels):
    """Load a tokenizer and a sequence-classification model by name.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        pad token, and the model config's ``pad_token_id`` matches it.
    """
    # NOTE: trust_remote_code=True lets the checkpoint repository run its own
    # Python code during loading.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # When the tokenizer defines no pad token, reuse EOS so that
    # padding='max_length' works.
    pad_missing = tokenizer.pad_token is None
    if pad_missing:
        tokenizer.pad_token = tokenizer.eos_token

    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        trust_remote_code=True,
    )
    # Keep the model's notion of the padding id in sync with the tokenizer.
    classifier.config.pad_token_id = tokenizer.pad_token_id

    return classifier, tokenizer

In [None]:
# Define a function to save the model
def save_model(model, tokenizer, path):
    """Persist the model and tokenizer to ``path``, reporting success.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        path: Destination directory.

    Returns:
        True when both artifacts were written, False if any exception was
        raised (the error is printed, not propagated, so the caller can fall
        back to another location).
    """
    try:
        # Model first, then tokenizer — both into the same directory.
        for artifact in (model, tokenizer):
            artifact.save_pretrained(path)
    except Exception as err:
        print(f"Model saving failed: {err}")
        return False

    print(f"Model and tokenizer saved to {path}")
    return True

### **Model Training**

In [None]:
# Training function
def train_model(model, tokenizer, train_dataloader, val_dataloader,
                optimizer, scheduler, device, loss_fn, epochs,
                patience, save_path, backup_path, model_type):
    train_losses, val_losses = [], []
    epochs_no_improve = 0
    best_model_path = None
    best_val_loss = float('inf') # set to infinite at beginning
    scaler = amp.GradScaler()

    # Training and Validation Loop
    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0

        for batch in train_dataloader:

            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            # Forward pass
            with amp.autocast(device_type='cuda'):
                outputs = model(**inputs)
                loss = loss_fn(outputs.logits, labels)

            # Backward pass
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()

        # Calculate average training loss
        avg_train_loss = total_loss / len(train_dataloader)
        train_losses.append(avg_train_loss)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:

                inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
                labels = batch['labels'].to(device)

                # Forward pass
                outputs = model(**inputs)
                loss = loss_fn(outputs.logits, labels)

                val_loss += loss.item()

        # Calculate average validation loss
        avg_val_loss = val_loss / len(val_dataloader)
        val_losses.append(avg_val_loss)

        # Memory Cleanup
        torch.cuda.empty_cache()
        gc.collect()

        # Print epoch metrics
        print(f"Epoch {epoch + 1:}")
        print(f"Training Loss:   {avg_train_loss}")
        print(f"Validation Loss: {avg_val_loss}")

        # Save the best model based on validation loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            saved = save_model(model, tokenizer, save_path)
            if saved:
                best_model_path = save_path
            else:
                backup_saved = save_model(model, tokenizer, backup_path)
                if backup_saved:
                    best_model_path = backup_path
                else:
                    print("Failed to save the model to both primary and backup paths. Stopping training.")
                    break
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping due to no improvement in validation loss.")
                break

    return train_losses, val_losses, best_model_path

### **Model Evaluation**

In [None]:
def evaluate_model(dataloader, device, target_names, best_model_path, report_path):
    """Evaluate a saved checkpoint and write a classification report.

    Args:
        dataloader: Iterable of dict batches containing a 'labels' key.
        device: Device the reloaded model and inputs are moved to.
        target_names: Class display names; position ``i`` names label id ``i``.
        best_model_path: Directory of the saved model. When falsy, evaluation
            is skipped with a message.
        report_path: CSV path the report is written to.

    Returns:
        None. The report and macro F1 are printed, and the report is saved.
    """
    if not best_model_path:
        # No checkpoint was produced, so there is nothing to evaluate.
        print("No saved model available; skipping evaluation.")
        return

    model = AutoModelForSequenceClassification.from_pretrained(best_model_path)
    model.to(device)
    model.eval()

    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            logits = model(**inputs).logits
            all_preds.extend(logits.argmax(dim=1).cpu().tolist())
            # Labels are only needed on the host, so they never visit the device.
            all_labels.extend(batch['labels'].tolist())

    # Pin the label ids explicitly: without `labels`, classification_report
    # raises when a class is missing from both the labels and the predictions,
    # because the class count no longer matches target_names.
    report_dict = classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(target_names))),
        target_names=target_names,
        output_dict=True,
    )
    report_df = pd.DataFrame(report_dict).transpose().round(4)

    print("Classification Report:")
    print(report_df)

    # Save the classification report
    report_df.to_csv(report_path, float_format='%.4f')
    print(f"Classification report saved to {report_path}")

    macro_f1 = report_dict["macro avg"]["f1-score"]
    print(f"\nMacro F1 Score: {macro_f1:.4f}")

### **Plot Loss Curve**

In [None]:
# Draw the loss curve
def plot_loss(train_losses, val_losses):
    """Plot per-epoch training and validation loss on one set of axes.

    Args:
        train_losses: Sequence of average training losses, one per epoch.
        val_losses: Sequence of average validation losses, one per epoch.
    """
    curves = (
        (train_losses, 'Training Loss'),
        (val_losses, 'Validation Loss'),
    )
    for losses, legend_label in curves:
        # Epochs are numbered from 1 on the x-axis.
        epoch_axis = range(1, len(losses) + 1)
        plt.plot(epoch_axis, losses, label=legend_label)

    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss Curve')
    plt.legend(loc='upper right')
    plt.show()

### **Record Training Time**

In [None]:
def record_training_time(start_time, end_time, model_key):
    """Append the elapsed time for ``model_key`` to training_times.txt and print it.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.
        model_key: Label identifying the model in the log line.
    """
    elapsed = end_time - start_time
    log_line = f"{model_key} Training Time: {elapsed:.2f} seconds\n"

    # Append mode keeps the timings of earlier runs.
    with open("training_times.txt", "a") as log_file:
        log_file.write(log_line)

    print(f"Total Training Time for {model_key}: {elapsed:.2f} seconds")

### **Run Model**

In [None]:
def run_model(model_key, train_dataset, val_dataset, test_dataset):
    """Fine-tune, evaluate and time the model registered under ``model_key``.

    Loads the model and tokenizer from ``MODEL_CONFIGS[model_key]``, tokenizes
    the three splits, trains with AdamW and a warm-up + cosine schedule, plots
    the loss curves, evaluates the best checkpoint on the test split and
    records the elapsed time.

    Args:
        model_key: Key into ``MODEL_CONFIGS``.
        train_dataset: Dataset with 'review' and 'new_label' columns.
        val_dataset: Validation split, same schema.
        test_dataset: Test split, same schema.

    Raises:
        KeyError: If ``model_key`` is not present in ``MODEL_CONFIGS``.
    """
    start_time = time.time()
    config = MODEL_CONFIGS[model_key]

    model, tokenizer = load_model_and_tokenizer(config['MODEL_NAME'], config['NUM_LABELS'])

    # Apply preprocessing to train, validation, and test datasets
    map_kwargs = {'batched': True, 'fn_kwargs': {'tokenizer': tokenizer}}
    train_dataset = train_dataset.map(preprocess_data, **map_kwargs)
    val_dataset = val_dataset.map(preprocess_data, **map_kwargs)
    test_dataset = test_dataset.map(preprocess_data, **map_kwargs)

    # Expose only the model inputs and labels, as PyTorch tensors
    tensor_columns = ['input_ids', 'attention_mask', 'labels']
    for split in (train_dataset, val_dataset, test_dataset):
        split.set_format(type='torch', columns=tensor_columns)

    # Set device to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define the loss function and optimizer for multi-class tasks
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=config['LEARNING_RATE'], weight_decay=config['WEIGHT_DECAY'])

    # Create DataLoaders for training, validation and testing
    train_dataloader = DataLoader(train_dataset, batch_size=config['BATCH_SIZE'], shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=config['BATCH_SIZE'])
    test_dataloader = DataLoader(test_dataset, batch_size=config['BATCH_SIZE'])

    # The scheduler is stepped once per *batch*, so the schedule length must
    # be counted in batches (len(dataloader)), not samples (len(dataset)).
    # Counting samples stretches the schedule by a factor of BATCH_SIZE, and
    # the 10% warm-up then outlasts the entire training run.
    total_steps = len(train_dataloader) * config['EPOCHS']
    warmup_steps = int(0.1 * total_steps)  # warm-up for the first 10% of training steps

    # Linear warm-up followed by half-cosine decay.
    # (get_linear_schedule_with_warmup accepts the same step arguments if a
    # linear decay is preferred.)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
        num_cycles=0.5
    )

    # Directory to save the model; the timestamp keeps runs from overwriting each other
    save_dir = os.path.join(".", "drive/MyDrive/ColabNotebooks/3yr_project/multi-class/models", model_key)
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    save_path = os.path.join(save_dir, f"best_model_{timestamp}")
    backup_path = os.path.join(save_dir, f"backup_model_{timestamp}")

    train_losses, val_losses, best_model_path = train_model(
        model,
        tokenizer,
        train_dataloader,
        val_dataloader,
        optimizer,
        scheduler,
        device,
        loss_fn,
        epochs=config['EPOCHS'],
        patience=config['PATIENCE'],
        save_path=save_path,
        backup_path=backup_path,
        model_type=model_key
    )

    # Stop the clock before plotting/evaluation so they are not counted.
    end_time = time.time()

    plot_loss(train_losses, val_losses)

    # Save classification report
    report_path = os.path.join(save_dir, f"classification_report_{timestamp}.csv")

    # Evaluate on the test set
    evaluate_model(
        test_dataloader,
        device,
        target_names=['bug report', 'feature request', 'rating', 'user experience'],
        best_model_path=best_model_path,
        report_path=report_path
    )

    record_training_time(start_time, end_time, model_key)

# Train models

In [None]:
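# Authenticate with Hugging Face in this cell using your own access token
# before training (presumably required to download the Llama checkpoint — confirm).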

In [None]:
# Train Llama model
# Runs the full pipeline (tokenize, fine-tune, plot losses, evaluate on the
# test split, record timing) for the 'llama' entry of MODEL_CONFIGS.
run_model('llama', train_dataset, val_dataset, test_dataset)