<a href="https://colab.research.google.com/github/tubagokhan/ADGM/blob/main/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies into the current runtime via pip.
!pip install transformers
# Same package again with the [torch] extra requested
# (presumably to pull in the PyTorch-related extras — confirm against the transformers install docs).
!pip install transformers[torch]
# -U asks pip to upgrade accelerate if a version is already installed.
!pip install accelerate -U
!pip install tqdm



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the JSON files opened later in this
# notebook are read from paths underneath this mount point.
drive.mount(
    '/content/drive'
)

In [None]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Fine-tune bert-base-uncased as a phrase classifier.
#
# Pipeline: load labelled phrases from JSON -> train/val/test split ->
# tokenize -> encode labels -> train with gradient accumulation and
# accuracy-based early stopping -> restore the best validation weights.
# Names left defined for later cells: model, tokenizer, label_encoder,
# test_encodings, test_labels_encoded (plus the other intermediates).

# Load the labelled data; each record is read via its "label_" and "phrase" keys.
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/COBS_All_Labels_Phrases_Distinct.json', 'r', encoding='utf-8-sig') as file:
    data = json.load(file)

# Extract labels and phrases
labels = [item["label_"] for item in data]
phrases = [item["phrase"] for item in data]

# Hold out 20% of the data for testing ...
train_texts, test_texts, train_labels, test_labels = train_test_split(
    phrases, labels, test_size=0.2, random_state=42
)

# ... then carve 10% of the remaining training data off as a validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

# Fine-tuning hyperparameters (declared before tokenization so that
# max_seq_length is actually applied to the encodings).
batch_size = 64
max_seq_length = 128
learning_rate = 1e-5
num_epochs = 50  # Upper bound; early stopping normally ends training sooner
gradient_accumulation_steps = 4

# Load BERT tokenizer and encode the data, truncating to max_seq_length tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer_kwargs = dict(
    padding=True,
    truncation=True,
    max_length=max_seq_length,
    return_tensors="pt",
)
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
val_encodings = tokenizer(val_texts, **tokenizer_kwargs)
test_encodings = tokenizer(test_texts, **tokenizer_kwargs)

# Fit the label encoder on the FULL label vocabulary. Fitting on the training
# split alone makes transform() raise on any label that only occurs in the
# validation/test split, and lets the encoder's class count drift away from
# the size of the model's output layer.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
train_labels_encoded = label_encoder.transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

unique_labels = set(labels)
num_classes = len(label_encoder.classes_)
print(f"Number of unique classes: {num_classes}")

# Create BERT-based text classification model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)

# Create data loaders
train_dataset = TensorDataset(
    train_encodings["input_ids"],
    train_encodings["attention_mask"],
    torch.tensor(train_labels_encoded, dtype=torch.long),
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation is batched as well so the whole split never has to fit through
# the model in a single forward pass.
val_dataset = TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"])
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Optimization and loss function
# NOTE(review): `AdamW` is the name imported from `transformers` at the top of
# this cell; confirm it is still provided by the installed transformers
# version (torch.optim.AdamW is the PyTorch-native alternative).
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Not used by the loop below: the model returns its own loss when `labels`
# is passed. Kept so the name remains defined for any other cell.
loss_fn = torch.nn.CrossEntropyLoss()

# Early stopping parameters
patience = 3  # Number of epochs to wait for improvement
best_val_accuracy = 0
no_improvement_count = 0
best_state = None  # Copy of the weights from the best validation epoch

# Training loop
steps_per_epoch = len(train_loader)
for epoch in range(num_epochs):
    # Re-enter training mode every epoch: the validation pass below switches
    # the model to eval mode, which would otherwise disable dropout for all
    # epochs after the first.
    model.train()
    optimizer.zero_grad()

    total_loss = 0
    tqdm_data = tqdm(enumerate(train_loader), total=steps_per_epoch)
    for step, batch in tqdm_data:
        # `batch_labels` (not `labels`) so the dataset's label list is not overwritten.
        input_ids, attention_mask, batch_labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Scale so the accumulated gradient is an average over the group
        # rather than a sum of gradient_accumulation_steps batch gradients.
        (loss / gradient_accumulation_steps).backward()

        # Step on every full group, and also on the last batch of the epoch so
        # a trailing partial group is applied instead of leaking into the next
        # epoch (or being dropped after the final one).
        is_group_end = (step + 1) % gradient_accumulation_steps == 0
        is_last_batch = (step + 1) == steps_per_epoch
        if is_group_end or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Report the unscaled loss so the displayed value is a per-batch average.
        total_loss += loss.item()
        avg_loss = total_loss / (step + 1)

        tqdm_data.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Evaluation on validation data after each epoch
    model.eval()
    val_batch_predictions = []
    with torch.no_grad():
        for val_input_ids, val_attention_mask in val_loader:
            val_logits = model(val_input_ids, attention_mask=val_attention_mask).logits
            val_batch_predictions.append(torch.argmax(val_logits, dim=1))
    val_predicted_labels = torch.cat(val_batch_predictions).cpu().numpy()
    val_accuracy = accuracy_score(val_labels_encoded, val_predicted_labels)
    print(f"Validation Accuracy (Epoch {epoch+1}): {val_accuracy:.4f}")

    # Implement early stopping
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        no_improvement_count = 0
        # Snapshot the weights so the best epoch can be restored afterwards.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        no_improvement_count += 1
        if no_improvement_count >= patience:
            print("Early stopping triggered. No improvement for {} epochs.".format(patience))
            break  # Stop training

# Without this, the model keeps the weights of the last epoch run, which after
# early stopping is `patience` epochs past the best validation score.
if best_state is not None:
    model.load_state_dict(best_state)


In [None]:
# Evaluation on test data
from sklearn.metrics import classification_report

# Score the model on the held-out test split.
# Relies on `model`, `test_encodings`, `test_labels_encoded` and
# `accuracy_score` already being defined by the training cell.
model.eval()

# Run inference in mini-batches: pushing the entire test split through BERT
# in one forward pass can exhaust memory on larger datasets.
eval_batch_size = 64
# Follow whatever device the model currently lives on.
eval_device = next(model.parameters()).device
num_test_examples = test_encodings["input_ids"].size(0)

test_batch_predictions = []
with torch.no_grad():
    for start in range(0, num_test_examples, eval_batch_size):
        stop = start + eval_batch_size
        batch_inputs = {
            name: tensor[start:stop].to(eval_device)
            for name, tensor in test_encodings.items()
        }
        batch_logits = model(**batch_inputs).logits
        test_batch_predictions.append(torch.argmax(batch_logits, dim=1).cpu())

# Hand plain NumPy arrays to scikit-learn rather than torch tensors.
test_predicted_labels = torch.cat(test_batch_predictions).numpy()
test_accuracy = accuracy_score(test_labels_encoded, test_predicted_labels)

print(f"Test Accuracy: {test_accuracy:.4f}")
test_report = classification_report(test_labels_encoded, test_predicted_labels)
print("Test Classification Report:\n", test_report)


In [None]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Predict labels for a separate sample of phrases with the FINE-TUNED model
# and print them next to the labels stored in the file.

# Load the new JSON file containing phrases
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/samplePhrasesandTags100.json', 'r', encoding='utf-8-sig') as json_file:
    data = json.load(json_file)

# Extract phrases and original labels from the new data
phrases = [item["phrase"] for item in data]
original_labels = [item["label_"] for item in data]

# Reuse the objects produced by the training cell instead of recreating them:
#   * reloading "bert-base-uncased" here would replace the fine-tuned weights
#     with a freshly initialised classification head of the default size, and
#   * refitting a LabelEncoder on this file's labels would assign different
#     integer ids than the ones the model was trained with.
_required_names = ("model", "tokenizer", "label_encoder")
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        "Run the training cell first; missing: " + ", ".join(_missing_names)
    )

# Tokenize and encode the phrases
encodings = tokenizer(phrases, padding=True, truncation=True, return_tensors="pt")

# Place the inputs on whatever device the model currently lives on.
inference_device = next(model.parameters()).device
model_inputs = {name: tensor.to(inference_device) for name, tensor in encodings.items()}

# Use the model to predict labels for the phrases
model.eval()
with torch.no_grad():
    outputs = model(**model_inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=1).cpu()

# Decode the predicted class ids back to label values using the training-time
# encoder (converted to NumPy for scikit-learn).
decoded_labels = label_encoder.inverse_transform(predicted_labels.numpy())

# Print the original label and the predicted label for each phrase
for phrase, original_label, predicted_label in zip(phrases, original_labels, decoded_labels):
    print(f"Phrase: {phrase}")
    print(f"KG Label: {original_label}")
    print(f"Predicted Label: {predicted_label}")
    print()
