<a href="https://colab.research.google.com/github/tubagokhan/ADGM/blob/main/TextClassificationWithoutDistinction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install tqdm

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load your JSON data
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/COBS_All_Labels_Phrases.json', 'r', encoding='utf-8-sig') as file:
    data = json.load(file)

# Extract labels and phrases
labels = [item["label_"] for item in data]
phrases = [item["phrase"] for item in data]

# Split the data into training, validation, and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    phrases, labels, test_size=0.2, random_state=42
)

# Further split the training data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

# Load BERT tokenizer and encode the data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

unique_labels = set(labels)
num_classes = len(unique_labels)
print(f"Number of unique classes: {num_classes}")

# Create BERT-based text classification model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)

# Fine-tuning hyperparameters
batch_size = 64
max_seq_length = 128
learning_rate = 2e-5
num_epochs = 50  # Increase the number of epochs for early stopping
gradient_accumulation_steps = 4

# Create data loaders
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], torch.tensor(train_labels_encoded))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Optimization and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Early stopping parameters
patience = 3  # Number of epochs to wait for improvement
best_val_accuracy = 0
no_improvement_count = 0

# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    tqdm_data = tqdm(enumerate(train_loader), total=len(train_loader))
    for step, batch in tqdm_data:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()
        avg_loss = total_loss / (step + 1)

        tqdm_data.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    print(f"Epoch {epoch + 1} - Average Loss: {total_loss / len(train_loader)}")


    # Evaluation on validation data after each epoch
    model.eval()
    with torch.no_grad():
        val_outputs = model(**val_encodings)
        val_predicted_labels = torch.argmax(val_outputs.logits, dim=1)
        val_accuracy = accuracy_score(val_labels_encoded, val_predicted_labels)
        print(f"Validation Accuracy (Epoch {epoch+1}): {val_accuracy:.4f}")
        f1 = f1_score(val_labels_encoded, val_predicted_labels,average='micro')
        print(f"Epoch {epoch + 1} - F1 Score: {f1}")

        # Implement early stopping
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            no_improvement_count = 0
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print("Early stopping triggered. No improvement for {} epochs.".format(patience))
                break  # Stop training


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Number of unique classes: 9


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/50, Loss: 1.5673: 100%|██████████| 209/209 [27:57<00:00,  8.03s/it]


Epoch 1 - Average Loss: 1.567263059638904
Validation Accuracy (Epoch 1): 0.5213
Epoch 1 - F1 Score: 0.5212694125590817


Epoch 2/50, Loss: 1.0376: 100%|██████████| 209/209 [27:07<00:00,  7.79s/it]


Epoch 2 - Average Loss: 1.0376427612806622
Validation Accuracy (Epoch 2): 0.6104
Epoch 2 - F1 Score: 0.6103983794733289


Epoch 3/50, Loss: 0.8025: 100%|██████████| 209/209 [26:48<00:00,  7.69s/it]


Epoch 3 - Average Loss: 0.8024673091167468
Validation Accuracy (Epoch 3): 0.6300
Epoch 3 - F1 Score: 0.6299797434166104


Epoch 4/50, Loss: 0.6686: 100%|██████████| 209/209 [27:30<00:00,  7.90s/it]


Epoch 4 - Average Loss: 0.6686200371199247
Validation Accuracy (Epoch 4): 0.6543
Epoch 4 - F1 Score: 0.6542876434841324


Epoch 5/50, Loss: 0.5973: 100%|██████████| 209/209 [27:39<00:00,  7.94s/it]


Epoch 5 - Average Loss: 0.5972746925776085
Validation Accuracy (Epoch 5): 0.6347
Epoch 5 - F1 Score: 0.6347062795408508


Epoch 6/50, Loss: 0.5390: 100%|██████████| 209/209 [29:26<00:00,  8.45s/it]


Epoch 6 - Average Loss: 0.5390297610793958
Validation Accuracy (Epoch 6): 0.6469
Epoch 6 - F1 Score: 0.6468602295746118


Epoch 7/50, Loss: 0.5150: 100%|██████████| 209/209 [29:41<00:00,  8.52s/it]


Epoch 7 - Average Loss: 0.5149648058357421
Validation Accuracy (Epoch 7): 0.6367
Epoch 7 - F1 Score: 0.636731937879811
Early stopping triggered. No improvement for 3 epochs.


In [4]:
# Evaluation on test data
from sklearn.metrics import classification_report, f1_score

model.eval()

with torch.no_grad():
    test_outputs = model(**test_encodings)
    test_predicted_labels = torch.argmax(test_outputs.logits, dim=1)
    test_accuracy = accuracy_score(test_labels_encoded, test_predicted_labels)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    f1 = f1_score(test_labels_encoded, test_predicted_labels,average='micro')
    print("Micro F1-Score:", f1)


    '''
    test_report = classification_report(test_labels_encoded, test_predicted_labels)
    print("Test Classification Report:\n", test_report)'''


Test Accuracy: 0.6607
Micro F1-Score: 0.6607239330091842


In [None]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Load the new JSON file containing phrases
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/samplePhrasesandTags100.json', 'r', encoding='utf-8-sig') as json_file:
    data = json.load(json_file)

# Extract phrases and original labels from the new data
phrases = [item["phrase"] for item in data]
original_labels = [item["label_"] for item in data]

# Load the pre-trained model and tokenizer
model = model(**test_encodings)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize and encode the phrases
encodings = tokenizer(phrases, padding=True, truncation=True, return_tensors="pt")

# Load the label encoder (you should have it from your previous code)
# Fit the label encoder with the original labels
label_encoder = LabelEncoder()
label_encoder.fit(original_labels)  # Fit the label encoder with the original labels

# Use the model to predict labels for the phrases
model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    predicted_labels = torch.argmax(outputs.logits, dim=1)

# Decode the predicted labels
decoded_labels = label_encoder.inverse_transform(predicted_labels)

# Print the original labels, predicted labels, and predicted label values for each phrase
for phrase, original_label, predicted_label, predicted_label_value in zip(phrases, original_labels, decoded_labels, predicted_labels.tolist()):
    print(f"Phrase: {phrase}")
    print(f"KG Label: {original_label}")
    print(f"Predicted Label: {predicted_label}")
    print()
