BEmoC

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
import re
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


from transformers import BertForSequenceClassification,BertTokenizer

print(torch.cuda.is_available())
tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')


# Load dataset
df = pd.read_excel('bemoc.xlsx')
corpus = df['TEXT'].to_list()
y = df['classes']

# Positional hold-out: the first `end` rows feed train/validation, the
# remaining rows form the test set.
end = 5600
train_texts = corpus[:end]
test_texts = corpus[end:]
# .iloc makes the label slice explicitly positional, so it always lines up
# with the plain list slice of the texts above.
train_labels = y.iloc[:end].to_list()
test_labels = y.iloc[end:].to_list()
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Encode labels
# The encoder is fitted on the training split only; transform() rejects any
# label in the validation/test splits that the training split never contained.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

# Build the classifier once the classes are known, so the size of the output
# layer follows the data instead of a hard-coded constant.
model = BertForSequenceClassification.from_pretrained("sagorsarker/bangla-bert-base", num_labels=len(label_encoder.classes_))

# Tokenization
# Each split is truncated to at most 140 tokens and padded to its own longest sequence.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=140)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=140)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=140)

# Convert to PyTorch datasets
class EmotionDataset(Dataset):
    """Dataset that serves tokenizer encodings together with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()``); ``labels`` is an indexable sequence
    with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of this example into a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Wrap each split's encodings and labels in an EmotionDataset.
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Number of distinct encoded labels in the training split.
num_labels = np.unique(train_labels).size

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model.to(device)

# Only the training batches are shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader, test_loader = (
    DataLoader(split, shuffle=False, batch_size=16)
    for split in (val_dataset, test_dataset)
)

# The linear schedule spans every optimiser step of the run, with no warm-up.
epochs = 20
optim = AdamW(model.parameters(), lr=5e-5)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train(model, loader, optim, scheduler):
    """Run one optimisation pass over ``loader``.

    Every batch is moved to the module-level ``device``, pushed through the
    model together with its labels, and followed by one optimiser step and
    one scheduler step.

    Returns a ``(mean batch loss, accuracy)`` tuple, where accuracy is the
    fraction of examples whose highest-scoring class equals the label.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in loader:
        optim.zero_grad()
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device).long()

        outputs = model(ids, attention_mask=mask, labels=targets)
        batch_loss = outputs.loss
        running_loss += batch_loss.item()

        # Highest-scoring class per example, compared against the labels.
        guesses = outputs.logits.argmax(dim=1)
        n_correct += (guesses == targets).sum().item()
        n_seen += targets.size(0)

        batch_loss.backward()
        optim.step()
        scheduler.step()

    accuracy = n_correct / n_seen
    return running_loss / len(loader), accuracy

def evaluate(model, loader):
    """Score ``model`` on ``loader`` without updating any weights.

    Batches are moved to the module-level ``device`` and evaluated under
    ``torch.no_grad()``.

    Returns a ``(mean batch loss, accuracy)`` tuple; accuracy comes from
    ``accuracy_score`` over all collected labels and predictions.
    """
    model.eval()
    loss_sum = 0.0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            # Passing the labels makes the model report the batch loss as well.
            outputs = model(ids, attention_mask=mask, labels=targets)
            loss_sum += outputs.loss.item()

            # Gather class predictions and labels on the CPU for scoring.
            predicted.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    avg_loss = loss_sum / len(loader)
    return avg_loss, accuracy_score(expected, predicted)


# Fine-tune for `epochs` passes; each pass prints one line holding the
# training figures followed by the validation figures.
for epoch in range(epochs): 
    # train() returns (mean batch loss, accuracy) for this pass over train_loader.
    train_loss, train_accuracy = train(model, train_loader, optim, scheduler)
    print(f'Epoch {epoch + 1}, Train_Loss: {round(train_loss,4)}, Train_Accuracy: {round(train_accuracy,4)}', end=", ")

    # The validation figures finish the line started by the print above.
    valid_loss, val_accuracy = evaluate(model, val_loader)
    print(f'Val_Loss: {round(valid_loss,4)}, Val_Accuracy: {round(val_accuracy,4)}')





from sklearn.metrics import confusion_matrix, classification_report

# Evaluate model
def evaluate(model, loader):
    """Collect the model's class predictions and the labels from ``loader``.

    Inputs are moved to the module-level ``device``; the labels never leave
    the CPU and are not passed to the model, so no loss is computed.

    Defining this function rebinds the name ``evaluate``: the earlier
    loss/accuracy variant is no longer reachable under that name afterwards.

    Returns a ``(predictions, true_labels)`` pair of flat lists.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].numpy()  # stays on the CPU
            scores = model(ids, attention_mask=mask).logits
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(batch_labels)

    return predicted, expected

# Get predictions and true labels
predictions, true_labels = evaluate(model, test_loader)

# Class names in the encoder's index order: position i names encoded label i.
class_names = label_encoder.classes_
# Passing explicit label ids pins the matrix and the report to one row per
# known class, even when a class occurs in neither the test labels nor the
# predictions (otherwise the shapes would no longer match class_names).
label_ids = np.arange(len(class_names))
# `y_true` is the original name of the display-label list; it holds one name
# per class (as text), not one label per sample.
y_true = [str(name) for name in class_names]

# Calculate confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions, labels=label_ids)

# Create a pandas DataFrame from the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)

# Display the confusion matrix as a DataFrame with class names as labels
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(conf_matrix_df)

# Print classification report
print("Classification Report:")
print(classification_report(true_labels, predictions, labels=label_ids, target_names=y_true))

import seaborn as sn
# The heatmap is labelled with the same encoder-derived names as the table
# above, so rows/columns cannot drift from the encoded label order.
data = conf_matrix
df_cm = pd.DataFrame(data, columns=y_true, index=y_true)
sn.set(font_scale=2)  # for label size
sn.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 24}, fmt="d")

Increasing Data - BEmoC + BanglaEmotion

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
import re
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification,BertTokenizer

tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')

print(torch.cuda.is_available())

df = pd.read_csv("merged.csv")
corpus = df['text'].to_list()
y = df['class']

# Positional hold-out: the first `end` rows feed train/validation, the
# remaining rows form the test set.
end = 11240
train_texts = corpus[:end]
test_texts = corpus[end:]
# .iloc makes the label slice explicitly positional, so it always lines up
# with the plain list slice of the texts above.
train_labels = y.iloc[:end].to_list()
test_labels = y.iloc[end:].to_list()
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Encode labels
# The encoder is fitted on the training split only; transform() rejects any
# label in the validation/test splits that the training split never contained.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

# Build the classifier once the classes are known, so the size of the output
# layer follows the data instead of a hard-coded constant.
model = BertForSequenceClassification.from_pretrained("sagorsarker/bangla-bert-base", num_labels=len(label_encoder.classes_))

# Tokenization
# Each split is truncated to at most 140 tokens and padded to its own longest sequence.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=140)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=140)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=140)

# Convert to PyTorch datasets
class EmotionDataset(Dataset):
    """Dataset that serves tokenizer encodings together with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()``); ``labels`` is an indexable sequence
    with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of this example into a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Wrap each split's encodings and labels in an EmotionDataset.
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Number of distinct encoded labels in the training split.
num_labels = np.unique(train_labels).size

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model.to(device)

# Only the training batches are shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader, test_loader = (
    DataLoader(split, shuffle=False, batch_size=16)
    for split in (val_dataset, test_dataset)
)

# The linear schedule spans every optimiser step of the run, with no warm-up.
epochs = 20
optim = AdamW(model.parameters(), lr=5e-5)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train(model, loader, optim, scheduler):
    """Run one optimisation pass over ``loader``.

    Every batch is moved to the module-level ``device``, pushed through the
    model together with its labels, and followed by one optimiser step and
    one scheduler step.

    Returns a ``(mean batch loss, accuracy)`` tuple, where accuracy is the
    fraction of examples whose highest-scoring class equals the label.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in loader:
        optim.zero_grad()
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device).long()

        outputs = model(ids, attention_mask=mask, labels=targets)
        batch_loss = outputs.loss
        running_loss += batch_loss.item()

        # Highest-scoring class per example, compared against the labels.
        guesses = outputs.logits.argmax(dim=1)
        n_correct += (guesses == targets).sum().item()
        n_seen += targets.size(0)

        batch_loss.backward()
        optim.step()
        scheduler.step()

    accuracy = n_correct / n_seen
    return running_loss / len(loader), accuracy

def evaluate(model, loader):
    """Score ``model`` on ``loader`` without updating any weights.

    Batches are moved to the module-level ``device`` and evaluated under
    ``torch.no_grad()``.

    Returns a ``(mean batch loss, accuracy)`` tuple; accuracy comes from
    ``accuracy_score`` over all collected labels and predictions.
    """
    model.eval()
    loss_sum = 0.0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            # Passing the labels makes the model report the batch loss as well.
            outputs = model(ids, attention_mask=mask, labels=targets)
            loss_sum += outputs.loss.item()

            # Gather class predictions and labels on the CPU for scoring.
            predicted.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    avg_loss = loss_sum / len(loader)
    return avg_loss, accuracy_score(expected, predicted)


# Fine-tune for `epochs` passes; each pass prints one line holding the
# training figures followed by the validation figures.
for epoch in range(epochs): 
    # train() returns (mean batch loss, accuracy) for this pass over train_loader.
    train_loss, train_accuracy = train(model, train_loader, optim, scheduler)
    print(f'Epoch {epoch + 1}, Train_Loss: {round(train_loss,4)}, Train_Accuracy: {round(train_accuracy,4)}', end=", ")

    # The validation figures finish the line started by the print above.
    valid_loss, val_accuracy = evaluate(model, val_loader)
    print(f'Val_Loss: {round(valid_loss,4)}, Val_Accuracy: {round(val_accuracy,4)}')





from sklearn.metrics import confusion_matrix, classification_report

# Evaluate model
def evaluate(model, loader):
    """Collect the model's class predictions and the labels from ``loader``.

    Inputs are moved to the module-level ``device``; the labels never leave
    the CPU and are not passed to the model, so no loss is computed.

    Defining this function rebinds the name ``evaluate``: the earlier
    loss/accuracy variant is no longer reachable under that name afterwards.

    Returns a ``(predictions, true_labels)`` pair of flat lists.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].numpy()  # stays on the CPU
            scores = model(ids, attention_mask=mask).logits
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(batch_labels)

    return predicted, expected

# Get predictions and true labels
predictions, true_labels = evaluate(model, test_loader)

# Class names in the encoder's index order: position i names encoded label i.
class_names = label_encoder.classes_
# Passing explicit label ids pins the matrix and the report to one row per
# known class, even when a class occurs in neither the test labels nor the
# predictions (otherwise the shapes would no longer match class_names).
label_ids = np.arange(len(class_names))
# `y_true` is the original name of the display-label list; it holds one name
# per class (as text), not one label per sample.
y_true = [str(name) for name in class_names]

# Calculate confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions, labels=label_ids)

# Create a pandas DataFrame from the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)

# Display the confusion matrix as a DataFrame with class names as labels
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(conf_matrix_df)

# Print classification report
print("Classification Report:")
print(classification_report(true_labels, predictions, labels=label_ids, target_names=y_true))

import seaborn as sn
# The heatmap is labelled with the same encoder-derived names as the table
# above, so rows/columns cannot drift from the encoded label order.
data = conf_matrix
df_cm = pd.DataFrame(data, columns=y_true, index=y_true)
sn.set(font_scale=2)  # for label size
sn.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 24}, fmt="d")

BanglaEmotion

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
import re
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score



print(torch.cuda.is_available())

from transformers import BertForSequenceClassification,BertTokenizer

tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')

# Load both CSV files and pool them: texts as one list, labels as one Series.
df = pd.read_csv("train.csv")
corpus = df['text'].to_list()
y = df['class']
df2 = pd.read_csv("test.csv")
corpus2 = df2['text'].to_list()
y2 = df2['class']
corpus.extend(corpus2)
# ignore_index gives the pooled labels a fresh 0..n-1 index instead of
# repeating each file's own row numbers.
y = pd.concat([y, y2], ignore_index=True)

# Positional hold-out: the first `end` pooled rows feed train/validation, the
# remaining rows form the test set.
end = 4700
train_texts = corpus[:end]
test_texts = corpus[end:]
# .iloc makes the label slice explicitly positional, so it always lines up
# with the plain list slice of the texts above.
train_labels = y.iloc[:end].to_list()
test_labels = y.iloc[end:].to_list()
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Encode labels
# The encoder is fitted on the training split only; transform() rejects any
# label in the validation/test splits that the training split never contained.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

# Build the classifier once the classes are known, so the size of the output
# layer follows the data instead of a hard-coded constant.
model = BertForSequenceClassification.from_pretrained("sagorsarker/bangla-bert-base", num_labels=len(label_encoder.classes_))

# Tokenization
# Each split is truncated to at most 140 tokens and padded to its own longest sequence.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=140)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=140)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=140)

# Convert to PyTorch datasets
class EmotionDataset(Dataset):
    """Dataset that serves tokenizer encodings together with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()``); ``labels`` is an indexable sequence
    with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of this example into a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Wrap each split's encodings and labels in an EmotionDataset.
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Number of distinct encoded labels in the training split.
num_labels = np.unique(train_labels).size

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model.to(device)

# Only the training batches are shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader, test_loader = (
    DataLoader(split, shuffle=False, batch_size=16)
    for split in (val_dataset, test_dataset)
)

# The linear schedule spans every optimiser step of the run, with no warm-up.
epochs = 20
optim = AdamW(model.parameters(), lr=5e-5)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train(model, loader, optim, scheduler):
    """Run one optimisation pass over ``loader``.

    Every batch is moved to the module-level ``device``, pushed through the
    model together with its labels, and followed by one optimiser step and
    one scheduler step.

    Returns a ``(mean batch loss, accuracy)`` tuple, where accuracy is the
    fraction of examples whose highest-scoring class equals the label.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in loader:
        optim.zero_grad()
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device).long()

        outputs = model(ids, attention_mask=mask, labels=targets)
        batch_loss = outputs.loss
        running_loss += batch_loss.item()

        # Highest-scoring class per example, compared against the labels.
        guesses = outputs.logits.argmax(dim=1)
        n_correct += (guesses == targets).sum().item()
        n_seen += targets.size(0)

        batch_loss.backward()
        optim.step()
        scheduler.step()

    accuracy = n_correct / n_seen
    return running_loss / len(loader), accuracy

def evaluate(model, loader):
    """Score ``model`` on ``loader`` without updating any weights.

    Batches are moved to the module-level ``device`` and evaluated under
    ``torch.no_grad()``.

    Returns a ``(mean batch loss, accuracy)`` tuple; accuracy comes from
    ``accuracy_score`` over all collected labels and predictions.
    """
    model.eval()
    loss_sum = 0.0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            # Passing the labels makes the model report the batch loss as well.
            outputs = model(ids, attention_mask=mask, labels=targets)
            loss_sum += outputs.loss.item()

            # Gather class predictions and labels on the CPU for scoring.
            predicted.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    avg_loss = loss_sum / len(loader)
    return avg_loss, accuracy_score(expected, predicted)


# Fine-tune for `epochs` passes; each pass prints one line holding the
# training figures followed by the validation figures.
for epoch in range(epochs): 
    # train() returns (mean batch loss, accuracy) for this pass over train_loader.
    train_loss, train_accuracy = train(model, train_loader, optim, scheduler)
    print(f'Epoch {epoch + 1}, Train_Loss: {round(train_loss,4)}, Train_Accuracy: {round(train_accuracy,4)}', end=", ")

    # The validation figures finish the line started by the print above.
    valid_loss, val_accuracy = evaluate(model, val_loader)
    print(f'Val_Loss: {round(valid_loss,4)}, Val_Accuracy: {round(val_accuracy,4)}')





from sklearn.metrics import confusion_matrix, classification_report

# Evaluate model
def evaluate(model, loader):
    """Collect the model's class predictions and the labels from ``loader``.

    Inputs are moved to the module-level ``device``; the labels never leave
    the CPU and are not passed to the model, so no loss is computed.

    Defining this function rebinds the name ``evaluate``: the earlier
    loss/accuracy variant is no longer reachable under that name afterwards.

    Returns a ``(predictions, true_labels)`` pair of flat lists.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].numpy()  # stays on the CPU
            scores = model(ids, attention_mask=mask).logits
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(batch_labels)

    return predicted, expected

# Get predictions and true labels
predictions, true_labels = evaluate(model, test_loader)

# Class names in the encoder's index order: position i names encoded label i.
class_names = label_encoder.classes_
# Passing explicit label ids pins the matrix and the report to one row per
# known class, even when a class occurs in neither the test labels nor the
# predictions (otherwise the shapes would no longer match class_names).
label_ids = np.arange(len(class_names))
# `y_true` is the original name of the display-label list; it holds one name
# per class (as text), not one label per sample.
y_true = [str(name) for name in class_names]

# Calculate confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions, labels=label_ids)

# Create a pandas DataFrame from the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)

# Display the confusion matrix as a DataFrame with class names as labels
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(conf_matrix_df)

# Print classification report
print("Classification Report:")
print(classification_report(true_labels, predictions, labels=label_ids, target_names=y_true))

import seaborn as sn
# The heatmap is labelled with the same encoder-derived names as the table
# above, so rows/columns cannot drift from the encoded label order.
data = conf_matrix
df_cm = pd.DataFrame(data, columns=y_true, index=y_true)
sn.set(font_scale=2)  # for label size
sn.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 24}, fmt="d")