In [18]:
import pandas as pd
import torch
from transformers import BertTokenizer

# Read CSV file
df = pd.read_csv("/home/yadagiri/train_dataset.csv")

# Sample text and numerical data from CSV columns
text_column = "text"  # Replace "text_column_name" with the actual name of your text column
numerical_columns = ['Vocabulary', 'Noun Count', 'Verb Count', 'AUX Count', 'NUM Count', 'PRON Count', 'ADV Count', 'INTJ Count', 'PART Count']  # Replace with the actual numerical column names
y_label_column = "label"  # Replace "label_column_name" with the actual name of your label column

# Tokenize text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_text = df[text_column].apply(lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=128, return_tensors='pt'))

# Concatenate text and numerical data
df['combined_input'] = df.apply(lambda row: torch.cat((torch.tensor(row[numerical_columns], dtype=torch.float), tokenized_text[row.name]['input_ids'].squeeze(), tokenized_text[row.name]['attention_mask'].squeeze()), dim=0), axis=1)

# Combine features and labels
features = torch.stack(df['combined_input'].tolist())
labels = torch.tensor(df[y_label_column].tolist(), dtype=torch.long)

# Print shapes of features and labels
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)

# Now you can feed `features` and `labels` into your BERT model


Features shape: torch.Size([12150, 265])
Labels shape: torch.Size([12150])


In [19]:
X =list(features)

In [20]:
y =labels

In [21]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix


In [22]:
# Split dataset into train and validation sets
train_X, val_X, train_y, val_y = train_test_split(X,y, test_size=0.2, random_state=42)

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # import pdb;pdb.set_trace()
        text = self.texts[idx]
        label = self.labels[idx]  # Convert label to integer
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Assuming 2 classes for binary classification
batch_size = 32
epochs = 3
# Create DataLoader for train and validation sets
train_dataset = CustomDataset(train_X, train_y, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CustomDataset(val_X, val_y, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
# optimizer = AdamW(model.parameters(), lr=learning_rate)
# criterion = torch.nn.CrossEntropyLoss()

# Create DataLoader for train and validation sets
# train_dataset = CustomDataset(train_df['text'], train_df['binary_label'], tokenizer)
# val_dataset = CustomDataset(val_df['text'], val_df['binary_label'], tokenizer)

# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define training parameters

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = AdamW(model.parameters(), lr=2e-5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Initialize empty lists to store loss and accuracy values
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
train_precisions = []
val_precisions = []
train_recalls = []
val_recalls = []
train_f1_scores = []
val_f1_scores = []

for epoch in range(epochs):
    model.train()
    epoch_train_loss = 0
    correct_train_preds = 0
    total_train_preds = 0
    train_predicted_labels = []
    train_true_labels = []
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()
        _, predicted = torch.max(outputs.logits, 1)
        correct_train_preds += (predicted == labels).sum().item()
        total_train_preds += labels.size(0)
        train_predicted_labels.extend(predicted.cpu().numpy())
        train_true_labels.extend(labels.cpu().numpy())

    # Calculate training accuracy and loss for the epoch
    train_loss = epoch_train_loss / len(train_loader)
    train_accuracy = correct_train_preds / total_train_preds
    train_precision = precision_score(train_true_labels, train_predicted_labels, average='weighted')
    train_recall = recall_score(train_true_labels, train_predicted_labels, average='weighted')
    train_f1 = f1_score(train_true_labels, train_predicted_labels, average='weighted')

    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1_scores.append(train_f1)

    # Validation loop
    model.eval()
    epoch_val_loss = 0
    correct_val_preds = 0
    total_val_preds = 0
    val_predicted_labels = []
    val_true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            epoch_val_loss += loss.item()
            _, predicted = torch.max(outputs.logits, 1)
            correct_val_preds += (predicted == labels).sum().item()
            total_val_preds += labels.size(0)
            val_predicted_labels.extend(predicted.cpu().numpy())
            val_true_labels.extend(labels.cpu().numpy())

    # Calculate validation accuracy and loss for the epoch
    val_loss = epoch_val_loss / len(val_loader)
    val_accuracy = correct_val_preds / total_val_preds
    val_precision = precision_score(val_true_labels, val_predicted_labels, average='weighted')
    val_recall = recall_score(val_true_labels, val_predicted_labels, average='weighted')
    val_f1 = f1_score(val_true_labels, val_predicted_labels, average='weighted')

    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1_scores.append(val_f1)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1: {train_f1}")
    print(f"Val Loss: {val_loss}, Val Accuracy: {val_accuracy}, Val Precision: {val_precision}, Val Recall: {val_recall}, Val F1: {val_f1}")


ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).