In [None]:
# Dependencies for the whole notebook: data handling, metrics, BERT, and PyTorch.
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is a debugging aid that makes CUDA kernel
# launches synchronous so an error is reported at the call that caused it. It
# slows GPU training and is presumably a leftover from debugging — consider
# removing it for full runs. It is assigned before `import torch` below,
# presumably so that it is already in the environment when CUDA initializes.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import pandas as pd
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

In [None]:
# Mount Google Drive; the dataset and output paths used later in the notebook
# all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# BERT on TOEFL

In [None]:
# Class ids produced by the classifier and their three-letter language codes.
id2label = {0: "ARA", 1: "CHI", 2: "FRE", 3: "GER", 4: "HIN", 5: "ITA", 6: "JPN", 7: "KOR", 8: "SPA", 9: "TEL", 10: "TUR"}
# Derive the inverse mapping rather than maintaining a second hand-written
# dict, so the two directions cannot drift apart when a class is added or renamed.
label2id = {label: class_id for class_id, label in id2label.items()}

# initialize BERT tokenizer and model
# The checkpoint name is stated once so tokenizer and model always match, and
# the size of the classification head follows the label map instead of a
# hard-coded 11.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# define dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Tokenization and label lookup use the module-level ``tokenizer`` and
    ``label2id`` objects, so both must exist before items are requested.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of label strings parallel to ``texts``; each one must
            be a key of ``label2id``.
        max_length: Length every example is padded or truncated to. Defaults
            to 510, the value that used to be hard-coded.
            NOTE(review): bert-base-uncased accepts 512 positions and the
            tokenizer's ``max_length`` already counts the special tokens, so
            512 may have been the intended value — confirm before changing.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, max_length=510):
        # Catch misaligned inputs here instead of as an IndexError (or silently
        # ignored trailing labels) in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for example ``idx``."""
        encoding = tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' yields tensors with a leading batch dimension of
        # size 1. Drop exactly that dimension: a bare squeeze() would also
        # collapse the sequence dimension whenever it happens to have size 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label2id[self.labels[idx]]),
        }

# Locations of the TOEFL11 train and test CSV files on the mounted Drive.
toefl_train = "/content/drive/MyDrive/thesis_NLI/TOEFL11/train.csv"
toefl_test = "/content/drive/MyDrive/thesis_NLI/TOEFL11/test.csv"

# Read both splits (train first, then test).
df, test_df = (pd.read_csv(csv_path) for csv_path in (toefl_train, toefl_test))

# Wrap the `text` and `language` columns of each split in a TextDataset.
train_dataset, test_dataset = (
    TextDataset(frame['text'].tolist(), frame['language'].tolist())
    for frame in (df, test_df)
)

# Batches of 12 for both splits; only the training split is shuffled, so test
# predictions come out in file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=12, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=12, shuffle=False)

# training
# Hyperparameters are named once here instead of being repeated as bare
# literals inside the calls below.
NUM_EPOCHS = 12
LEARNING_RATE = 2e-5

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# NOTE: `criterion` is not used by the loop below — because `labels` is passed
# to the model, the loss is taken from `outputs.loss`. The name is kept only so
# that any other cell referring to it keeps working.
criterion = torch.nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(NUM_EPOCHS):
    # Enter training mode at the start of every epoch so the loop stays correct
    # even if an evaluation pass is later interleaved between epochs.
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean batch loss per epoch so a stalled or diverging run is
    # visible before the final test accuracy. max(..., 1) guards an empty loader.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - mean training loss: {mean_loss:.4f}")

# Persist the fine-tuned model and tokenizer before evaluating, so a failure
# during evaluation cannot discard the training run.
finetuned_dir = "/content/drive/MyDrive/thesis_NLI/TOEFL11/finetuned_bert"
model.save_pretrained(finetuned_dir) # Local saving
tokenizer.save_pretrained(finetuned_dir)

# evaluating on test set
model.eval()
test_predictions = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit in each row.
        predicted_labels = outputs.logits.argmax(dim=1)
        test_predictions.extend(predicted_labels.tolist())
        # Gold labels are only turned into a Python list, so they are not
        # copied to the device first.
        test_labels.extend(batch['labels'].tolist())

# Translate class ids back to language codes; this list is what gets written
# to the results CSV.
test_predictions_lang = [id2label[predicted_label] for predicted_label in test_predictions]
print(f"Num predictions: {len(test_predictions)}")

accuracy = accuracy_score(test_labels, test_predictions)
print(f"Accuracy: {accuracy}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num predictions: 1100
Accuracy: 0.7527272727272727


# Add results to CSV

In [None]:
# add results to CSV
toefl_results = "/content/drive/MyDrive/thesis_NLI/TOEFL11/toefl_results.csv"
df = pd.read_csv(toefl_results)
column_name = 'preds_finetuned_bert'

# Fail early with a clear message if the predictions do not line up one-to-one
# with the rows of the results file.
if len(test_predictions_lang) != len(df):
    raise ValueError(
        f"Expected {len(df)} predictions to match {toefl_results}, "
        f"got {len(test_predictions_lang)}"
    )

# Plain column assignment appends the column at the end when it is new and
# overwrites it in place when it already exists. DataFrame.insert raises
# ValueError for an existing column, so the previous version of this cell
# failed on every re-run after it had written the file once.
df[column_name] = test_predictions_lang
df.to_csv(toefl_results, index=False)

In [None]:
# Print every predicted language code, in the order the test loader produced them.
# NOTE(review): this writes the whole list (one entry per test example) into the
# cell output; a slice or per-language counts would keep the notebook readable.
print(test_predictions_lang)

['JPN', 'ARA', 'GER', 'KOR', 'CHI', 'GER', 'GER', 'JPN', 'TUR', 'TEL', 'JPN', 'JPN', 'ITA', 'TUR', 'SPA', 'TUR', 'FRE', 'HIN', 'SPA', 'JPN', 'HIN', 'ITA', 'TEL', 'TUR', 'CHI', 'CHI', 'CHI', 'JPN', 'GER', 'TUR', 'JPN', 'FRE', 'ITA', 'SPA', 'JPN', 'JPN', 'SPA', 'JPN', 'ARA', 'GER', 'TUR', 'HIN', 'FRE', 'KOR', 'ARA', 'ARA', 'JPN', 'ITA', 'ITA', 'SPA', 'HIN', 'ITA', 'GER', 'ITA', 'JPN', 'GER', 'TUR', 'TUR', 'ARA', 'GER', 'JPN', 'TEL', 'TUR', 'CHI', 'HIN', 'GER', 'SPA', 'HIN', 'KOR', 'ARA', 'CHI', 'KOR', 'JPN', 'SPA', 'HIN', 'TUR', 'HIN', 'KOR', 'TUR', 'TUR', 'FRE', 'TUR', 'ARA', 'GER', 'CHI', 'TEL', 'TEL', 'HIN', 'FRE', 'HIN', 'ARA', 'KOR', 'ITA', 'JPN', 'KOR', 'ITA', 'FRE', 'HIN', 'KOR', 'HIN', 'JPN', 'JPN', 'GER', 'JPN', 'TUR', 'SPA', 'ARA', 'SPA', 'KOR', 'GER', 'FRE', 'HIN', 'GER', 'TEL', 'TEL', 'ARA', 'TUR', 'GER', 'KOR', 'JPN', 'JPN', 'TUR', 'JPN', 'FRE', 'HIN', 'ARA', 'ITA', 'SPA', 'HIN', 'SPA', 'HIN', 'SPA', 'ARA', 'JPN', 'FRE', 'SPA', 'HIN', 'SPA', 'KOR', 'HIN', 'TUR', 'ITA', 'TUR'