In [1]:
import torch
import torch.nn.functional as F
import os
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import accuracy_score, f1_score
np.set_printoptions(suppress=True, precision=3, edgeitems=10, linewidth=200)

# Load the mock evaluation split and keep only the first 2000 rows.
# Despite the `_df` suffix, both names hold plain Python lists (`.tolist()`).
test_x_df = pd.read_csv("/kaggle/input/latsis-experiments/mock_test_set.csv")["text"].tolist()[:2000]
text_y_df = pd.read_csv("/kaggle/input/latsis-experiments/mock_test_labels.csv")["label"].tolist()[:2000]

# Reference labels as a float32 array, rounded to the nearest integer value
# so they can be compared against rounded model outputs.
test_y = np.round(np.asarray(text_y_df, dtype=np.float32))

def encode_texts(tokenizer, texts, max_length=128):
    """Tokenize `texts` one by one into fixed-length id / mask tensors.

    Args:
        tokenizer: object exposing `encode_plus(text, ...)` that returns a
            mapping with 'input_ids' and 'attention_mask' entries
            (presumably a Hugging Face tokenizer -- confirm against the
            checkpoints being loaded).
        texts: iterable of input strings.
        max_length: length every sequence is truncated / padded to.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        Tuple `(input_ids, attention_masks)` of `torch.long` tensors. With a
        tokenizer that honours `padding='max_length'` and `truncation=True`
        both have shape `(len(texts), max_length)`. For empty `texts` two
        empty `(0, max_length)` tensors are returned so that callers can
        slice and concatenate them like any other result.
    """
    input_ids = []
    attention_masks = []
    for text in texts:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    if not input_ids:
        # torch.tensor([]) would be a float tensor of shape (0,); keep the
        # dtype and rank consistent with the non-empty case instead.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(attention_masks, dtype=torch.long),
    )


# Run every ensemble member over the test set, print its macro-F1 and cache
# its raw logits to <model folder>/predictions.npy for the averaging step.
ENSEMBLE_DIR = "/kaggle/working/ensemble/"
BS = 5  # inference batch size


def _predict_logits(model, input_ids, attention_mask, batch_size):
    """Return the model's logits for all rows as a 1-D/ND numpy array.

    Batches are moved to the GPU one at a time and results are collected on
    the CPU. Keeping the loop inside a function means the per-batch GPU
    tensors go out of scope on return, so no explicit `del` is needed.
    Assumes `model(...)` returns an object with a `.logits` tensor whose
    trailing dimension has size 1 (it is squeezed away) -- TODO confirm for
    every ensemble member.
    """
    chunks = []
    with torch.no_grad():  # inference only: no autograd bookkeeping
        for start in range(0, len(input_ids), batch_size):
            ids = input_ids[start:start + batch_size].cuda()
            mask = attention_mask[start:start + batch_size].cuda()
            logits = model(ids, attention_mask=mask).logits.squeeze(-1)
            chunks.append(logits.cpu())
            if start % 100 == 0:
                print(start)
    return torch.cat(chunks).numpy()


# os.listdir returns entries in arbitrary order; sort for reproducible logs.
stuff = sorted(os.listdir(ENSEMBLE_DIR))
for folder in stuff:
    # Entries whose name contains a dot are skipped (presumably plain files
    # rather than model folders).
    if "." in folder:
        continue

    print("processing " + folder)

    # SECURITY: torch.load unpickles arbitrary Python objects -- only load
    # checkpoints from a trusted source.
    # NOTE(review): recent PyTorch releases default to weights_only=True,
    # which rejects fully pickled modules; pass weights_only=False here if
    # loading fails on an upgraded environment.
    model = torch.load(ENSEMBLE_DIR + folder + "/model.pt").cuda()
    tokenizer = torch.load(ENSEMBLE_DIR + folder + "/tokenizer.pt")
    if "swissbert" in folder:
        model.set_default_language("de_CH")

    model.eval()

    test_x, test_attention_mask = encode_texts(tokenizer, test_x_df)

    predictions = _predict_logits(model, test_x, test_attention_mask, BS)

    # Per-model sanity check: threshold sigmoid(logit) at 0.5 via rounding.
    predictions2 = np.round(torch.sigmoid(torch.from_numpy(predictions)).numpy())
    print("f1: " + str(f1_score(test_y, predictions2, average='macro')))

    # Release this model before loading the next one. The CUDA cache is
    # emptied once per model (not once per batch, which only slows the
    # allocator down) and after gc so freed blocks are actually returned.
    del model
    del tokenizer
    del test_x
    del test_attention_mask
    gc.collect()
    torch.cuda.empty_cache()

    np.save(ENSEMBLE_DIR + folder + "/predictions.npy", predictions)


processing gelectra
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
[-4.737 -4.211  1.678  0.399 -5.08  -2.838  3.709 -2.921 -5.411 -2.713 ... -5.245 -1.826 -3.293 -5.199  2.943  0.994 -5.316 -5.384 -5.465 -0.615]
processing mdebertav3
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
[-3.457 -3.743  0.55  -2.819 -4.592 -1.382  3.414  2.274 -4.465  1.606 ... -4.858 -2.455 -4.244 -4.709 -1.445 -0.313 -4.715 -3.898 -4.553 -2.209]
processing semantic
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
[-3.034 -6.41   2.634 -2.649 -6.52  -4.691  6.098  1.025 -6.7   -2.069 ... -6.769  0.807 -5.747 -4.898  4.509 -6.056 -6.545 -2.746 -4.732 -0.951]
processing xlm_roberta_base
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
[-1.657 -2.62   0.593  2.409 -5.043 -2.095  4.179 -0.085 -5.042 -4.253 ... -5.693 -0.361 -3.874  2.558 -0.201 -1.76  -4.

In [3]:
# Ensemble step: average the cached per-model logits, squash the mean with a
# sigmoid, write the probabilities to CSV and report the macro-F1.
member_logits = []
for folder in stuff:
    # Skip entries with a dot in the name, and leave swissbert out of the
    # average (the reason for excluding it is not stated in this notebook).
    if "." in folder or "swissbert" in folder:
        continue
    predictions = np.load("/kaggle/working/ensemble/" + folder + "/predictions.npy")
    member_logits.append(predictions)

if not member_logits:
    # Without this guard np.mean([]) yields NaN and torch.from_numpy then
    # fails with an unrelated-looking TypeError.
    raise RuntimeError("no predictions.npy files found to average; run the inference cell first")

# Mean over models (axis 0) keeps one value per test example.
all_predictions = np.mean(member_logits, axis=0)
all_predictions = torch.sigmoid(torch.from_numpy(all_predictions)).numpy()
print(all_predictions.shape)

#save to a csv file
df = pd.DataFrame(all_predictions, columns=["pred"])
df.to_csv("/kaggle/working/predictions.csv", index=False)

print("f1: " + str(f1_score(test_y, np.round(all_predictions), average='macro')))

(2000,)
