In [1]:
import torch
import os
import pandas as pd
import numpy as np
import gc  # For garbage collection
# Print NumPy arrays in fixed-point notation with 3 decimals on wide lines.
np.set_printoptions(suppress=True, precision=3, edgeitems=10, linewidth=200)

# Number of leading rows of the test split that are scored.
N_TEST_ROWS = 260

# Load the test split and keep only its first N_TEST_ROWS rows (by position).
test = pd.read_parquet("../test.parquet").iloc[:N_TEST_ROWS]
# Labels of the kept rows as a float32 tensor.
test_y = torch.tensor(np.array(test['label'].tolist()), dtype=torch.float32)

def encode_texts(tokenizer, texts):
    """Tokenize every text and return the ids and masks as two tensors.

    Each text is passed on its own to ``tokenizer.encode_plus`` with special
    tokens enabled, truncation on, and padding to a fixed length of 128.

    Args:
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        texts: iterable of input texts.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of tensors built from the
        per-text encodings, one row per text, in input order.
    """
    encoded = [
        tokenizer.encode_plus(
            sample,
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        for sample in texts
    ]
    ids = torch.tensor([item['input_ids'] for item in encoded])
    masks = torch.tensor([item['attention_mask'] for item in encoded])
    return ids, masks


# Run every saved model over the test texts and store its raw outputs.
#
# Each entry of the working directory whose name has no "." and that holds
# both "model.pt" and "tokenizer.pt" is treated as one model; its predictions
# are written to "<folder>/predictions.npy".
stuff = os.listdir("./")

# Fall back to the CPU when no GPU is available so the cell still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BS = 5  # inference batch size

for folder in stuff:
    if "." in folder:
        continue

    model_path = os.path.join(folder, "model.pt")
    tokenizer_path = os.path.join(folder, "tokenizer.pt")
    # Skip extension-less entries that are not model folders (e.g. __pycache__)
    # instead of crashing inside torch.load.
    if not (os.path.isfile(model_path) and os.path.isfile(tokenizer_path)):
        continue

    print("processing " + folder)

    # SECURITY: torch.load unpickles arbitrary objects -- only load checkpoints
    # that come from a trusted source.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # refuses whole-model pickles; pass weights_only=False there if trusted.
    model = torch.load(model_path, map_location=device).to(device)
    # Inference mode: disables dropout etc. so predictions are deterministic.
    model.eval()
    tokenizer = torch.load(tokenizer_path)

    test_x, test_attention_mask = encode_texts(tokenizer, test['text'])

    predictions = []
    with torch.no_grad():  # No autograd graph is needed for inference
        for i in range(0, len(test_x), BS):
            batch = test_x[i:i+BS].to(device)
            batch_attention_mask = test_attention_mask[i:i+BS].to(device)

            prediction = model(batch, batch_attention_mask).logits.squeeze(-1)
            predictions.append(prediction.cpu())

            # Dropping the references is enough for the allocator to reuse the
            # memory; emptying the CUDA cache on every batch only slows things.
            del batch
            del batch_attention_mask
            del prediction
            if i % 100 == 0:
                print(i)

    predictions = torch.cat(predictions).numpy()
    print(predictions)

    np.save(os.path.join(folder, "predictions.npy"), predictions)

    # Release this model before the next one is loaded. Collect garbage first
    # so memory freed by the collector is also returned by empty_cache().
    del model
    del tokenizer
    del test_x
    del test_attention_mask
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()


processing gelectra
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
[-4.737 -4.211  1.678  0.399 -5.08  -2.838  3.709 -2.921 -5.411 -2.713 -4.965 -2.886  0.254 -2.167  4.205 -0.42   2.191  0.325  3.467  2.771  0.568 -3.55   1.764 -3.771 -5.235  2.241 -0.777 -5.021
  0.336  4.492  3.703  1.298 -2.384 -1.702  3.084  2.887 -1.151 -4.262  4.407 -3.581 -4.66  -0.599 -3.625 -2.847 -1.163  1.507 -5.051 -4.222  2.701 -5.096  4.615  1.967 -4.495 -5.217 -5.561  4.416
 -4.893  3.262  3.238 -5.329 -4.715 -5.307  4.121 -5.281 -1.727 -3.681 -2.165 -0.696 -2.444 -4.951 -5.009 -3.735 -2.716 -2.38  -1.271 -0.168  4.19  -4.325 -0.911 -5.389 -4.075 -5.117 -5.596  3.229
 -4.993 -4.439  2.546 -5.281  0.004 -2.911 -3.97  -5.13  -2.192  4.351  0.927 -4.798  0.963 -5.199 -2.332 -5.297 -2.562 -0.207 -5.347 -4.394  2.006 -5.048 -5.35   1.043 -4.915 -4.54   3.512  1.662
  2.19

In [5]:
# Average the per-model predictions into a single ensemble prediction.
per_model_predictions = []
for folder in stuff:
    if "." in folder:
        continue
    predictions_path = os.path.join(folder, "predictions.npy")
    # Entries without saved predictions are not model folders; skip them
    # rather than aborting the whole cell.
    if not os.path.isfile(predictions_path):
        continue
    per_model_predictions.append(np.load(predictions_path))

# Fail with a clear message instead of averaging an empty list into NaN.
if not per_model_predictions:
    raise FileNotFoundError("no predictions.npy files found to average")

# Element-wise mean across models (axis 0 indexes the models).
all_predictions = np.mean(per_model_predictions, axis=0)
print(all_predictions.shape)

# Save to a CSV file with a single "pred" column.
df = pd.DataFrame(all_predictions, columns=["pred"])
df.to_csv("predictions.csv", index=False)

(260,)
