In [None]:
# We will first load in the standard Kryptonite-n dataset for a 
# given n 
import tempfile
import os
import numpy as np
import torch
from tqdm import tqdm
from torch import nn, optim
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2Model

# Which Kryptonite-n variant to process; it selects the .npy files read below.
selected_n_value = 9

# Feature matrix and label vector for the chosen n. Labels are cast to
# integers as part of the load.
X = np.load('finaldata/kryptonite-%s-X.npy' % (selected_n_value))
y = np.load('finaldata/kryptonite-%s-y.npy' % (selected_n_value)).astype(int)

# 1. Split the data: 70% train, then the held-out 30% is halved into
#    validation and test (15% each). The fixed seed keeps the split reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# 2. Pretrained GPT-2 model and its matching tokenizer, used by the encoding
#    helpers further down.
gpt2_model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Function to encode input vectors using GPT-2 and write results to temporary files
def encode_and_save(X, batch_size=300, prefix='encoded'):
    """Encode every vector in ``X`` with GPT-2 and spill the results to temp files.

    Each vector is rendered into a fixed prompt sentence, tokenized, run
    through the module-level ``gpt2_model``, and reduced to a single embedding
    by averaging ``last_hidden_state`` over dim 1. Embeddings are stacked per
    batch and written to one ``.pt`` temp file per batch so that only one
    batch of embeddings is held in memory at a time.

    Args:
        X: Sequence of input vectors (sliceable, supports ``len``).
        batch_size: Number of vectors whose embeddings go into one temp file.
        prefix: Filename prefix for the temp files.

    Returns:
        List of temp-file paths, in input order. The files are created with
        ``delete=False``; the caller is responsible for removing them
        (``merge_temp_files`` does so).
    """
    temp_files = []

    # Inference only: without no_grad every forward pass records an autograd
    # graph that is kept alive by the collected encodings, inflating memory
    # use and leaving requires_grad set on the saved tensors.
    with torch.no_grad():
        for i in range(0, len(X), batch_size):
            batch = X[i:i + batch_size]
            encodings = []

            # Vectors are encoded one at a time so no padding tokens enter
            # the mean over the sequence dimension.
            for vector in tqdm(batch):
                # The 1-tuple keeps the formatting correct even when a row is
                # itself a tuple (a bare tuple would be unpacked by %).
                input_sentence = "Classify the following vector from a binary classification task: %s" % (vector,)
                inputs = tokenizer(input_sentence, return_tensors='pt')
                outputs = gpt2_model(**inputs)
                encoding = outputs.last_hidden_state.mean(dim=1)
                encodings.append(encoding)

            # Stack the encodings for the current batch.
            # NOTE(review): each encoding presumably has shape (1, hidden), so
            # the stacked tensor is (batch, 1, hidden) -- confirm downstream.
            encodings_tensor = torch.stack(encodings)

            # Write through the already-open handle instead of reopening the
            # file by name, which is not possible on Windows while the
            # NamedTemporaryFile is still open.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pt', prefix=prefix) as tmp_file:
                temp_files.append(tmp_file.name)
                torch.save(encodings_tensor, tmp_file)

            # Free memory before the next batch.
            del encodings_tensor
            torch.cuda.empty_cache()  # If using a GPU
    print("Finished with one")
    return temp_files

# Merge multiple temp files into a single tensor
def merge_temp_files(temp_files):
    """Load the tensors stored in ``temp_files`` and concatenate them.

    Each file is deleted right after it has been loaded successfully, so a
    failure part-way through leaves the not-yet-loaded files on disk.

    Args:
        temp_files: Paths of files written with ``torch.save``, in the order
            their contents should appear in the result.

    Returns:
        The loaded tensors concatenated along dim 0, or an empty tensor when
        ``temp_files`` is empty.
    """
    # torch.cat raises on an empty list; an empty dataset split (no files
    # written) should yield an empty tensor rather than crash.
    if not temp_files:
        return torch.empty(0)

    all_encodings = []
    for file in temp_files:
        all_encodings.append(torch.load(file))
        os.remove(file)  # Clean up the temp file once its content is in memory

    # Concatenate all the loaded tensors into one
    return torch.cat(all_encodings)

# Run the GPT-2 encoding for each split; every call returns the list of temp
# files holding that split's batches. Train is encoded in full, while
# validation and test are limited to their first 1000 rows.
train_temp_files = encode_and_save(
    X_train,
    prefix='train_encoded',
)
val_temp_files = encode_and_save(
    X_val[:1000],
    prefix='val_encoded',
)
test_temp_files = encode_and_save(
    X_test[:1000],
    prefix='test_encoded',
)

# Collapse each split's temp files into a single tensor (the temp files are
# removed as they are loaded).
X_train_encoded = merge_temp_files(train_temp_files)
X_val_encoded = merge_temp_files(val_temp_files)
X_test_encoded = merge_temp_files(test_temp_files)



In [None]:
# Persist the GPT-2 encoded features for each split as .npy arrays.
np.save('finaldata/kryptonite-%s-X-train-GPT2.npy'%(selected_n_value), X_train_encoded.detach().numpy())
np.save('finaldata/kryptonite-%s-X-valid-GPT2.npy'%(selected_n_value), X_val_encoded.detach().numpy())
np.save('finaldata/kryptonite-%s-X-test-GPT2.npy'%(selected_n_value), X_test_encoded.detach().numpy())

# Persist the labels, trimmed to the number of rows that were actually
# encoded. Only a leading slice of the validation/test features is encoded,
# so saving the full label vectors would leave X and y with different
# lengths in the saved files.
np.save('finaldata/kryptonite-%s-y-train-GPT2.npy'%(selected_n_value), y_train[:len(X_train_encoded)])
np.save('finaldata/kryptonite-%s-y-valid-GPT2.npy'%(selected_n_value), y_val[:len(X_val_encoded)])
np.save('finaldata/kryptonite-%s-y-test-GPT2.npy'%(selected_n_value), y_test[:len(X_test_encoded)])
