In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os
import gc
import torch.cuda as cuda
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=256'


In [3]:
if torch.cuda.is_available():
    print("CUDA is available. GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")

CUDA is available. GPU: NVIDIA GeForce GTX 1650 SUPER


In [4]:
# Functions for training and tokenizing
def all_texts(data):
    for item in data:
        nl = item.get('nl', '')
        code = item.get('code', '')
        yield nl
        yield code

In [5]:
def train_tokenizer_on_concode_data(train_data, vocab_size=30000):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<s>", "<pad>", "</s>", "<unk>"])
    tokenizer.train_from_iterator(all_texts(train_data), trainer)
    return tokenizer

In [6]:
def load_concode_dataset(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            example = json.loads(line.strip())
            data.append(example)
    return data

In [7]:
def tokenize_concode_dataset(data, tokenizer):
    tokenized_data = []
    for item in data:
        if 'code_tokens' in item and 'docstring_tokens' in item:
            tokenized_code = tokenizer.encode(' '.join(item['code_tokens']))
            tokenized_docstring = tokenizer.encode(' '.join(item['docstring_tokens']))
            tokenized_data.append((tokenized_code, tokenized_docstring))
    return tokenized_data


In [8]:
def pad_collate(batch):
    def pad_sequences(sequences, max_len):
        padded_seqs = torch.zeros(len(sequences), max_len, dtype=torch.long)
        for i, seq in enumerate(sequences):
            padded_seqs[i, :len(seq)] = seq
        return padded_seqs

    code_tokens = [item['code_tokens'] for item in batch]
    nl_tokens = [item['nl_tokens'] for item in batch]

    max_code_len = max([len(t) for t in code_tokens])
    max_nl_len = max([len(t) for t in nl_tokens])

    padded_code_tokens = pad_sequences(code_tokens, max_code_len)
    padded_nl_tokens = pad_sequences(nl_tokens, max_nl_len)

    return {'code_tokens': padded_code_tokens, 'nl_tokens': padded_nl_tokens}


In [9]:
class ConcodeDataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        item = self.tokenized_data[idx]
        code_tokens, nl_tokens = item

        # Convert tokenizers.Encoding objects to PyTorch tensors
        code_tokens_tensor = torch.tensor(code_tokens.ids)
        nl_tokens_tensor = torch.tensor(nl_tokens.ids)

        sample = {'code_tokens': code_tokens_tensor, 'nl_tokens': nl_tokens_tensor}
        return sample

# Model classes
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(x)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_dim = output_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell, trg):  # Add trg as input argument
        x = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(x, (hidden, cell))  # Pass hidden and cell states to LSTM
        output = self.fc(outputs.squeeze(1))
        return output, hidden, cell, trg  # Return trg as well


class EncoderDecoderModel(nn.Module):
    def __init__(self, encoder, decoder):
        super(EncoderDecoderModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        hidden, cell = self.encoder(src)  # Unpack hidden and cell here
        batch_size = src.size(0)  # Add this line to get the batch size
        decoder_input = torch.tensor([SOS_IDX] * batch_size, device=src.device).unsqueeze(1)  # Modify this line to match the batch size
        decoder_outputs = torch.zeros(trg.shape[0], trg.shape[1], self.decoder.output_dim, device=src.device)

        for t in range(trg.shape[1]):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell) # Use cell here
            decoder_outputs[:, t] = output
            decoder_input = trg[:, t].unsqueeze(1)

        return decoder_outputs


In [23]:
def train_and_save_model(model, dataloader, device, num_epochs=1, accumulation_steps=4):
    model.to(device)
    if torch.cuda.device_count() > 1:
        print("Using ", torch.cuda.device_count(), " GPUs")
        model = nn.DataParallel(model)
    try:
        output = model(nl_tokens.to(device))  # Pass the nl_tokens tensor here
    except RuntimeError as e:
        if "out of memory" in str(e):
            print(f"Out of memory error occurred on device {device}. Trying another device...")
            for i in range(1, torch.cuda.device_count()):
                try:
                    print(f"Trying device {i}...")
                    model.to(f"cuda:{i}")
                    output = model(nl_tokens.to(f"cuda:{i}"))  # Pass the nl_tokens tensor here
                    break
                except RuntimeError as e:
                    if "out of memory" in str(e):
                        print(f"Out of memory error occurred on device {i}. Trying next device...")
                    else:
                        raise e
        else:
            raise e

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scaler = GradScaler()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)
    print_loss_total = 0  # initialize print_loss_total to 0
    loss_plot = []  # initialize loss_plot to an empty list
    print_every = 1

    torch.cuda.empty_cache()
    gc.collect()

    for epoch in range(num_epochs):
        running_loss = 0.0
        optimizer.zero_grad()

        for i, data in enumerate(dataloader, 0):
            nl_tokens = data['nl_tokens'].to(device)
            code_tokens = data['code_tokens'].to(device)

            try:
                outputs = model(nl_tokens)
                loss = criterion(outputs.reshape(-1, outputs.size(-1)), code_tokens[:, 1:].reshape(-1))
            except RuntimeError as e:
                print(f"Caught CUDA error: {e}")
                # Print memory usage summary
                print(cuda.memory_summary())
                # Clear the GPU memory and garbage collect
                torch.cuda.empty_cache()
                gc.collect()
                continue

            with autocast():
                outputs = model(nl_tokens)
                loss = criterion(outputs.reshape(-1, outputs.size(-1)), code_tokens[:, 1:].reshape(-1))

            scaler.scale(loss).backward()

            if (i + 1) % accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            running_loss += loss.item()

            if (i + 1) % print_every == 0:
                print(f'Epoch {epoch + 1}, Iteration {i + 1}, Running Loss: {running_loss / (i + 1)}')

        epoch_loss = running_loss / len(dataloader)
        scheduler.step(epoch_loss)
        print(f'Epoch {epoch + 1}, Loss: {epoch_loss}')

    print("Finished training")
    torch.save(model.state_dict(), 'trained_model.pth')


In [24]:
def main():
    train_file = "./data1/python_train_13.jsonl"
    train_data = load_concode_dataset(train_file)
    print(f"Loaded {len(train_data)} examples from {train_file}")

    vocab_size = 30000
    tokenizer = train_tokenizer_on_concode_data(train_data, vocab_size)
    print(f"Trained tokenizer with {len(train_data)} examples")
    tokenizer.save("concode_tokenizer3.json")

    tokenized_data = tokenize_concode_dataset(train_data, tokenizer)
    print(f"Tokenized {len(tokenized_data)} examples")

    tokenizer = Tokenizer.from_file("concode_tokenizer3.json")
    print("Loaded tokenizer")

    global SOS_IDX  # Declare SOS_IDX as a global variable
    SOS_IDX = tokenizer.token_to_id('<s>')

    dataset = ConcodeDataset(tokenized_data)
    print(f"Created dataset with {len(dataset)} examples")
    concode_dataset = ConcodeDataset(tokenized_data)
    dataloader = DataLoader(concode_dataset, batch_size=4, shuffle=True, num_workers=0, collate_fn=pad_collate)

    print(len(concode_dataset))


    print(f"Created dataloader with {len(dataloader)} batches")

    input_size = vocab_size
    hidden_size = 64
    output_size = vocab_size
    num_layers = 1

    encoder = Encoder(input_size, hidden_size, num_layers)
    decoder = Decoder(output_size, hidden_size, num_layers)
    model = EncoderDecoderModel(encoder, decoder)

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print("the len of the dataset is: " + str(len(concode_dataset)))
        nl_tokens = concode_dataset[0][0]  # Get the input tokens from the first example in the dataset
        train_and_save_model(model, dataloader, device, nl_tokens=nl_tokens, num_epochs=1, accumulation_steps=8)
        for i in range(5):
            input_tokens, output_tokens = concode_dataset[i]
            print(f"Example {i}: input length={len(input_tokens)}, output length={len(output_tokens)}")

    except torch.cuda.CudaError as e:
        print(f"Caught CUDA error: {e}")
        # Clear the GPU memory and garbage collect
        torch.cuda.empty_cache()
        gc.collect()
        # Print memory usage summary
        print(torch.cuda.memory_summary())




if __name__ == '__main__':
    try:
        main()
    except RuntimeError as e:
        print(f"Caught CUDA error: {e}")
        # Clear the GPU memory and garbage collect
        torch.cuda.empty_cache()
        gc.collect()
        # Print memory usage summary
        print(torch.cuda.memory_summary()) 

Loaded 22178 examples from ./data1/python_train_13.jsonl
Trained tokenizer with 22178 examples
Tokenized 22178 examples
Loaded tokenizer
Created dataset with 22178 examples
22178
Created dataloader with 5545 batches
the len of the dataset is: 22178


KeyError: 0