In [2]:
import json
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers


# --- TEST ---

In [14]:
from tokenizers import Tokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

# Load the tokenizer
tokenizer = Tokenizer.from_file("concode_tokenizer.json")

# Example text
text = "this is a sample code"

# Tokenize the text
tokenized_text = tokenizer.encode(text)
print(tokenized_text)

Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


# --- TEST ---

In [3]:
def train_tokenizer_on_concode_data(train_data, vocab_size=30000):
    # Initialize a BPE tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Set the pre-tokenizer and decoder
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()

    # Set the trainer and train the tokenizer
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<s>", "<pad>", "</s>", "<unk>"])
    tokenizer.train_from_iterator(all_texts(train_data), trainer)

    return tokenizer

def all_texts(data):
    for item in data:
        nl = item.get('nl', '')
        code = item.get('code', '')
        yield nl
        yield code

# Load the CONCODE dataset
def load_concode_dataset(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            example = json.loads(line.strip())
            data.append(example)
    return data

In [4]:

def tokenize_concode_dataset(data, tokenizer):
    tokenized_data = []
    for item in data:
        if 'code_tokens' in item and 'docstring_tokens' in item:
            tokenized_code = tokenizer.encode(' '.join(item['code_tokens']))
            tokenized_docstring = tokenizer.encode(' '.join(item['docstring_tokens']))
            tokenized_data.append((tokenized_code, tokenized_docstring))
    print(f"Tokenized {len(tokenized_data)} examples")
    return tokenized_data


train_file = "./data1/python_train_0.jsonl"
train_data = load_concode_dataset(train_file)

print(f"Loaded {len(train_data)} examples from {train_file}")

vocab_size = 30000
tokenizer = train_tokenizer_on_concode_data(train_data, vocab_size)

print(f"Trained tokenizer with {len(train_data)} examples")

# Save the tokenizer
tokenizer.save("concode_tokenizer.json")

# Tokenize the dataset
tokenized_data = tokenize_concode_dataset(train_data, tokenizer)
print(f"Tokenized {len(tokenized_data)} examples")

if len(tokenized_data) >= 5:
    for i in range(5):
        print(tokenized_data[i])
else:
    print(f"tokenized_data has only {len(tokenized_data)} elements.")

# Load the tokenizer
tokenizer = Tokenizer.from_file("concode_tokenizer.json")
print("Loaded tokenizer")


Loaded 30000 examples from ./data1/python_train_0.jsonl
Trained tokenizer with 30000 examples
Tokenized 30000 examples
Tokenized 30000 examples
(Encoding(num_tokens=451, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]))
(Encoding(num_tokens=419, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]))
(Encoding(num_tokens=267, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]))
(Encoding(num_tokens=35, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(n

In [5]:
# Create a dataset and dataloader for the tokenized data
class ConcodeDataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        return self.tokenized_data[idx]

# Create a dataset and dataloader for the tokenized data
dataset = ConcodeDataset(tokenized_data)
print(f"Created dataset with {len(dataset)} examples")
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=1)
print(f"Created dataloader with {len(dataloader)} batches")



# Save the tokenizer
tokenizer.save("concode_tokenizer.json")


# Load the tokenizer
tokenizer = Tokenizer.from_file("concode_tokenizer.json")
tokenized_data = tokenize_concode_dataset(train_data, tokenizer)
SOS_IDX = tokenizer.token_to_id('<s>')

Created dataset with 30000 examples
Created dataloader with 938 batches
Tokenized 30000 examples


In [6]:

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(x)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)
        x = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(x, (hidden, cell))
        predictions = self.fc(outputs.squeeze(1))
        return predictions, hidden, cell

class EncoderDecoderModel(nn.Module):
    def __init__(self, encoder, decoder):
        super(EncoderDecoderModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        # Get the encoder outputs and hidden state
        encoder_outputs, hidden = self.encoder(src)

        # Initialize the decoder input
        decoder_input = torch.tensor([[SOS_IDX]*trg.shape[1]])

        # Move everything to device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        encoder_outputs = encoder_outputs.to(device)
        hidden = hidden.to(device)
        decoder_input = decoder_input.to(device)

        # Initialize the decoder outputs tensor
        decoder_outputs = torch.zeros(trg.shape[0], trg.shape[1], self.decoder.output_dim).to(device)

        # Decode one step at a time
        for t in range(trg.shape[0]):
            output, hidden = self.decoder(decoder_input, hidden, encoder_outputs)
            decoder_outputs[t] = output
            decoder_input = trg[t].unsqueeze(0)

        return decoder_outputs

In [7]:
# Use the Seq2SeqModel class and update it with appropriate arguments for encoder and decoder
input_size = vocab_size
hidden_size = 256
output_size = vocab_size
num_layers = 1

encoder = Encoder(input_size, hidden_size, num_layers)
decoder = Decoder(output_size, hidden_size, num_layers)
model = EncoderDecoderModel(encoder, decoder)

# Train and save the model
def train_and_save_model(model, dataloader, num_epochs=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print_every = 1  # Choose a number that suits your dataset size

    for epoch in range(num_epochs):
        running_loss = 0.0

        for i, data in enumerate(dataloader, 0):
            nl_tokens, code_tokens = data
            nl_tokens = nl_tokens.to(device)
            code_tokens = code_tokens.to(device)

            optimizer.zero_grad()
            outputs = model(nl_tokens, code_tokens[:, :-1])
            loss = criterion(outputs.view(-1, outputs.size(-1)), code_tokens[:, 1:].view(-1))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Print the running loss every 'print_every' iterations
            if (i + 1) % print_every == 0:
                print(f'Epoch {epoch + 1}, Iteration {i + 1}, Running Loss: {running_loss / (i + 1)}')

        print(f'Epoch {epoch + 1}, Loss: {running_loss / len(dataloader)}')

    print("Finished training")
    torch.save(model.state_dict(), 'trained_model.pth')

train_and_save_model(model, dataloader, num_epochs=1)