In [None]:
import pandas as pd

# Load the TSV file
file_path = "/content/spoc-train-train.tsv"  # Replace with your actual file path
df = pd.read_csv(file_path, sep='\t')

# Display the first few rows
len(df)

246086

In [None]:
import os
import csv
import pandas as pd

# Constants used in processing
PRE, ENDL, END = "[PRE]", "[ENDL]", "[END]"

input_file = "/content/spoc-train-train.tsv"
output_file = "/content/processed_dataset.csv"

# Ensure the output directory exists
output_dir = os.path.dirname(output_file)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Parameters for processing
chunk_size = 1000  # Number of rows per chunk

# Open the output file and use csv.writer
with open(output_file, 'w', encoding='utf-8', newline='') as f_out:
    writer = csv.writer(f_out)
    # Write CSV header
    writer.writerow(["X", "Y"])

    # Process the TSV file in chunks
    worker_pending_y = {}  # Store pending Y values for each worker

    for chunk in pd.read_csv(input_file, sep='\t', chunksize=chunk_size):
        for index, row in chunk.iterrows():
            worker_id = row["workerid"]
            pseudo_code = row["text"]
            code = row["code"]

            # Ensure pseudo_code and code are strings
            pseudo_code = str(pseudo_code).strip()
            code = str(code).strip()

            # If pseudo_code (X) is NaN, store Y with END token and continue
            if pd.isna(row["text"]):
                worker_pending_y[worker_id] = f"{code} {END}"
                continue

            # Retrieve and merge any pending Y if the worker ID matches
            if worker_id in worker_pending_y:
                merged_y = f"{worker_pending_y[worker_id]} {code}".strip()
                del worker_pending_y[worker_id]  # Clear stored Y after merging
            else:
                merged_y = code  # Normal case without merging

            # Format X and Y with [PRE] and [ENDL]
            x_input = f"{PRE} {pseudo_code} {ENDL}"
            y_output = f"{PRE} {merged_y} {ENDL}"

            # Write the processed row to the CSV file
            writer.writerow([x_input, y_output])

print("Processing complete. Saved as processed_dataset.csv.")


Processing complete. Saved as processed_dataset.csv.


In [None]:
file_path = "/content/processed_dataset.csv"  # Replace with your actual file path
df = pd.read_csv(file_path, sep=',')

# Display the first few rows
df

Unnamed: 0,X,Y
0,[PRE] create string s [ENDL],[PRE] int main() { [END] string s; [ENDL]
1,"[PRE] create integers x1, y1, x2, y2 [ENDL]","[PRE] int x1, y1, x2, y2; [ENDL]"
2,[PRE] read s [ENDL],[PRE] cin >> s; [ENDL]
3,[PRE] set x1 to s[0] - 96 [ENDL],[PRE] x1 = s[0] - 96; [ENDL]
4,[PRE] set y1 to s[1] - '0' [ENDL],[PRE] y1 = s[1] - '0'; [ENDL]
...,...,...
181857,[PRE] for i = 0 to l exclusive [ENDL],[PRE] for (int i = 0; i < l; i++) { [ENDL]
181858,[PRE] if not result of run check(s[i]) and s[i...,[PRE] if (!check(s[i]) && s[i] != 'n') { [ENDL]
181859,[PRE] if i + 1 is l or result of run check(s[i...,[PRE] if (i + 1 == l || !check(s[i + 1])) { [E...
181860,"[PRE] print ""NO"" and newline [ENDL]","[PRE] cout << ""NO"" << endl; [ENDL]"


In [None]:
from google.colab import files
files.download('/content/processed_dataset.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import torch
from collections import Counter

# Load the dataset
df = pd.read_csv('/content/processed_dataset.csv', delimiter=',')

# Create vocabulary function
def create_vocab(texts, min_freq=1):
    vocab = Counter()
    for text in texts:
        tokens = text.lower().split()  # Simple whitespace tokenization
        vocab.update(tokens)
    # Map tokens to indices (starting at 2) and add special tokens
    vocab = {token: idx+2 for idx, (token, freq) in enumerate(vocab.items()) if freq >= min_freq}
    vocab['[PAD]'] = 0  # Padding token
    vocab['[UNK]'] = 1  # Unknown token
    return vocab

# Tokenizer function: tokenize text using the vocabulary and pad/truncate to max_length
def tokenize(text, vocab, max_length):
    tokens = text.lower().split()
    token_ids = [vocab.get(token, vocab['[UNK]']) for token in tokens]
    # Pad if too short, or truncate if too long
    if len(token_ids) < max_length:
        token_ids += [vocab['[PAD]']] * (max_length - len(token_ids))
    else:
        token_ids = token_ids[:max_length]
    return token_ids

# Creating vocabulary from both 'X' and 'Y' columns
all_texts = pd.concat([df['X'], df['Y']])
vocab = create_vocab(all_texts)

# Determine maximum token length in X and Y columns
max_length_x = df['X'].apply(lambda x: len(x.lower().split())).max()
max_length_y = df['Y'].apply(lambda x: len(x.lower().split())).max()
max_length = max(max_length_x, max_length_y)

print("Max token length for X:", max_length_x)
print("Max token length for Y:", max_length_y)
print("Using max token length:", max_length)

# Tokenize data using the computed maximum length
input_ids, target_ids = [], []
for idx, row in df.iterrows():
    x_input = row['X']
    y_output = row['Y']
    x_tokenized = tokenize(x_input, vocab, max_length)
    y_tokenized = tokenize(y_output, vocab, max_length)
    input_ids.append(x_tokenized)
    target_ids.append(y_tokenized)

# Convert lists to PyTorch tensors
input_tensor = torch.tensor(input_ids)
target_tensor = torch.tensor(target_ids)

print("Input tensor shape:", input_tensor.shape)
print("Target tensor shape:", target_tensor.shape)


Max token length for X: 62
Max token length for Y: 39
Using max token length: 62
Input tensor shape: torch.Size([181862, 62])
Target tensor shape: torch.Size([181862, 62])


In [None]:
idx_to_word = {idx: word for word, idx in vocab.items()}
print(idx_to_word)
print(vocab)

{2: '[pre]', 3: 'create', 4: 'string', 5: 's', 6: '[endl]', 7: 'integers', 8: 'x1,', 9: 'y1,', 10: 'x2,', 11: 'y2', 12: 'read', 13: 'set', 14: 'x1', 15: 'to', 16: 's[0]', 17: '-', 18: '96', 19: 'y1', 20: 's[1]', 21: "'0'", 22: 'x2', 23: 'print', 24: 'maximum', 25: 'of', 26: 'absolute', 27: 'value', 28: 'and', 29: 'y2,', 30: 'newline', 31: 'while', 32: 'is', 33: 'not', 34: 'or', 35: 'if', 36: 'greater', 37: 'than', 38: '"r"', 39: 'increment', 40: 'less', 41: '"l"', 42: 'decrement', 43: '"d"', 44: '"u"', 45: '"\\n"', 46: '=', 47: '0', 48: '(x1', 49: 'x2)', 50: '(y1', 51: 'y2)', 52: '!=', 53: 'true,', 54: 'do', 55: 'the', 56: 'following', 57: ',', 58: 'r', 59: 'l', 60: 'd', 61: 'u', 62: 'a', 63: 'new', 64: 'line', 65: 'character', 66: 'array', 67: 'c1', 68: 'size', 69: '2', 70: 'c2', 71: 'c1[0]', 72: 'c1[1]', 73: 'c2[0]', 74: 'c2[1]', 75: 'vector', 76: 'ans', 77: 'add', 78: 'element', 79: '"ru"', 80: 'end', 81: 'else', 82: '"rd"', 83: '"ld"', 84: '"lu"', 85: 'for', 86: 'i', 87: 'exclusive

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR

import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=62):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

# Transformer Model
class TransformerSeq2Seq(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=8,
                 num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=512, dropout=0.1, max_len=5000):
        super().__init__()
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout, max_len)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers,
                                          num_decoder_layers, dim_feedforward, dropout, batch_first=True)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.d_model = d_model

    def generate_square_subsequent_mask(self, sz):
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

    def encode(self, src):
        src_emb = self.src_tok_emb(src) * math.sqrt(self.d_model)
        src_emb = self.positional_encoding(src_emb)
        memory = self.transformer.encoder(src_emb)
        return memory

    def decode(self, tgt, memory):
        tgt_emb = self.tgt_tok_emb(tgt) * math.sqrt(self.d_model)
        tgt_emb = self.positional_encoding(tgt_emb)
        tgt_mask = self.generate_square_subsequent_mask(tgt_emb.size(1)).to(tgt.device)
        output = self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)
        return self.fc_out(output)

    def forward(self, src, tgt):
        memory = self.encode(src)
        return self.decode(tgt, memory)

# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, input_tensor, target_tensor):
        self.input_tensor = input_tensor
        self.target_tensor = target_tensor

    def __len__(self):
        return len(self.input_tensor)

    def __getitem__(self, idx):
        return self.input_tensor[idx], self.target_tensor[idx]

# Model Training
def train_model(model, dataloader, optimizer, criterion, scheduler, device, epochs=5):
    model.to(device)
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        i=0
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            i=i+1
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            optimizer.zero_grad()
            output = model(src, tgt_input)
            loss = criterion(output.reshape(-1, model.fc_out.out_features), tgt_output.reshape(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            print("batch no ", i, " loss = ", loss.item())

        # Adjust learning rate
        scheduler.step()

        print(f"Epoch {epoch+1}: Loss = {epoch_loss/len(dataloader):.4f}, LR = {scheduler.get_last_lr()[0]:.6f}")

    print("Training Complete")


# Example Setup
src_vocab_size = len(vocab)
tgt_vocab_size = len(vocab)
# input_tensor = torch.randint(0, src_vocab_size, (1000, 20))
# target_tensor = torch.randint(0, tgt_vocab_size, (1000, 20))
dataset = CustomDataset(input_tensor, target_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

model = TransformerSeq2Seq(src_vocab_size, tgt_vocab_size)
optimizer = optim.Adam(model.parameters(), lr=5e-5)
scheduler = StepLR(optimizer, step_size=5, gamma=0.5)  # Reduce LR by 0.5 every 5 epochs
criterion = nn.CrossEntropyLoss()

train_model(model, dataloader, optimizer, criterion, scheduler, torch.device("cuda" if torch.cuda.is_available() else "cpu"))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
batch no  687  loss =  0.14355915784835815
batch no  688  loss =  0.12672492861747742
batch no  689  loss =  0.1489865779876709
batch no  690  loss =  0.12732818722724915
batch no  691  loss =  0.17828527092933655
batch no  692  loss =  0.12442640215158463
batch no  693  loss =  0.10265852510929108
batch no  694  loss =  0.14881666004657745
batch no  695  loss =  0.11086449027061462
batch no  696  loss =  0.1524227261543274
batch no  697  loss =  0.0828372985124588
batch no  698  loss =  0.10775657743215561
batch no  699  loss =  0.1357451230287552
batch no  700  loss =  0.09992016106843948
batch no  701  loss =  0.12431278824806213
batch no  702  loss =  0.15244624018669128
batch no  703  loss =  0.16401220858097076
batch no  704  loss =  0.1541161686182022
batch no  705  loss =  0.11550414562225342
batch no  706  loss =  0.2157675325870514
batch no  707  loss =  0.14227992296218872
batch no  708  loss =  0.1453436315059

In [8]:
import torch

def inference(model, input_sentence, vocab, idx_to_word, device, max_length):
    model.eval()  # Set model to evaluation mode

    # Tokenize input and convert to tensor
    input_tokens = tokenize(input_sentence, vocab, max_length)
    input_tensor = torch.tensor([vocab['[pre]']] + input_tokens + [vocab['[endl]']]).unsqueeze(0).to(device)  # Add batch dimension

    with torch.no_grad():
        memory = model.encode(input_tensor)  # Encode input

    # Start with the [pre] token
    tgt_tokens = [vocab['[pre]']]

    for _ in range(max_length):  # Generate up to max_length tokens
        tgt_tensor = torch.tensor(tgt_tokens).unsqueeze(0).to(device)  # Add batch dimension

        with torch.no_grad():
            output = model.decode(tgt_tensor, memory)  # Decode step

        next_token = torch.argmax(output[:, -1, :], dim=-1).item()  # Get next token ID

        if next_token == vocab['[endl]']:  # Stop if [endl] token is generated
            break

        tgt_tokens.append(next_token)

    # Convert token IDs back to words
    output_sentence = ' '.join(idx_to_word.get(idx, '[UNK]') for idx in tgt_tokens[1:])  # Remove [pre]
    return output_sentence


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example input sentence
input_sentence = "print \"YES\" and newline"

output_sentence = inference(model, input_sentence, vocab, idx_to_word, device, max_length)
print("Generated Output:", output_sentence)


Generated Output: cout << "yes" << endl;


In [12]:
import torch

# Assume model is your trained PyTorch model
torch.save(model.state_dict(), 'model.pth')


In [14]:
from google.colab import files
files.download('model.pth')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
import json

with open('vocab.json', 'w') as f:
    json.dump(vocab, f)


In [16]:
with open('vocab.json', 'r') as f:
    vocab = json.load(f)


In [17]:
print(vocab)

{'[pre]': 2, 'create': 3, 'string': 4, 's': 5, '[endl]': 6, 'integers': 7, 'x1,': 8, 'y1,': 9, 'x2,': 10, 'y2': 11, 'read': 12, 'set': 13, 'x1': 14, 'to': 15, 's[0]': 16, '-': 17, '96': 18, 'y1': 19, 's[1]': 20, "'0'": 21, 'x2': 22, 'print': 23, 'maximum': 24, 'of': 25, 'absolute': 26, 'value': 27, 'and': 28, 'y2,': 29, 'newline': 30, 'while': 31, 'is': 32, 'not': 33, 'or': 34, 'if': 35, 'greater': 36, 'than': 37, '"r"': 38, 'increment': 39, 'less': 40, '"l"': 41, 'decrement': 42, '"d"': 43, '"u"': 44, '"\\n"': 45, '=': 46, '0': 47, '(x1': 48, 'x2)': 49, '(y1': 50, 'y2)': 51, '!=': 52, 'true,': 53, 'do': 54, 'the': 55, 'following': 56, ',': 57, 'r': 58, 'l': 59, 'd': 60, 'u': 61, 'a': 62, 'new': 63, 'line': 64, 'character': 65, 'array': 66, 'c1': 67, 'size': 68, '2': 69, 'c2': 70, 'c1[0]': 71, 'c1[1]': 72, 'c2[0]': 73, 'c2[1]': 74, 'vector': 75, 'ans': 76, 'add': 77, 'element': 78, '"ru"': 79, 'end': 80, 'else': 81, '"rd"': 82, '"ld"': 83, '"lu"': 84, 'for': 85, 'i': 86, 'exclusive,': 

In [28]:
import torch

def inference_line_by_line(model, input_pseudo_code, vocab, idx_to_word, device, max_length):
    model.eval()  # Set model to evaluation mode
    generated_lines = []

    for line in input_pseudo_code.strip().split("\n"):  # Process each line separately
        input_tokens = tokenize(line, vocab, max_length)
        input_tensor = torch.tensor([vocab['[pre]']] + input_tokens).unsqueeze(0).to(device)

        with torch.no_grad():
            memory = model.encode(input_tensor)  # Encode input

        # Start with the [pre] token
        tgt_tokens = [vocab['[pre]']]
        output_sentence = []

        for _ in range(max_length):
            tgt_tensor = torch.tensor(tgt_tokens).unsqueeze(0).to(device)

            with torch.no_grad():
                output = model.decode(tgt_tensor, memory)  # Decode step

            next_token = torch.argmax(output[:, -1, :], dim=-1).item()
            tgt_tokens.append(next_token)

            word = idx_to_word.get(next_token, '[UNK]')

            if word == "[endl]":
                break  # Stop generation immediately if [endl] appears

            # Replace [end] token with a newline
            if word == "[end]":
                output_sentence.append("\n")
            else:
                output_sentence.append(word)

        # Convert list to string and append
        generated_lines.append(" ".join(output_sentence))

    return "\n".join(generated_lines)  # Ensure proper formatting


# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example pseudo-code input
pseudo_code = """\
BEGIN
INPUT a, b
SUM = a + b
IF SUM > 10 THEN
    PRINT "Large Sum"
ELSE
    PRINT "Small Sum"
END IF

"""

# Run inference
output_result = inference_line_by_line(model, pseudo_code, vocab, idx_to_word, device, max_length)

print("Generated Output:")
print(output_result)


Generated Output:
int main() { 
 int a, b, c, d;
cin >> a >> b;
sum += a;
if (sum > 10) {
cout << "up" << endl;
else
cout << "up" << endl;
if (flag) {
;
