In [22]:
!pip uninstall -y torch torchtext torchvision
!pip install torch==2.0.1 torchvision==0.15.2 torchtext==0.15.2


Found existing installation: torch 2.0.1
Uninstalling torch-2.0.1:
  Successfully uninstalled torch-2.0.1
Found existing installation: torchtext 0.15.2
Uninstalling torchtext-0.15.2:
  Successfully uninstalled torchtext-0.15.2
Found existing installation: torchvision 0.15.2
Uninstalling torchvision-0.15.2:
  Successfully uninstalled torchvision-0.15.2
Collecting torch==2.0.1
  Using cached torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchvision==0.15.2
  Using cached torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl.metadata (11 kB)
Collecting torchtext==0.15.2
  Using cached torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Using cached torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
Using cached torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
Using cached torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
Installing collected packages: torch, torchvision, torchtext
[31mERROR: pip's dependency resolve

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import time
import torch.utils.data as data
import os



In [24]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [25]:
# Load dataset
file_path = "/kaggle/input/spoc-train/spoc-train.tsv"  # Update this if needed
df = pd.read_csv(file_path, sep='\t')


In [26]:
# Remove NaN values
df = df.dropna(subset=['text', 'code'])

In [27]:
# Tokenizer (Basic)
def tokenize(text):
    return text.lower().split()

In [28]:
# Build vocabulary
class Vocab:
    def __init__(self, texts, min_freq=1):
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = ["<PAD>", "<SOS>", "<EOS", "<UNK>"]
        self.freqs = {}
        
        for text in texts:
            for word in tokenize(text):
                self.freqs[word] = self.freqs.get(word, 0) + 1
                
                if word not in self.word2idx and self.freqs[word] >= min_freq:
                    self.word2idx[word] = len(self.idx2word)
                    self.idx2word.append(word)
        
    def encode(self, text):
        return [self.word2idx.get(word, self.word2idx["<UNK>"]) for word in tokenize(text)] + [self.word2idx["<EOS>"]]
    
    def decode(self, tokens):
        return " ".join([self.idx2word[token] for token in tokens if token > 2])

In [29]:
# Prepare vocabularies
source_vocab = Vocab(df['text'].tolist())
target_vocab = Vocab(df['code'].tolist())

In [30]:
# Dataset class
class CodeDataset(Dataset):
    def __init__(self, df, source_vocab, target_vocab):
        self.df = df
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        source = self.source_vocab.encode(self.df.iloc[idx]['text'])
        target = self.target_vocab.encode(self.df.iloc[idx]['code'])
        return torch.tensor(source), torch.tensor(target)


In [31]:
# Collate function
def collate_fn(batch):
    sources, targets = zip(*batch)
    sources = pad_sequence(sources, padding_value=0, batch_first=True)
    targets = pad_sequence(targets, padding_value=0, batch_first=True)
    return sources, targets


In [32]:
# Create DataLoader
dataset = CodeDataset(df, source_vocab, target_vocab)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [33]:
# Transformer Model
class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim=256, n_heads=8, ff_dim=512, num_layers=3, dropout=0.1):
        super().__init__()
        
        self.encoder_embedding = nn.Embedding(input_dim, emb_dim)
        self.decoder_embedding = nn.Embedding(output_dim, emb_dim)
        
        self.transformer = nn.Transformer(
            d_model=emb_dim, nhead=n_heads, num_encoder_layers=num_layers,
            num_decoder_layers=num_layers, dim_feedforward=ff_dim, dropout=dropout
        )
        
        self.fc_out = nn.Linear(emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, src, tgt):
        src = self.encoder_embedding(src).permute(1, 0, 2)
        tgt = self.decoder_embedding(tgt).permute(1, 0, 2)
        
        output = self.transformer(src, tgt)
        output = self.fc_out(output).permute(1, 0, 2)
        
        return output


In [34]:
# Model Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerSeq2Seq(len(source_vocab.idx2word), len(target_vocab.idx2word)).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [35]:
# Training Loop
def train(model, dataloader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        start_time = time.time()
        epoch_loss = 0
        
        for src, tgt in tqdm(dataloader):
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            
            output = model(src, tgt[:, :-1])
            output = output.reshape(-1, output.shape[-1])
            tgt = tgt[:, 1:].reshape(-1)
            
            loss = criterion(output, tgt)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        end_time = time.time()
        print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(dataloader):.4f}, Time: {end_time - start_time:.2f}s")

In [36]:
# Train model
train(model, dataloader, criterion, optimizer, epochs=10)


100%|██████████| 6758/6758 [04:32<00:00, 24.79it/s]


Epoch 1, Loss: 1.3862, Time: 272.65s


100%|██████████| 6758/6758 [04:32<00:00, 24.80it/s]


Epoch 2, Loss: 0.7715, Time: 272.48s


100%|██████████| 6758/6758 [04:34<00:00, 24.60it/s]


Epoch 3, Loss: 0.5966, Time: 274.68s


100%|██████████| 6758/6758 [04:33<00:00, 24.70it/s]


Epoch 4, Loss: 0.4992, Time: 273.57s


100%|██████████| 6758/6758 [04:34<00:00, 24.63it/s]


Epoch 5, Loss: 0.4354, Time: 274.33s


100%|██████████| 6758/6758 [04:33<00:00, 24.74it/s]


Epoch 6, Loss: 0.4042, Time: 273.15s


100%|██████████| 6758/6758 [04:34<00:00, 24.61it/s]


Epoch 7, Loss: 0.3875, Time: 274.66s


100%|██████████| 6758/6758 [04:34<00:00, 24.64it/s]


Epoch 8, Loss: 0.3816, Time: 274.25s


100%|██████████| 6758/6758 [04:33<00:00, 24.75it/s]


Epoch 9, Loss: 0.3750, Time: 273.10s


100%|██████████| 6758/6758 [04:31<00:00, 24.87it/s]

Epoch 10, Loss: 0.3642, Time: 271.70s





In [37]:
# Save model
model_path = "transformer_seq2seq.pth"
torch.save(model.state_dict(), model_path)
print("Model saved!")


Model saved!


In [38]:
# Inference Function
def translate(model, input_text, source_vocab, target_vocab, max_length=50):
    model.eval()
    tokens = torch.tensor(source_vocab.encode(input_text)).unsqueeze(0).to(device)
    tgt_tokens = torch.tensor([target_vocab.word2idx["<SOS>"]]).unsqueeze(0).to(device)
    
    for _ in range(max_length):
        output = model(tokens, tgt_tokens)
        next_token = output.argmax(2)[:, -1].unsqueeze(0)
        tgt_tokens = torch.cat([tgt_tokens, next_token], dim=1)
        
        if next_token.item() == target_vocab.word2idx["<EOS>"]:
            break
    
    return target_vocab.decode(tgt_tokens.squeeze().tolist())


In [43]:
# Test model with a sample input
sample_input = "create map m from string to integer	map<string, int> m; write code in c"
predicted_output = translate(model, sample_input, source_vocab, target_vocab)
print(f"Input: {sample_input}\nPredicted Code: {predicted_output}")


Input: create map m from string to integer	map<string, int> m; write code in c
Predicted Code: ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x ((x
