In [8]:
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

In [2]:
def read_py_files(directory):
    """Read every ``.py`` file located directly inside ``directory``.

    Entries are visited in sorted name order so the returned list is the same
    on every run (``os.listdir`` makes no ordering promise). Sub-directories
    are not searched, and a directory whose name happens to end in ``.py`` is
    skipped rather than passed to ``open``.

    Args:
        directory: Path of the folder to scan.

    Returns:
        list[str]: The text of each matching file, decoded as UTF-8.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    code_snippets = []

    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.py'):
            continue
        file_path = os.path.join(directory, file_name)
        # Only regular files can be read; skip folders named like "pkg.py".
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            code_snippets.append(file.read())

    return code_snippets

# Location of the training corpus. The default is the original
# machine-specific path; set the CODE_TRAIN_DIR environment variable to point
# the notebook at another folder without editing this cell.
directory = os.environ.get(
    'CODE_TRAIN_DIR',
    'C:\\Users\\vigop\\OneDrive\\backup 1\\coding\\data\\python-deeplearning\\train',
)
code_snippets = read_py_files(directory)

In [4]:
# Single left-to-right scan over the three constructs that may contain a '#':
# triple-quoted blocks, ordinary one-line string literals, and real comments.
# Whichever construct starts first wins, so a '#' inside a string or inside a
# triple-quoted block is never mistaken for the start of a comment.
_COMMENT_OR_STRING_RE = re.compile(
    r"(?P<triple>'''[\s\S]*?'''|\"\"\"[\s\S]*?\"\"\")"
    r"|(?P<string>'(?:\\.|[^'\\\n])*'|\"(?:\\.|[^\"\\\n])*\")"
    r"|(?P<comment>#[^\n]*)"
)


def remove_comments(code):
    """Strip ``#`` comments and triple-quoted blocks from Python source text.

    Ordinary single-line string literals are left untouched, so a ``#`` inside
    a string (for example a URL fragment) no longer truncates the line. A
    ``#`` inside a triple-quoted block no longer swallows the block's closing
    quotes, which previously caused unrelated code between two such blocks to
    be deleted.

    As before, every triple-quoted block is removed, whether or not it is a
    docstring, and the newline that ends a comment line is kept.

    Args:
        code: Source text to clean.

    Returns:
        str: ``code`` with comments and triple-quoted blocks removed.
    """
    def _replacement(match):
        # Keep one-line string literals verbatim; drop everything else matched.
        if match.group('string') is not None:
            return match.group(0)
        return ''

    return _COMMENT_OR_STRING_RE.sub(_replacement, code)

def remove_empty_lines(code):
    """Drop blank lines from ``code`` while keeping each line's indentation.

    A line counts as blank when it contains only whitespace. Surviving lines
    keep their leading whitespace (Python's block structure depends on it);
    only trailing whitespace, including a stray carriage return, is removed.

    Args:
        code: Source text, with lines separated by ``\\n``.

    Returns:
        str: The non-blank lines joined by ``\\n`` with no trailing newline.
    """
    kept_lines = [line.rstrip() for line in code.split('\n') if line.strip()]
    return '\n'.join(kept_lines)

def preprocess_code_snippets(code_snippets):
    """Clean every snippet and return the results in input order.

    Each snippet is passed through ``remove_comments`` first and the result
    is then passed through ``remove_empty_lines``.

    Args:
        code_snippets: Iterable of source-code strings.

    Returns:
        list[str]: One cleaned string per input snippet.
    """
    return [
        remove_empty_lines(remove_comments(raw_source))
        for raw_source in code_snippets
    ]

# Clean the whole corpus once; the tokenizer cells below train on and encode this list.
preprocessed_snippets = preprocess_code_snippets(code_snippets)


In [9]:
# Train the tokenizer
def train_tokenizer(snippets, vocab_size=30000):
    """Train a BPE tokenizer on an iterable of source-code strings.

    Text is split with the ``Whitespace`` pre-tokenizer before BPE merges are
    learned. Five special tokens are reserved: ``[PAD]``, ``[UNK]``,
    ``[CLS]``, ``[SEP]`` and ``[MASK]``.

    Args:
        snippets: Iterable of strings to learn the vocabulary from.
        vocab_size: Upper bound on the vocabulary size passed to the trainer.

    Returns:
        Tokenizer: The trained tokenizer.
    """
    # Name the unknown token on the model itself. Reserving "[UNK]" in the
    # trainer alone does not make the BPE model use it, so input that was
    # never seen in training would otherwise have no token to map to.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
    tokenizer.train_from_iterator(snippets, trainer)

    return tokenizer

# Vocabulary budget for the tokenizer; the model cell below reuses it as both
# the embedding table size and the output layer width.
vocab_size = 30000
tokenizer = train_tokenizer(preprocessed_snippets, vocab_size)

In [7]:
class Seq2SeqModel(nn.Module):
    """Embedding -> LSTM -> linear head, scoring every position of the input.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of the embeddings and of the LSTM hidden state.
        output_size: Number of scores produced per position.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Sub-modules are created in the order embedding, lstm, fc so that
        # seeded parameter initialisation stays reproducible.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return per-position scores for a batch of index sequences.

        Args:
            x: Integer tensor of embedding indices; the LSTM is configured
                with ``batch_first=True``, so the batch dimension comes first.

        Returns:
            Tensor with the LSTM's per-step outputs projected to
            ``output_size`` features. The final LSTM state is discarded.
        """
        per_step_states, _final_state = self.lstm(self.embedding(x))
        return self.fc(per_step_states)


In [None]:
# Tokenize the preprocessed snippets
def tokenize_code_snippets(snippets, tokenizer):
    """Encode each snippet with ``tokenizer.encode``.

    Args:
        snippets: Iterable of strings to encode.
        tokenizer: Any object exposing an ``encode(text)`` method.

    Returns:
        list: The value returned by ``tokenizer.encode`` for every snippet,
        in input order.
    """
    encodings = []
    for source_text in snippets:
        encodings.append(tokenizer.encode(source_text))
    return encodings

# Use the trained tokenizer
# Encode the cleaned corpus; each entry is whatever ``tokenizer.encode``
# returns for the corresponding snippet.
tokenized_snippets = tokenize_code_snippets(preprocessed_snippets, tokenizer)

# Create a dataset and dataloader for the tokenized snippets
class CodeDataset(Dataset):
    """Map-style dataset that exposes a sequence of tokenized snippets by index.

    Items are returned exactly as stored; no conversion is applied.
    """

    def __init__(self, tokenized_snippets):
        """Keep a reference to ``tokenized_snippets`` (the sequence is not copied)."""
        self.tokenized_snippets = tokenized_snippets

    def __len__(self):
        """Number of stored snippets."""
        snippet_count = len(self.tokenized_snippets)
        return snippet_count

    def __getitem__(self, idx):
        """Return the stored snippet at position ``idx``."""
        snippet = self.tokenized_snippets[idx]
        return snippet

def pad_collate(batch, pad_token_id=0):
    """Collate variable-length token sequences into one padded LongTensor.

    The DataLoader's default collate function cannot batch tokenizer
    ``Encoding`` objects, nor stack sequences of different lengths, so this
    function converts and right-pads them explicitly.

    Args:
        batch: List of items. Each item may expose its token ids as ``.ids``,
            or be a tensor or plain sequence of ints.
        pad_token_id: Value used to fill positions past a sequence's end.

    Returns:
        torch.LongTensor of shape ``(len(batch), longest_sequence_length)``.
    """
    sequences = [
        torch.as_tensor(getattr(item, 'ids', item), dtype=torch.long)
        for item in batch
    ]
    longest = max((seq.numel() for seq in sequences), default=0)
    padded = torch.full((len(sequences), longest), pad_token_id, dtype=torch.long)
    for row, seq in enumerate(sequences):
        padded[row, :seq.numel()] = seq
    return padded


# "[PAD]" is registered as a special token when the tokenizer is trained;
# fall back to 0 if the lookup returns nothing.
pad_token_id = tokenizer.token_to_id("[PAD]")
if pad_token_id is None:
    pad_token_id = 0

dataset = CodeDataset(tokenized_snippets)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    # A lambda is fine here because num_workers=0 means it is never pickled.
    collate_fn=lambda batch: pad_collate(batch, pad_token_id),
)

# Use the Seq2SeqModel class
# Hyper-parameters: the embedding table and the output layer are both sized
# from the tokenizer's vocabulary budget.
hidden_size = 256
num_layers = 1
input_size = output_size = vocab_size

model = Seq2SeqModel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
)

# Train and save the model
def train_and_save_model(model, dataloader, num_epochs=10, pad_token_id=0,
                         lr=0.001, save_path='trained_model.pth'):
    """Train ``model`` as a next-token predictor and save its weights.

    For each batch the model sees every token except the last and is asked to
    predict the token that follows each position (the same batch shifted left
    by one). Positions whose target equals ``pad_token_id`` are excluded from
    the loss.

    Args:
        model: Module mapping a ``(batch, seq)`` integer tensor to
            ``(batch, seq, vocab)`` scores.
        dataloader: Iterable yielding 2-D integer tensors of token ids
            (assumed already padded to a common length within each batch).
        num_epochs: Number of passes over ``dataloader``.
        pad_token_id: Target id ignored by the loss.
        lr: Learning rate for the Adam optimizer.
        save_path: File the model's ``state_dict`` is written to.

    Returns:
        None. Progress is printed once per epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        running_loss = 0.0
        batches_used = 0

        for data in dataloader:
            data = data.to(device)
            # A sequence needs at least two tokens to form an (input, target) pair.
            if data.size(1) < 2:
                continue

            inputs = data[:, :-1]
            labels = data[:, 1:]

            optimizer.zero_grad()
            outputs = model(inputs)
            # reshape (not view): the shifted slices are not contiguous in memory.
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            batches_used += 1

        # Guard the average so an empty dataloader does not divide by zero.
        print(f'Epoch {epoch + 1}, Loss: {running_loss / max(batches_used, 1)}')

    print("Finished training")
    torch.save(model.state_dict(), save_path)

# Run training for 10 epochs; the weights are written to 'trained_model.pth' when it finishes.
train_and_save_model(model, dataloader, num_epochs=10)
