In [31]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling

class CodeDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        tokens = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        tokens = {key: value.squeeze(0) for key, value in tokens.items()}
        return tokens

# Load data from txt file
with open('data6.txt', 'r') as f:
    data = f.readlines()

# Initialize the GPT-2 tokenizer and dataset
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
max_length = 128  # Reduced max_length
dataset = CodeDataset(data, tokenizer, max_length=max_length)

# Create a DataLoader for batching
batch_size = 2  # Reduced batch_size
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

# Load the pre-trained GPT-2 model and set it up for training
model = GPT2LMHeadModel.from_pretrained('distilgpt2')  # Changed to 'distilgpt2'

# Check if multiple GPUs are available and wrap the model with nn.DataParallel
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs")
    model = torch.nn.DataParallel(model)

model.train()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer and training loop
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

epochs = 3
gradient_accumulation_steps = 4  # Gradient accumulation
accumulated_loss = 0.0
for epoch in range(epochs):
    for step, batch in enumerate(dataloader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss

        loss.backward()

        # Gradient accumulation
        accumulated_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            print(f"Step: {step + 1}, Loss: {accumulated_loss / gradient_accumulation_steps}")
            accumulated_loss = 0.0

    print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

# Save the trained model
model.save_pretrained("trained_7_model")


Step: 4, Loss: 6.238898873329163
Step: 8, Loss: 4.646116673946381
Step: 12, Loss: 4.915012359619141
Step: 16, Loss: 4.767178654670715
Step: 20, Loss: 4.714836835861206
Step: 24, Loss: nan
Step: 28, Loss: nan
Step: 32, Loss: 4.84111225605011
Step: 36, Loss: 4.614004850387573
Step: 40, Loss: 4.24582040309906
Step: 44, Loss: nan
Step: 48, Loss: 4.8868491649627686
Step: 52, Loss: 4.204628109931946
Step: 56, Loss: 4.725884556770325
Step: 60, Loss: nan
Step: 64, Loss: 4.646574258804321
Step: 68, Loss: 4.697717070579529
Step: 72, Loss: 4.597574174404144
Step: 76, Loss: 3.7510640621185303
Step: 80, Loss: 4.9048765897750854
Step: 84, Loss: 4.624587059020996
Step: 88, Loss: nan
Step: 92, Loss: 4.083964765071869
Step: 96, Loss: 4.071129739284515
Step: 100, Loss: 4.622526526451111
Step: 104, Loss: 3.3688693940639496
Step: 108, Loss: 5.453556656837463
Step: 112, Loss: 3.9155210852622986
Step: 116, Loss: 4.29439902305603
Step: 120, Loss: 3.888221800327301
Step: 124, Loss: nan
Step: 128, Loss: nan
St

In [29]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the original GPT-2 tokenizer
original_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Save the tokenizer files to the "trained_gpt2_model" folder
original_tokenizer.save_pretrained("trained_3_model")

# Load the saved model
model = GPT2LMHeadModel.from_pretrained("trained_3_model")
model.eval()

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("trained_3_model")

# Function to generate code based on a prompt
def generate_code(prompt, max_length=None, num_return_sequences=3, min_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    generated_sequences = []
    
    for i in range(num_return_sequences):
        output_sequences = []
        while True:
            output_sequence = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                no_repeat_ngram_size=2,
                do_sample=True,
                top_k=50, # You can experiment with this value
                top_p=0.95,
                temperature=1.0, # You can experiment with this value
                pad_token_id=tokenizer.eos_token_id,
            )
            generated_sequence = output_sequence[0].tolist()
            text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
            
            if len(text) >= min_length:
                output_sequences.append(text)
                break
        
        generated_sequences.append(output_sequences)
        
    return generated_sequences

# Test the function with a prompt
prompt = "Write a variable that's called StrVigo and has a value of 'hi'"
generated_code = generate_code(prompt, max_length=200, min_length=30)
print(generated_code[0][0])


Write a variable that's called StrVigo and has a value of 'hi'

You can start by using a method, such as:
#!/bin/bash
If you need to use more than one method like:<|endoftext|>
