In [7]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling

class CodeDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = [f"<CODE> {line.strip()} <CODE>" for line in data]  # Add special token at the beginning and end of each code snippet
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        tokens = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        tokens = {key: value.squeeze(0) for key, value in tokens.items()}
        return tokens

# Load data from txt file
with open('data4.txt', 'r') as f:
    new_data = f.readlines()

# Initialize the GPT-2 tokenizer and dataset
tokenizer = GPT2Tokenizer.from_pretrained('trained_4_model')
tokenizer.pad_token = tokenizer.eos_token
max_length = 128  # Reduced max_length
dataset = CodeDataset(new_data, tokenizer, max_length=max_length)

# Create a DataLoader for batching
batch_size = 2  # Reduced batch_size
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

# Load the pre-trained GPT-2 model and set it up for training
model = GPT2LMHeadModel.from_pretrained('trained_4_model')

# Check if multiple GPUs are available and wrap the model with nn.DataParallel
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs")
    model = torch.nn.DataParallel(model)

model.train()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer and training loop
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

epochs = 3
gradient_accumulation_steps = 4  # Gradient accumulation
accumulated_loss = 0.0
for epoch in range(epochs):
    for step, batch in enumerate(dataloader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss

        loss.backward()

        # Gradient accumulation
        accumulated_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            print(f"Step: {step + 1}, Loss: {accumulated_loss / gradient_accumulation_steps}")
            accumulated_loss = 0.0

    print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

# Save the trained model
model.save_pretrained("trained_5_model")


Step: 4, Loss: 4.693137288093567
Step: 8, Loss: 4.348484635353088
Step: 12, Loss: 4.880219101905823
Step: 16, Loss: 4.7571399211883545
Step: 20, Loss: 4.254218816757202
Step: 24, Loss: 4.083686947822571
Step: 28, Loss: 4.018662750720978
Step: 32, Loss: 3.607541024684906
Step: 36, Loss: 3.428359806537628
Step: 40, Loss: 4.054280579090118
Step: 44, Loss: 3.3034571409225464
Step: 48, Loss: 3.372847080230713
Step: 52, Loss: 3.383326530456543
Step: 56, Loss: 3.4874064326286316
Step: 60, Loss: 3.0335362553596497
Step: 64, Loss: 3.1241634488105774
Step: 68, Loss: 2.5820778906345367
Step: 72, Loss: 2.5536625683307648
Step: 76, Loss: 2.668115973472595
Step: 80, Loss: 2.4756160974502563
Epoch: 1, Loss: 1.8110204935073853
Step: 4, Loss: 4.987349063158035
Step: 8, Loss: 2.8676598072052
Step: 12, Loss: 3.0661474466323853
Step: 16, Loss: 2.2211397290229797
Step: 20, Loss: 3.146446645259857
Step: 24, Loss: 3.0171623826026917
Step: 28, Loss: 2.4846193194389343
Step: 32, Loss: 2.274779498577118
Step: 3

In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the original GPT-2 tokenizer
original_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Save the tokenizer files to the "trained_gpt2_model" folder
original_tokenizer.save_pretrained("trained_7_model")

# Load the saved model
model = GPT2LMHeadModel.from_pretrained("trained_7_model")
model.eval()

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("trained_7_model")

# Function to generate code based on a prompt
def generate_code(prompt, max_length=None, num_return_sequences=3, min_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    generated_sequences = []
    
    for i in range(num_return_sequences):
        output_sequences = []
        while True:
            output_sequence = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                no_repeat_ngram_size=2,
                do_sample=True,
                top_k=50, # You can experiment with this value
                top_p=0.95,
                temperature=1.0, # You can experiment with this value
                pad_token_id=tokenizer.eos_token_id,
            )
            generated_sequence = output_sequence[0].tolist()
            text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
            
            if len(text) >= min_length:
                output_sequences.append(text)
                break
        
        generated_sequences.append(output_sequences)
        
    return generated_sequences

# Test the function with a prompt
prompt = "Write a Python function that calculates the sum of two numbers"
generated_code = generate_code(prompt, max_length=200, min_length=30)
print(generated_code[0][0])


Write a Python function that calculates the sum of two numbers in a list:

I know that this is a very complex way to sum the number of possible numbers using a number.
Here is an example:

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
There is really a lot of interesting code:<|endoftext|>
