In [None]:
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import matplotlib.pyplot as plt

log_file = "/home/vbertalan/Downloads/gpt2_logs_mini.log"

def read_lines_from_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return [line.strip() for line in f.readlines() if line.strip()]

# Qwen tokenizer e modelo
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Importante para evitar erro de padding

log_templates = read_lines_from_file(log_file)

# Se desejar, adicionar tokens personalizados (opcional)
tokenizer.add_tokens(log_templates)

tokenized_sequences = tokenizer(
    log_templates,
    truncation=True,
    padding=True,
    max_length=2048,  # pode ajustar dependendo do modelo
    return_tensors="pt"
)

# Carrega o modelo com suporte a código remoto
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
model.resize_token_embeddings(len(tokenizer))

class LogSequenceDataset(Dataset):
    def __init__(self, tokenized_sequences):
        self.input_ids = tokenized_sequences['input_ids']
        self.attention_mask = tokenized_sequences['attention_mask']

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

dataset = LogSequenceDataset(tokenized_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.get_input_embeddings().requires_grad_(True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_epochs = 5
patience = 2
best_loss = float('inf')
epochs_without_improvement = 0
train_losses = []

with open("training_log.txt", "w") as log_file:
    try:
        for epoch in range(max_epochs):
            model.train()
            total_loss = 0
            loop = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{max_epochs}")

            for batch in loop:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                optimizer.zero_grad()

                outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                loop.set_postfix(loss=loss.item())
                log_file.write(f"Epoch {epoch + 1}, Batch Loss: {loss.item():.4f}\n")

            average_loss = total_loss / len(dataloader)
            train_losses.append(average_loss)
            log_file.write(f"Epoch {epoch + 1} - Average Loss: {average_loss:.4f}\n")

            if average_loss < best_loss:
                best_loss = average_loss
                epochs_without_improvement = 0
                model.save_pretrained("fine_tuned_intermediate")
                tokenizer.save_pretrained("fine_tuned_intermediate")
            else:
                epochs_without_improvement += 1

            if epochs_without_improvement >= patience:
                print("Early stopping.")
                break

    except Exception as e:
        print(f"Erro: {e}")
        log_file.write(f"Erro: {e}\n")

# Salva modelo final
model.save_pretrained("fine_tuned_qwen2.5_final")
tokenizer.save_pretrained("fine_tuned_qwen2.5_final")

# Plota a curva de perda
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, marker='o')
plt.title('Training Loss Trend')
plt.xlabel('Epochs')
plt.ylabel('Average Loss')
plt.grid()
plt.savefig("fine_tuned_qwen2.5_final/loss_plot.png")
plt.close()

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Epoch 1/20:   0%|          | 0/326048 [00:00<?, ?it/s]

: 

In [1]:
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
import csv
import matplotlib.pyplot as plt

# Define the path for the template file
log_file = "/home/vbertalan/Downloads/gpt2_logs_mini.log"

# Auxiliary function to read lines from a raw log file
def read_lines_from_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return [line.strip() for line in f.readlines() if line.strip()]

# Step 1: Load GPT-2 tokenizer and add custom log templates
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
log_templates = read_lines_from_file(log_file)

# Add log templates as tokens to the tokenizer
tokenizer.add_tokens(log_templates)

# Set padding token to EOS
tokenizer.pad_token = tokenizer.eos_token

# Step 2: Prepare log sequences directly from log templates
sequences = log_templates  # Use log templates as sequences

# Tokenize sequences
tokenized_sequences = tokenizer(
    sequences,
    truncation=True,
    padding=True,
    max_length=1024,
    return_tensors="pt"
)

# Step 3: Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Step 4: Prepare Dataset and DataLoader for training
class LogSequenceDataset(Dataset):
    def __init__(self, tokenized_sequences):
        self.input_ids = tokenized_sequences['input_ids']
        self.attention_mask = tokenized_sequences['attention_mask']

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

# Create the dataset and dataloader
dataset = LogSequenceDataset(tokenized_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Step 5: Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 6: Enable gradient updates on embeddings
model.get_input_embeddings().requires_grad_(True)

# Step 7: Train the model (Continual Pretraining)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Using {} for training".format(device))

max_epochs = 3  # Maximum number of epochs
patience = 2  # Number of epochs to wait for improvement
best_loss = float('inf')  # Initialize best loss to infinity
epochs_without_improvement = 0  # Counter for epochs without improvement

# Initialize lists to store losses
train_losses = []

# Open log file to record epoch losses
with open("training_log.txt", "w") as log_file:
    try:
        for epoch in range(max_epochs):
            model.train()
            total_loss = 0
            loop = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{max_epochs}", leave=True)

            for batch in loop:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                optimizer.zero_grad()

                # Forward pass (language modeling task)
                outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
                loss = outputs.loss

                # Backward pass
                loss.backward()
                optimizer.step()

                # Accumulate loss
                total_loss += loss.item()
                loop.set_postfix(loss=loss.item())

                # Frequent logging after each batch
                log_file.write(f"Epoch {epoch + 1}, Batch Loss: {loss.item():.4f}\n")

            # Calculate average loss for the epoch
            average_loss = total_loss / len(dataloader)
            train_losses.append(average_loss)
            log_file.write(f"Epoch {epoch + 1}/{max_epochs} - Average Loss: {average_loss:.4f}\n")
            print(f"Epoch {epoch + 1}/{max_epochs} completed. Average Loss: {average_loss:.4f}")

            # Early stopping logic
            if average_loss < best_loss:
                best_loss = average_loss
                epochs_without_improvement = 0  # Reset counter
                # Save the last intermediate model whenever there is an improvement
                model.save_pretrained("fine_tuned_intermediate")
                tokenizer.save_pretrained("fine_tuned_intermediate")
                print("Intermediate model saved as 'fine_tuned_intermediate'.")
            else:
                epochs_without_improvement += 1

            # Stop training if no improvement for 'patience' epochs
            if epochs_without_improvement >= patience:
                print("Early stopping triggered. No improvement in loss.")
                break

    except Exception as e:
        print(f"An error occurred: {e}")
        log_file.write(f"Training interrupted due to an error: {e}\n")

# Step 8: Save the final fine-tuned model
model.save_pretrained("fine_tuned_gpt2_final")
tokenizer.save_pretrained("fine_tuned_gpt2_final")

print("Fine-tuning completed and final model saved.")

# Step 9: Plotting the loss trend
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, marker='o')
plt.title('Training Loss Trend')
plt.xlabel('Epochs')
plt.ylabel('Average Loss')
plt.xticks(range(1, len(train_losses) + 1))
plt.grid()

# Save the plot as a PNG file
plt.savefig("fine_tuned_gpt2_final/loss_plot.png")
plt.close()  # Close the plot to free memory

Using cpu for training


Epoch 1/3: 100%|██████████| 10/10 [00:06<00:00,  1.54it/s, loss=nan]


Epoch 1/3 completed. Average Loss: nan


Epoch 2/3: 100%|██████████| 10/10 [00:06<00:00,  1.65it/s, loss=nan]


Epoch 2/3 completed. Average Loss: nan
Early stopping triggered. No improvement in loss.
Fine-tuning completed and final model saved.


In [2]:
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

import csv
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer, GPT2LMHeadModel

template_file = "/home/vbertalan/Downloads/gpt2_logs_mini.log_templates.csv"
log_file = "/home/vbertalan/Downloads/gpt2_logs_mini.log"

# Auxiliary function to read lines from a raw log file
def read_lines_from_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return [line.strip() for line in f.readlines() if line.strip()]

# Auxiliary function to extract templates from previous CSV file
def extract_templates(csv_file):
    event_templates = []
    with open(csv_file, mode='r', encoding='utf-8') as arquivo:
        leitor = csv.DictReader(arquivo)
        for linha in leitor:
            if 'EventTemplate' in linha:
                event_templates.append(linha['EventTemplate'])
    return event_templates

# Step 1: Load GPT-2 tokenizer and add custom log templates
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

log_templates =  extract_templates(template_file)

# Add log templates as tokens to the tokenizer
tokenizer.add_tokens(log_templates)

# Set padding token to EOS
tokenizer.pad_token = tokenizer.eos_token

# Step 2: Prepare log sequences (you may add your actual log data here)
sequences = read_lines_from_file(log_file)

# Tokenize sequences
tokenized_sequences = tokenizer(
    sequences,
    truncation=True,
    padding=True,
    max_length=1024,
    return_tensors="pt"
)

# Step 3: Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Step 4: Prepare Dataset and DataLoader for training
class LogSequenceDataset(Dataset):
    def __init__(self, tokenized_sequences):
        self.input_ids = tokenized_sequences['input_ids']
        self.attention_mask = tokenized_sequences['attention_mask']

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

# Create the dataset and dataloader
dataset = LogSequenceDataset(tokenized_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Step 5: Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 6: Enable gradient updates on embeddings
model.get_input_embeddings().requires_grad_(True)

# Step 7: Train the model (Continual Pretraining)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_epochs = 1  # Maximum number of epochs
patience = 3  # Number of epochs to wait for improvement
best_loss = float('inf')  # Initialize best loss to infinity
epochs_without_improvement = 0  # Counter for epochs without improvement

# Initialize lists to store losses
train_losses = []

# Open log file to record epoch losses
with open("training_log.txt", "w") as log_file:
    for epoch in range(max_epochs):
        model.train()
        total_loss = 0
        loop = tqdm(dataloader, leave=True)

        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()

            # Forward pass (language modeling task)
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()

            # Accumulate loss
            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        # Calculate average loss for the epoch
        average_loss = total_loss / len(dataloader)
        train_losses.append(average_loss)
        log_file.write(f"Epoch {epoch + 1}/{max_epochs} - Average Loss: {average_loss:.4f}\n")
        print(f"Epoch {epoch + 1}/{max_epochs} completed. Average Loss: {average_loss:.4f}")

        # Early stopping logic
        if average_loss < best_loss:
            best_loss = average_loss
            epochs_without_improvement = 0  # Reset counter
            # Save the intermediate model whenever there is an improvement
            model.save_pretrained("fine_tuned_intermediate")
            tokenizer.save_pretrained("fine_tuned_intermediate")
            print("Intermediate model saved as 'fine_tuned_intermediate'.")
        else:
            epochs_without_improvement += 1

        # Stop training if no improvement for 'patience' epochs
        if epochs_without_improvement >= patience:
            print("Early stopping triggered. No improvement in loss.")
            break

# Step 8: Save the final fine-tuned model
model.save_pretrained("fine_tuned_gpt2_final")
tokenizer.save_pretrained("fine_tuned_gpt2_final")

print("Fine-tuning completed and final model saved.")

# Step 9: Plotting the loss trend
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, marker='o')
plt.title('Training Loss Trend')
plt.xlabel('Epochs')
plt.ylabel('Average Loss')
plt.xticks(range(1, len(train_losses) + 1))
plt.grid()

# Save the plot as a PNG file
plt.savefig("fine_tuned_gpt2_final/loss_plot.png")
plt.close()  # Close the plot to free memory


100%|██████████| 10/10 [00:06<00:00,  1.65it/s, loss=nan]


Epoch 1/1 completed. Average Loss: nan
Fine-tuning completed and final model saved.


In [3]:
import csv
import torch
from tqdm import tqdm
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel

template_file = "/home/vbertalan/Downloads/gpt2_logs_mini.log_templates.csv"
log_file = "/home/vbertalan/Downloads/gpt2_logs_mini.log"

# Auxiliary function to read lines from a raw log file
def read_lines_from_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return [line.strip() for line in f.readlines() if line.strip()]

# Auxiliary function to extract templates from previous CSV file
def extract_templates(csv_file):
    event_templates = []
    with open(csv_file, mode='r', encoding='utf-8') as arquivo:
        leitor = csv.DictReader(arquivo)
        for linha in leitor:
            if 'EventTemplate' in linha:
                event_templates.append(linha['EventTemplate'])
    return event_templates


# Step 1: Load GPT-2 tokenizer and add custom log templates
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
log_templates =  extract_templates(template_file)

# Add log templates as tokens to the tokenizer
tokenizer.add_tokens(log_templates)

# Set padding token to EOS
tokenizer.pad_token = tokenizer.eos_token

# Step 2: Prepare log sequences (you may add your actual log data here)
sequences = read_lines_from_file(log_file)

# Tokenize sequences
tokenized_sequences = tokenizer(
    sequences,  # List of sentences
    truncation=True,
    padding=True,  # Padding all sequences to the same length
    max_length=128,  # Set max length for input sequences
    return_tensors="pt"  # Return PyTorch tensors
)

# Step 3: Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Resize the model's embeddings to include the new tokens
model.resize_token_embeddings(len(tokenizer))

# Step 4: Prepare Dataset and DataLoader for training
class LogSequenceDataset(Dataset):
    def __init__(self, tokenized_sequences):
        self.input_ids = tokenized_sequences['input_ids']
        self.attention_mask = tokenized_sequences['attention_mask']
        
    def __len__(self):
        return self.input_ids.shape[0]
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

# Create the dataset and dataloader
dataset = LogSequenceDataset(tokenized_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Step 5: Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 6: Enable gradient updates on embeddings
model.get_input_embeddings().requires_grad_(True)

# Step 7: Train the model (Continual Pretraining)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3  # Adjust based on your training needs

for epoch in range(epochs):
    model.train()
    loop = tqdm(dataloader, leave=True)
    
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        optimizer.zero_grad()
        
        # Forward pass (language modeling task)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        
        # Update the model's parameters
        optimizer.step()
        
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1}/{epochs} completed.")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")

print("Fine-tuning completed and model saved.")


100%|██████████| 10/10 [00:06<00:00,  1.65it/s, loss=nan]


Epoch 1/3 completed.


100%|██████████| 10/10 [00:05<00:00,  1.73it/s, loss=nan]


Epoch 2/3 completed.


100%|██████████| 10/10 [00:05<00:00,  1.73it/s, loss=nan]


Epoch 3/3 completed.
Fine-tuning completed and model saved.


In [None]:
import torch
from tqdm import tqdm
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# === Step 0: Login on Hugging Face (coloque seu token aqui se necessário) ===
# Step 0: Code for reading HuggingFace token
def get_huggingface_token():
    f = open("huggingface_token.txt", "r")
    return (f.read())

login(token=get_huggingface_token())  # Descomente se preferir login via código

# === Step 1: Define LLaMA 2 model ===
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token

# === Step 2: Add custom tokens ===
log_templates = [
    "Error encountered in module X",
    "Error encountered in",
    "Unexpected behavior in network communication",
    "System rebooted successfully",
    "Segmentation fault in memory allocation"
]
tokenizer.add_tokens(log_templates)

# === Step 3: Prepare sequences ===
sequences = [
    "Error encountered in module X The weather is great today. I am working hard.",
    "The system rebooted successfully after the error."
]

tokenized_sequences = tokenizer(
    sequences,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)

# === Step 4: Load LLaMA 2 model ===
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_auth_token=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

# === Step 5: Resize embeddings to include new tokens ===
model.resize_token_embeddings(len(tokenizer))

# === Step 6: Prepare Dataset & Dataloader ===
class LogSequenceDataset(Dataset):
    def __init__(self, tokenized_sequences):
        self.input_ids = tokenized_sequences['input_ids']
        self.attention_mask = tokenized_sequences['attention_mask']
        
    def __len__(self):
        return self.input_ids.shape[0]
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

dataset = LogSequenceDataset(tokenized_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# === Step 7: Optimizer ===
optimizer = AdamW(model.parameters(), lr=5e-5)

# === Step 8: Train the model ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

epochs = 3
for epoch in range(epochs):
    model.train()
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)
    
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

# === Step 9: Save fine-tuned model ===
model.save_pretrained("fine_tuned_llama2")
tokenizer.save_pretrained("fine_tuned_llama2")
print("✅ Fine-tuning completed and model saved.")


tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

: 

In [None]:
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm

template_file = ""
log_file = "gpt2_logs.log"

# Função auxiliar para ler linhas de um arquivo como lista
def read_lines_from_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return [line.strip() for line in f.readlines() if line.strip()]

# Step 1: Load GPT-2 tokenizer and add custom log templates
tokenizer = GPT2Tokenizer.from_pretrained("llama2-7b")

# Carregar log templates de arquivo externo
log_templates = read_lines_from_file(template_file)

# Adicionar os log templates como novos tokens no tokenizer
tokenizer.add_tokens(log_templates)

# Definir token de padding
tokenizer.pad_token = tokenizer.eos_token

# Step 2: Carregar sequências de log de arquivo externo
sequences = read_lines_from_file(log_file)

# Tokenizar as sequências
tokenized_sequences = tokenizer(
    sequences,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)

# Step 3: Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Step 4: Dataset e DataLoader
class LogSequenceDataset(Dataset):
    def __init__(self, tokenized_sequences):
        self.input_ids = tokenized_sequences['input_ids']
        self.attention_mask = tokenized_sequences['attention_mask']
        
    def __len__(self):
        return self.input_ids.shape[0]
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

dataset = LogSequenceDataset(tokenized_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Step 5: Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 6: Enable embedding updates
model.get_input_embeddings().requires_grad_(True)

# Step 7: Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3

for epoch in range(epochs):
    model.train()
    loop = tqdm(dataloader, leave=True)
    
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1}/{epochs} completed.")

# Salvar modelo e tokenizer
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")

print("Fine-tuning completed and model saved.")

Epoch 1/3: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s, loss=105]
Epoch 2/3: 100%|██████████| 1/1 [00:00<00:00,  1.36it/s, loss=77]
Epoch 3/3: 100%|██████████| 1/1 [00:00<00:00,  1.69it/s, loss=61.3]


Fine-tuning completed and model saved.


In [11]:
from transformers import GPT2Tokenizer

# === CONFIGURAÇÃO ===
LOG_FILE_PATH = "/home/vbertalan/Downloads/gpt2_logs.log"  # Caminho para seu arquivo .log

# === Lê o arquivo de logs ===
with open(LOG_FILE_PATH, "r", encoding="utf-8") as f:
    log_lines = [line.strip() for line in f if line.strip()]

# === Inicializa o tokenizer GPT-2 ===
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# === Calcula os comprimentos em tokens ===
token_lens = [len(tokenizer(log)["input_ids"]) for log in log_lines]

# === Exibe estatísticas ===
print(f"Total de logs: {len(token_lens)}")
print(f"Comprimento médio: {sum(token_lens)/len(token_lens):.2f} tokens")
print(f"Comprimento máximo: {max(token_lens)} tokens")


Token indices sequence length is longer than the specified maximum sequence length for this model (1026 > 1024). Running this sequence through the model will result in indexing errors


Total de logs: 652096
Comprimento médio: 156.45 tokens
Comprimento máximo: 1831 tokens
