In [1]:
import os
import re
from transformers import TextDataset, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelWithLMHead, Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
from datetime import datetime

# Caminho para a base de dados local
dataset_path = '/Users/victorgmoreno/repositorios/gerador_noticias_colab/dataset/dataset_full_preprocessed_labeled.txt'

# Leitura do arquivo
with open(dataset_path, "r") as text_file:
    lines = text_file.readlines()

# Caminhos para os arquivos de destino
train_path = '/Users/victorgmoreno/repositorios/gerador_noticias_colab/train_dataset.txt'
test_path = '/Users/victorgmoreno/repositorios/gerador_noticias_colab/test_dataset.txt'

# Função para criar os arquivos de texto
def build_text_files(data_json, dest_path):
    with open(dest_path, 'w') as f:
        data = ''
        for texts in data_json:
            summary = str(texts).strip()
            summary = re.sub(r"\s", " ", summary)
            data += summary + "  "
        f.write(data)

# Criação dos conjuntos de treinamento e teste
train, test = train_test_split(lines, test_size=0.15)
build_text_files(train, train_path)
build_text_files(test, test_path)
print("Train dataset length: " + str(len(train)))
print("Test dataset length: " + str(len(test)))

# Tokenizer
model_name = "pierreguillou/gpt2-small-portuguese"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Carregamento do dataset
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path, block_size=128)
    test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_path, block_size=128)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset, test_dataset, data_collator

# Carregamento do dataset
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

# Definição do diretório do modelo
model_drive_dir = '/Users/victorgmoreno/repositorios/gerador_noticias_colab/saved_model/'

# Criação do diretório do modelo se não existir
if not os.path.exists(model_drive_dir):
    os.makedirs(model_drive_dir)

# Argumentos de treinamento
training_args = TrainingArguments(
    output_dir=model_drive_dir,  # Diretório de saída
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    eval_steps=200,
    save_steps=400,
    warmup_steps=200,
    prediction_loss_only=True,
)

# Carregamento do modelo
def load_model(model_name, model_drive_dir, data_collator, train_dataset, test_dataset):
    model = AutoModelWithLMHead.from_pretrained(model_name)

    return Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

# Criação do modelo
trainer = load_model(model_name, model_drive_dir, data_collator, train_dataset, test_dataset)

# Treinamento do modelo
for i in range(1, 11):  # Iterações de treinamento (de 1 a 10)
    trainer.train()

    # Salvar o modelo após o treinamento
    iteration_model_path = os.path.join(model_drive_dir, f"epoch_{i}")
    trainer.save_model(iteration_model_path)

    # Avaliação do modelo
    metrics = str(trainer.evaluate())
    print(f"Metrics after epoch {i}: {metrics}")

# Geração de notícias
def generate_noticias(model_drive_dir, model_name, qtde_noticias=10):
    noticias = []
    gerador_noticias = pipeline('text-generation', model=model_drive_dir, tokenizer=model_name)

    while len(noticias) < qtde_noticias:
        noticias_geradas = gerador_noticias('  ')[0]['generated_text'].split('.')
        for n in noticias_geradas:
            if len(n.strip()) > 3:
                noticias.append(n.strip())
                print(n.strip())
        print(len(noticias))

    return noticias

# Geração de 5000 notícias
noticias = generate_noticias(model_drive_dir, model_name, 5000)

# Salvar as notícias geradas em um arquivo
date_str = datetime.now().strftime("%Y_%m_%d_%H%M")
file_path = os.path.join(model_drive_dir, f'noticias_geradas_{date_str}.txt')

with open(file_path, 'w') as file:
    for n in noticias:
        file.write(n + '\n')


Train dataset length: 120179
Test dataset length: 21209




pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]



  0%|          | 0/4320 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 7.04 GB, other allocations: 1.75 GB, max allowed: 9.07 GB). Tried to allocate 779.13 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [2]:
import os
import re
from transformers import TextDataset, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelWithLMHead, Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
from datetime import datetime

# Função para criar os arquivos de texto
def build_text_files(data_json, dest_path):
    with open(dest_path, 'w') as f:
        data = ''
        for texts in data_json:
            summary = str(texts).strip()
            summary = re.sub(r"\s", " ", summary)
            data += summary + "  "
        f.write(data)

def main():
    # Caminho para a base de dados local
    dataset_path = '/Users/victorgmoreno/repositorios/gerador_noticias_colab/dataset/dataset_full_preprocessed_labeled.txt'

    # Leitura do arquivo
    with open(dataset_path, "r") as text_file:
        lines = text_file.readlines()

    # Caminhos para os arquivos de destino
    train_path = '/Users/victorgmoreno/repositorios/gerador_noticias_colab/train_dataset.txt'
    test_path = '/Users/victorgmoreno/repositorios/gerador_noticias_colab/test_dataset.txt'

    # Criação dos conjuntos de treinamento e teste
    train, test = train_test_split(lines, test_size=0.15)
    build_text_files(train, train_path)
    build_text_files(test, test_path)
    print("Train dataset length:", len(train))
    print("Test dataset length:", len(test))

    # Tokenizer
    model_name = "pierreguillou/gpt2-small-portuguese"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Carregamento do dataset
    train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

    # Definição do diretório do modelo
    model_drive_dir = '/Users/victorgmoreno/repositorios/gerador_noticias_colab/saved_model/'

    # Criação do diretório do modelo se não existir
    if not os.path.exists(model_drive_dir):
        os.makedirs(model_drive_dir)

    # Argumentos de treinamento
    training_args = TrainingArguments(
        output_dir=model_drive_dir,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        eval_steps=200,
        save_steps=400,
        warmup_steps=200,
        prediction_loss_only=True,
    )

    # Carregamento do modelo
    trainer = load_model(model_name, model_drive_dir, data_collator, train_dataset, test_dataset)

    # Treinamento do modelo
    for i in range(1, 11):  # Iterações de treinamento (de 1 a 10)
        print(f"Treinando época {i}")
        trainer.train()

        # Salvar o modelo após o treinamento
        iteration_model_path = os.path.join(model_drive_dir, f"epoch_{i}")
        trainer.save_model(iteration_model_path)

        # Avaliação do modelo
        print(f"Avaliando modelo após a época {i}")
        metrics = str(trainer.evaluate())
        print(f"Metrics after epoch {i}: {metrics}")

    # Geração de notícias
    print("Gerando notícias")
    noticias = generate_noticias(model_drive_dir, model_name, 5000)

    # Salvar as notícias geradas em um arquivo
    print("Salvando notícias geradas")
    date_str = datetime.now().strftime("%Y_%m_%d_%H%M")
    file_path = os.path.join(model_drive_dir, f'noticias_geradas_{date_str}.txt')

    with open(file_path, 'w') as file:
        for n in noticias:
            file.write(n + '\n')

def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path, block_size=128)
    test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_path, block_size=128)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset, test_dataset, data_collator

def load_model(model_name, model_drive_dir, data_collator, train_dataset, test_dataset):
    model = AutoModelWithLMHead.from_pretrained(model_name)
    return Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

def generate_noticias(model_drive_dir, model_name, qtde_noticias=10):
    noticias = []
    gerador_noticias = pipeline('text-generation', model=model_drive_dir, tokenizer=model_name)

    while len(noticias) < qtde_noticias:
        noticias_geradas = gerador_noticias('  ')[0]['generated_text'].split('.')
        for n in noticias_geradas:
            if len(n.strip()) > 3:
                noticias.append(n.strip())
                print(n.strip())
        print(len(noticias))

    return noticias

if __name__ == "__main__":
    main()


Train dataset length: 120179
Test dataset length: 21209
Treinando época 1


  0%|          | 0/4320 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 8.80 GB, other allocations: 244.54 MB, max allowed: 9.07 GB). Tried to allocate 48.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).