<a href="https://colab.research.google.com/github/sushantchandelog/Projects/blob/main/Philosophy_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets

In [None]:
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TextDataset
  )
from google.colab import drive
from transformers import pipeline
import os
import time

In [None]:
# Mount Google Drive. force_remount=True refreshes the connection so the notebook
# works against the most current version of the files.
drive.mount('/content/drive', force_remount=True)

# Paths and base checkpoint used by every later cell.
folder_path = "/content/drive/MyDrive/cleaned_data"        # folder holding the source .txt books
COMBINED_FILE_PATH = folder_path + "/combined_plato.txt"   # merged training corpus
OUTPUT_DIR = folder_path + "/PhilosophyModel"              # checkpoints and final model
MODEL_NAME = "gpt2"                                        # pretrained checkpoint to fine-tune

# Echo the configuration so it is visible in the cell output.
print(f"data folder {folder_path}")
print(f"Combined file Will be {COMBINED_FILE_PATH}")
print(f"model will be saved to {OUTPUT_DIR}")

In [None]:
# Collect the source books that will be merged into the training corpus.
all_files = os.listdir(folder_path)

# Two kinds of .txt files in the data folder are derived artifacts, not source books:
#   * the merged corpus itself (skipped so a re-run does not fold it back into itself);
#   * "cached_lm_*" files, the pickled token caches that TextDataset writes next to
#     the corpus by default. Their names end in ".txt" but they are binary.
combined_name = os.path.basename(COMBINED_FILE_PATH)

# os.listdir returns entries in arbitrary order; sorting makes the corpus
# (and therefore training) reproducible from run to run.
txt_files = sorted(
    name
    for name in all_files
    if name.endswith('.txt')
    and name != combined_name
    and not name.startswith('cached_lm_')
)

print(len(txt_files), "files to combine:", txt_files)

In [None]:
# Merge every source book into a single training corpus file.

# Fail fast with a clear message instead of writing an empty corpus that would
# only surface as a confusing error when the dataset is built.
if not txt_files:
    raise ValueError(f"No .txt source files found in {folder_path}")

# Read each book once and join at the end, rather than growing one string with +=.
book_texts = []
for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)
    # NOTE(review): latin-1 never raises on decode, but it would garble any file
    # that is actually UTF-8 -- confirm the encoding of the cleaned sources.
    with open(file_path, 'r', encoding='latin-1') as f:
        book_texts.append(f.read())

# Every book is followed by a blank line so consecutive books stay separated.
all_text = "".join(text + "\n\n" for text in book_texts)

# Write the combined text to the corpus file.
with open(COMBINED_FILE_PATH, 'w', encoding='utf-8') as f:
    f.write(all_text)

print("Succesfully combine all file into ", COMBINED_FILE_PATH)

In [None]:
# Load the pretrained tokenizer and the language-model weights for the base
# checkpoint named by MODEL_NAME (tokenizer first, then the model).
tokenizer, model = (
    GPT2Tokenizer.from_pretrained(MODEL_NAME),
    GPT2LMHeadModel.from_pretrained(MODEL_NAME),
)
print("tokenizer model loaded")


In [None]:
# Tokenize the merged corpus and cut it into fixed-length training blocks.
BLOCK_SIZE = 128  # number of tokens per training example

# TextDataset pickles its tokenized blocks to a "cached_lm_*" file. By default that
# file is written next to the corpus (inside the Drive data folder, with a ".txt"
# suffix) and is silently reused on later runs even if the corpus has changed.
# Keep the cache on local disk and always rebuild it, so training uses the
# current contents of COMBINED_FILE_PATH.
TOKEN_CACHE_DIR = "/content/tokenizer_cache"
os.makedirs(TOKEN_CACHE_DIR, exist_ok=True)  # the cache directory must exist beforehand

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=COMBINED_FILE_PATH,
    block_size=BLOCK_SIZE,
    overwrite_cache=True,
    cache_dir=TOKEN_CACHE_DIR,
)

# mlm=False selects causal (next-token) language modeling instead of masked-LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
print("dataset is prepared", len(train_dataset), "text blocks")

In [None]:
# Describe the fine-tuning run, then hand everything to the Trainer.
training_config = dict(
    output_dir=OUTPUT_DIR,             # checkpoints and the final model go here
    overwrite_output_dir=True,         # allow reuse of an existing output folder
    num_train_epochs=3,                # three passes over the data as a starting point
    per_device_train_batch_size=4,     # batch size chosen for a T4 GPU
    save_steps=1000,                   # checkpoint interval, in optimizer steps
    save_total_limit=2,                # cap on the number of checkpoints kept
    prediction_loss_only=True,
    report_to="none",                  # no external experiment tracker
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [None]:
# Fine-tune the model, timing the run.
start_time = time.time()
trainer.train()
end_time = time.time()

# Report how long training took, so the cost of a full re-run is visible.
print(f"Training finished in {(end_time - start_time) / 60:.1f} minutes")

# Save the final model; with no argument the Trainer writes to its configured
# output_dir, which is OUTPUT_DIR.
trainer.save_model()

In [None]:
# Save the tokenizer next to the model so OUTPUT_DIR can be loaded on its own.
# OUTPUT_DIR comes from the configuration cell; it is deliberately not redefined
# here, so the model and the tokenizer can never end up in different folders.
tokenizer.save_pretrained(OUTPUT_DIR)

In [None]:
# Try out the fine-tuned model: reload the saved weights and tokenizer from
# OUTPUT_DIR and wrap them in a text-generation pipeline.
model_from_drive = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR)
tokenizer_from_drive = GPT2Tokenizer.from_pretrained(OUTPUT_DIR)

plato_generator = pipeline(task='text-generation', model=model_from_drive, tokenizer=tokenizer_from_drive)


In [None]:
# Ask for a prompt and let the fine-tuned model continue it.
prompt = input("Enter you prompt for plato")
print(f"Generating text for prompt: '{prompt}'")

generated_text = plato_generator(
    prompt,
    max_length=150,           # total length in tokens, prompt included
    num_return_sequences=1,
    # Use the tokenizer that was reloaded with the model, not the training-time
    # `tokenizer`, so this cell also works in a fresh kernel where only the
    # load-from-Drive cell has been run.
    pad_token_id=tokenizer_from_drive.eos_token_id,
)

print("\n--- MODEL'S OUTPUT ---")
print(generated_text[0]['generated_text'])
print("---------------------------------")