In [None]:
import os
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

In [None]:
import kagglehub

def download_and_get_csv_path(dataset_name, version_path="/root/.cache/kagglehub/datasets"):
    """Download a Kaggle dataset and return the path of a CSV file inside it.

    Args:
        dataset_name: Dataset handle passed to ``kagglehub.dataset_download``.
        version_path: Unused by this function; kept only so existing callers
            that pass it keep working.

    Returns:
        Path to the first ``.csv`` file found under the downloaded dataset
        directory. Directories and files are visited in sorted order so the
        same file is chosen on every run.

    Raises:
        FileNotFoundError: If the downloaded dataset contains no CSV file.
    """
    dataset_path = kagglehub.dataset_download(dataset_name)

    for root, dirs, files in os.walk(dataset_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(".csv"):
                return os.path.join(root, file)

    # Fail here with a clear message instead of returning None and letting
    # the caller crash later inside pd.read_csv(None).
    raise FileNotFoundError(
        f"No CSV file found under '{dataset_path}' for dataset '{dataset_name}'"
    )

def load_data(dataset_path):
    """Read the CSV file at ``dataset_path`` and return it as a DataFrame."""
    return pd.read_csv(dataset_path)

# Download the IMDB top-1000 dataset and load its CSV into `data`.
dataset_name = "harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows"
try:
    csv_path = download_and_get_csv_path(dataset_name)
    if csv_path is None:
        # Guard against a missing CSV so the failure is reported as such,
        # not as an obscure error from pd.read_csv(None).
        raise FileNotFoundError(f"no .csv file found in dataset '{dataset_name}'")
    data = load_data(csv_path)
    print(f"Data loaded successfully. Number of records: {len(data)}")
except FileNotFoundError as e:
    print(f"Error: FileNotFound {e}")
    # Re-raise: every later cell needs `data`, so continuing would only
    # postpone the failure to a confusing NameError.
    raise
except Exception as e:
    print(f"Error: {e}")
    raise


Downloading from https://www.kaggle.com/api/v1/datasets/download/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows?dataset_version_number=1...


100%|██████████| 175k/175k [00:00<00:00, 45.8MB/s]

Extracting files...
Data loaded successfully. Number of records: 1000





In [None]:
# Keep only rows that have every field used to build a training example.
required_columns = ["Series_Title", "Genre", "Director", "Overview"]
df = data.dropna(subset=required_columns)

In [None]:
def format_example(row):
    """Render one dataset row as a training example.

    The layout must match the prompt built at generation time
    ("Title: ...\\nGenres: ...\\nDirector: ...\\nOverview:"), so every field
    starts on its own line. Each example ends with a blank line that
    separates it from the next record.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``Series_Title``,
            ``Genre``, ``Director`` and ``Overview``.

    Returns:
        The formatted example as a single string.
    """
    # "\nOverview" (not "\Overview"): the missing "n" put a literal backslash
    # into the corpus and glued the overview onto the director line.
    return (
        f"Title: {row['Series_Title']}\n"
        f"Genres: {row['Genre']}\n"
        f"Director: {row['Director']}\n"
        f"Overview: {row['Overview']}\n\n"
    )

# Format every row and write the examples back to back into one text file.
texts = df.apply(format_example, axis=1).tolist()

with open("fine_tuning_data.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(texts)

class TextDataset(Dataset):
    """Dataset of fixed-length token blocks cut from a single text file.

    The whole file is tokenized once with ``tokenizer.encode`` and split into
    consecutive, non-overlapping blocks of ``block_size`` tokens. A trailing
    remainder shorter than ``block_size`` is discarded.
    """

    def __init__(self, tokenizer, file_path, block_size=512):
        with open(file_path, encoding="utf-8") as corpus:
            raw_text = corpus.read()
        token_ids = tokenizer.encode(raw_text)

        # Start offsets of every complete block.
        block_starts = range(0, len(token_ids) - block_size + 1, block_size)
        self.examples = [
            token_ids[start:start + block_size] for start in block_starts
        ]

    def __len__(self):
        """Number of complete blocks."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return block ``i`` as a 1-D ``torch.long`` tensor."""
        block = self.examples[i]
        return torch.tensor(block, dtype=torch.long)

# --- Model and tokenizer -----------------------------------------------------
# Load the pretrained "gpt2" checkpoint and its tokenizer.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

# --- Training data -----------------------------------------------------------
# Cut the corpus file written earlier into blocks of `block_size` tokens.
block_size = 512
train_dataset = TextDataset(tokenizer, "fine_tuning_data.txt", block_size=block_size)

# NOTE(review): mlm=False presumably selects causal (non-masked) language-model
# collation — confirm against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

# --- Trainer configuration ---------------------------------------------------
# Checkpoints and the final model both go to ./gpt2-finetuned.
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
    learning_rate=5e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run fine-tuning and save the resulting model to ./gpt2-finetuned.
# The printed status messages are in Polish: "Starting GPT-2 fine-tuning..."
# and "Fine-tuning finished. Model saved in directory './gpt2-finetuned'."
if __name__ == "__main__":
    print("Rozpoczęcie fine-tuning modelu GPT-2...")
    trainer.train()
    trainer.save_model("./gpt2-finetuned")
    print("Fine-tuning zakończony. Model zapisany w katalogu './gpt2-finetuned'.")


In [None]:
# Save the base "gpt2" tokenizer files into the fine-tuned model directory so
# that both the model and the tokenizer can be loaded from ./gpt2-finetuned.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

tokenizer.save_pretrained("./gpt2-finetuned")


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [None]:
def load_finetuned_model(model_path: str):
    """Load a GPT-2 model and its tokenizer from ``model_path``.

    The tokenizer's padding token is set to its end-of-sequence token and the
    model is switched to evaluation mode before being returned.

    Args:
        model_path: Directory (or identifier) accepted by ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

    loaded_model = GPT2LMHeadModel.from_pretrained(model_path)
    loaded_model.eval()

    return loaded_model, loaded_tokenizer

def generate_movie_description(model, tokenizer, title: str, genres: str, director: str, max_new_tokens: int = 200) -> str:
    """Sample an overview for a movie from the fine-tuned language model.

    A prompt in the training-data layout is built from the metadata, the
    model samples a continuation, and only the text belonging to this movie's
    overview is returned.

    Args:
        model: Model exposing ``generate`` (e.g. the fine-tuned GPT-2).
        tokenizer: Tokenizer matching ``model``; it must be callable and
            provide ``decode`` and ``eos_token_id``.
        title: Movie title placed on the ``Title:`` line.
        genres: Genre list placed on the ``Genres:`` line.
        director: Director name(s) placed on the ``Director:`` line.
        max_new_tokens: Maximum number of tokens generated after the prompt.

    Returns:
        The generated overview with surrounding whitespace removed. Anything
        after the first blank line is dropped, because a blank line separates
        records in the training data and whatever follows is the start of
        another, invented record.
    """
    prompt = f"Title: {title}\nGenres: {genres}\nDirector: {director}\nOverview:"

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask. Passing it explicitly avoids the "attention mask is not set"
    # warning that appears because pad and eos tokens are the same.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # Keep inputs on the model's device when the model reports one.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full text on
    # "Overview:" picked the LAST occurrence, i.e. the overview of a
    # hallucinated follow-up record whenever the model produced one.
    prompt_length = input_ids.shape[-1]
    continuation = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    # Cut at the record separator (blank line) so trailing "Title: ..." lines
    # of an invented next record are not returned.
    description = continuation.strip().split("\n\n", 1)[0].strip()

    return description

In [None]:
# Load the fine-tuned model and tokenizer from the directory that the training
# and tokenizer-saving cells wrote to.
model_path = "./gpt2-finetuned"
model_tuned, tokenizer_tuned = load_finetuned_model(model_path)


In [None]:
# Sample a description for "The Matrix".
title, genres, director = "The Matrix", "Action, Sci-Fi", "Lana Wachowski, Lilly Wachowski"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated description:
A young boy grows up in a small town with a mysterious older brother, but soon discovers a strange, alien world that he must flee, and must


In [None]:
# Second sample for "The Matrix" (sampling is stochastic, so output differs).
title, genres, director = "The Matrix", "Action, Sci-Fi", "Lana Wachowski, Lilly Wachowski"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
When his friends and family are murdered, a priest decides to use his personal powers to bring about the exorcism that would have


In [None]:
# Sample a description for "The Godfather".
title, genres, director = "The Godfather", "Crime, Drama", "Francis Ford Coppola"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
The mysterious, yet highly skilled, Thing must rescue an old friend and a young woman in a battle to save the world from the demonic monster, while the Thing tries to change the world and destroy the world's


In [None]:
# Second sample for "The Godfather".
title, genres, director = "The Godfather", "Crime, Drama", "Francis Ford Coppola"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
A young aristocrat, the son of a baron, is accused of murder and is sentenced to life imprisonment.

Title


In [None]:
# Sample a description for "The Dark Knight".
title, genres, director = "The Dark Knight", "Action, Crime, Drama", "Christopher Nolan"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
The first four hours of The Dark Knight Rises take place in the year after the events of


In [None]:
# Second sample for "The Dark Knight".
title, genres, director = "The Dark Knight", "Action, Crime, Drama", "Christopher Nolan"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
A man is taken to court for painting his own faces. He is tried and convicted of murder but acquitted.

Title: The Godfather
Genres: Action, Comedy, Family
Director:


In [None]:
# Sample a description for "The Lord of the Rings: The Return of the King".
title, genres, director = (
    "The Lord of the Rings: The Return of the King",
    "Action, Adventure, Drama",
    "Peter Jackson",
)

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
During the Battle of the Great Hall of the East Room, Gandalf and the other hobbits are surrounded by the armies of Sauron, and as they fight, Bilbo and his companions find themselves fighting on the high ground.

Title: The Hobbit: An Unexpected Journey
Genres: Adventure, Adventure, Drama
Director: Steven


In [None]:
# Sample a description for "How to Train Your Dragon".
title, genres, director = (
    "How to Train Your Dragon",
    "Animation, Action, Comedy",
    "Chris Sanders, Dean DeBlois",
)

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
In 1986, a lawyer, a political activist, and a business partner are accused of a scam that involves a scheme to help wealthy people


In [None]:
# Second sample for "How to Train Your Dragon".
title, genres, director = (
    "How to Train Your Dragon",
    "Animation, Action, Comedy",
    "Chris Sanders, Dean DeBlois",
)

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
A young man with a reputation for evil comes to a rescue when a rogue detective who seems to have a connection with evil falls in love with the young woman.

Title: The Lord of the Rings: The Return of the King
Genres: Adventure, Fantasy, Sci-Fi
Director: Ron


In [None]:
# Sample a description for "Goodfellas".
title = "Goodfellas"
genres = "Biography, Crime, Drama"
# Fixed spelling: the prompt previously contained "Martin Scorseses".
director = "Martin Scorsese"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:")
print(description)

Generated description:
A man learns about life in a small town and becomes obsessed with finding out what's going on.


In [None]:
# Sample a description for "How to Lose a Guy in 10 Days".
title, genres, director = "How to Lose a Guy in 10 Days", "Comedy, Romance", "Donald Petrie"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
As the title would suggest, the story of a young man who becomes the new ruler of the Roman Empire and embarks on a bloody mission to save his father from being killed by the Roman


In [None]:
# Sample a description for "The Joker".
title, genres, director = "The Joker", "Crime, Drama, Thriller", "Todd Phillips"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
The movie's story of a young woman who is sold by a wealthy banker to a wealthy French banker and is told the story of a man who has been in debt for years and who has been left behind.


In [None]:
# Second sample for "The Joker".
title, genres, director = "The Joker", "Crime, Drama, Thriller", "Todd Phillips"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
A group of musicians discover that they are the true heirs of one of their favorite musicians.

Title: A New York City Story
Gen


In [None]:
# Sample a description for "Meet Joe Black".
title, genres, director = "Meet Joe Black", "Romance, Fantasy", "Martin Brest"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
The story of Christmas and the Christmas car accident that left a 12-year-old boy with a broken leg.

Title: The Hobbit: The Desolation of Smaug
Genres: Adventure, Fantasy, Sci-Fi
Director: Peter Jackson\Overview


In [None]:
# Sample a description for "Harry Potter and the Philosopher's Stone".
title = "Harry Potter and the Philosopher's Stone"
# Removed the trailing space after "Fantasy"; it ended up inside the prompt's
# "Genres:" line.
genres = "Family, Fantasy"
director = "Chris Columbus"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:")
print(description)

Generated description:
Two friends are asked to help Harry and Ron get through the summer holidays and prevent a child from


In [None]:
# Sample a description for "Once Upon a Time... in Hollywood".
title, genres, director = "Once Upon a Time... in Hollywood", "Comedy, Western", "Quentin Tarantino"

description = generate_movie_description(model_tuned, tokenizer_tuned, title, genres, director)
print("Generated description:", description, sep="\n")

Generated description:
A young man is forced to leave his home to search for his father's body when a robbery is committed.

Title: The Girl with the Dragon Tattoo
Genres:
