## Imports

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer
import nltk
import os
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt', quiet=True)
nltk.data.path.append('/Users/trevordoucet/nltk_data')
os.environ['NLTK_DATA'] = '/Users/trevordoucet/nltk_data'

## Loading Pre-Trained Model & Training

Link here

In [3]:
df = pd.read_excel('../data/final_descriptions.xlsx')
len(df)

39450

Training setup

In [7]:
# Model
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Huggingface df
dataset = Dataset.from_pandas(df)

# Splitting the dataset into train, validation, and test sets
split_dataset = dataset.train_test_split(test_size=0.2)
test_validation = split_dataset['test'].train_test_split(test_size=0.5)

dataset = DatasetDict({
    "train": split_dataset['train'],
    "validation": test_validation['train'],
    "test": test_validation["test"]
})

# Preprocessing
def preprocess(examples):
    model_inputs = tokenizer(examples["two_sentence_summary"], max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

tokenized_datasets = dataset.map(preprocess, batched=True, remove_columns=['two_sentence_summary'])

# Training arguments
training_arguments = TrainingArguments(
    output_dir="./results_overview",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    logging_dir="./logs_overview",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_strategy='steps',
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to=[]
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer
)

results = trainer.evaluate(tokenized_datasets["test"])
print("Evaluation results:", results)

eval_loss = results['eval_loss']
perplexity = math.exp(eval_loss)
print("Perplexity:", perplexity)

Map: 100%|██████████| 31560/31560 [00:07<00:00, 4284.46 examples/s]
Map: 100%|██████████| 3945/3945 [00:01<00:00, 3604.91 examples/s]
Map: 100%|██████████| 3945/3945 [00:01<00:00, 3508.24 examples/s]
  trainer = Trainer(


Evaluation results: {'eval_loss': 3.288257598876953, 'eval_model_preparation_time': 0.0022, 'eval_runtime': 146.5729, 'eval_samples_per_second': 26.915, 'eval_steps_per_second': 13.461}
Perplexity: 26.79613334341583


## Save

In [16]:
trainer.save_model("../saved_overview_model")

## Summary Output

In [4]:
# Model load
model_name = "../saved_overview_model"
tokenizer = BartTokenizer.from_pretrained(model_name, local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(model_name, local_files_only=True)



In [None]:
# Movie concepts
prompts = [
    "A kid's imaginary friend returns years later with a real body and a warning.",
    "An android nanny begins to question her programming after reading poetry.",
    "A haunted ship appears off the coast every full moon—and takes one passenger."
]

for prompt in prompts:
    # Encoding
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=256, truncation=True)
    input_ids = input_ids.to(model.device)
    
    # Generate text
    outputs = model.generate(
        input_ids,
        max_new_tokens=60,
        min_length=40,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2,
        temperature=0.01
    )

    # Decoding
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Remove repeated pieces
    split_prompt = prompt.split(":", 1)  
    if len(split_prompt) == 2:
        story_part = split_prompt[1].strip() 
        if generated_text.startswith(story_part):
            generated_text = generated_text[len(story_part):].strip()

    print("Prompt:")
    print(prompt)
    print("Generated Text:")
    print(generated_text)
    print("\n" + "-"*50 + "\n")

Prompt:
Generated Text:

--------------------------------------------------

Prompt:
An android nanny begins to question her programming after reading poetry.
Generated Text:
An android nanny begins to question her programming after reading poetry. The android is a nannie who works for a family of five. She is also a teacher and a nurse. Her name is Nanny.

--------------------------------------------------

Prompt:
A haunted ship appears off the coast every full moon—and takes one passenger.
Generated Text:
A haunted ship appears off the coast every full moon. The ship takes one passenger. It's a ghost ship. But it's not dangerous. Just look at the pictures. They're not scary.

--------------------------------------------------



## Parameter loop

In [7]:
prompts_df = pd.read_csv("../data/prompts/generated_prompts.csv")

results = []

# Alternative parameters for temperatures, min and max lengths
temperatures = [0.01, 0.5, 0.8]
min_lengths = [40, 50]
max_lengths = [60, 80]

# Loop over each prompt and parameter combination
for prompt in prompts_df["Movie Prompt"]:
    for temp in temperatures:
        for min_len in min_lengths:
            for max_len in max_lengths:
                if max_len <= min_len:
                    continue

                # Encoding
                input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=256, truncation=True)

                # Generate
                outputs = model.generate(
                    input_ids,
                    max_length=max_len,
                    min_length=min_len,
                    do_sample=True,
                    top_k=50,
                    top_p=0.95,
                    no_repeat_ngram_size=2,
                    temperature=temp
                )

                # Decoding
                generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

                results.append({
                    "Prompt": prompt,
                    "Temperature": temp,
                    "Min Length": min_len,
                    "Max Length": max_len,
                    "Generated Summary": generated_text
                })

df_results = pd.DataFrame(results)
file_path = "../output_summaries/generated_movie_overviews.xlsx"

with pd.ExcelWriter(file_path) as writer:
    for (temp, min_len, max_len), group in df_results.groupby(["Temperature", "Min Length", "Max Length"]):
        sheet_name = f"T{temp}_min{min_len}_max{max_len}"
        group.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Results outputting to {file_path}")

Results outputting to ../output_summaries/generated_movie_overviews.xlsx


## Cosine similarity analysis from generated text and training data

In [None]:
generated_df = pd.read_excel("../output_summaries/generated_movie_overviews.xlsx", sheet_name="T0.01_min40_max60")
final_df = pd.read_excel("../data/final_descriptions.xlsx")
final_df["Combined Summary"] = final_df["two_sentence_summary"]

# TF-IDF vectorizer
vectorizer = TfidfVectorizer()
all_texts = list(generated_df["Generated Summary"]) + list(final_df["Combined Summary"])
vectorizer.fit(all_texts)
generated_vectors = vectorizer.transform(generated_df["Generated Summary"])
final_vectors = vectorizer.transform(final_df["Combined Summary"])

# Cosine similarity
similarity_matrix = cosine_similarity(generated_vectors, final_vectors)

# Matches
highest_matches = []
for i, sim_row in enumerate(similarity_matrix):
    max_index = sim_row.argmax()
    max_score = sim_row[max_index]
    movie_title = final_df.iloc[max_index]["title"]
    final_summary = final_df.iloc[max_index]["two_sentence_summary"]
    highest_matches.append({
        "Generated Summary": generated_df.iloc[i]["Generated Summary"],
        "Final Two Sentence Summary": final_summary,
        "Best Match Title": movie_title,
        "Similarity Score": max_score
    })


highest_df = pd.DataFrame(highest_matches)
highest_df.to_excel("../output_summaries/summary_similarity_results.xlsx", index=False)


                                    Generated Summary  \
0   A former astronaut opens a roadside diner to r...   
1   A teenage hacker discovers an AI that claims t...   
2   A mail carrier uncovers decades of unsent love...   
3   Two rival magicians are forced to team up to s...   
4   An elderly librarian finds a book that predict...   
..                                                ...   
90  A child’s imaginary friend recruits them for a...   
91  A brother and sister inherit a hotel where the...   
92  A scientist’s experiment goes wrong, freezing ...   
93  A single mom’s new houseplant starts whisperin...   
94  A man finds a library where each book is someo...   

                           Final Two Sentence Summary  \
0   A rural roadside diner becomes the host of a m...   
1   A mental patient with a heart problem, Xu Lian...   
2   The recovery of a mail bag stolen in a robbery...   
3   Captured by smugglers when he was just a hatch...   
4   In a futuristic city sharp