In [1]:
# Step 1: Install and Import Required Libraries
from datasets import load_dataset
import pandas as pd
import numpy as np



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Abirate/english_quotes dataset from HuggingFace
raw_dataset = load_dataset("Abirate/english_quotes")
df = pd.DataFrame(raw_dataset["train"])


In [3]:
df.head()


Unnamed: 0,quote,author,tags
0,“Be yourself; everyone else is already taken.”,Oscar Wilde,"[be-yourself, gilbert-perreira, honesty, inspi..."
1,"“I'm selfish, impatient and a little insecure....",Marilyn Monroe,"[best, life, love, mistakes, out-of-control, t..."
2,“Two things are infinite: the universe and hum...,Albert Einstein,"[human-nature, humor, infinity, philosophy, sc..."
3,"“So many books, so little time.”",Frank Zappa,"[books, humor]"
4,“A room without books is like a body without a...,Marcus Tullius Cicero,"[books, simile, soul]"


In [4]:
# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())



Missing values:
quote     0
author    0
tags      0
dtype: int64


In [5]:
# Lowercase all textual fields for uniformity
df["quote"] = df["quote"].str.lower()
df["author"] = df["author"].str.lower()
df["tags"] = df["tags"].apply(lambda tags: [t.lower() for t in tags])


In [6]:
# Optional: create a column that merges all text for easier embedding later
df["combined"] = df.apply(lambda row: row["quote"] + " - " + row["author"] + " [" + ", ".join(row["tags"]) + "]", axis=1)


In [7]:
# Save cleaned dataset 
df.to_csv("cleaned_quotes.csv", index=False)


In [8]:
print("\nCleaned dataset info:")
print(df.info())
df.sample(5)[["quote", "author", "tags"]]



Cleaned dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2508 entries, 0 to 2507
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   quote     2508 non-null   object
 1   author    2508 non-null   object
 2   tags      2508 non-null   object
 3   combined  2508 non-null   object
dtypes: object(4)
memory usage: 78.5+ KB
None


Unnamed: 0,quote,author,tags
571,“it is not the critic who counts; not the man ...,theodore roosevelt,"[inspirational, politics, presidential]"
1875,"“a wonderful fact to reflect upon, that every ...","charles dickens,","[communication, psychology]"
869,"“books are the plane, and the train, and the r...","anna quindlen,","[books, journey, reading, travel]"
1010,"“laters, baby.”","e.l. james,","[christina-grey, e-l-james, fifty-shades-of-grey]"
846,“let no man pull you so low as to hate him.”,"martin luther king jr.,","[hatred, wisdom]"


In [9]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader
import random


In [10]:
# Load the cleaned dataset

df = pd.read_csv("cleaned_quotes.csv")


In [11]:
# Generate synthetic query–quote pairs for training
examples = []
for _, row in df.iterrows():
    tags_str = ", ".join(row["tags"].strip("[]").replace("'", "").split(", "))
    synthetic_query = f"quotes about {tags_str} by {row['author']}"
    examples.append(InputExample(texts=[synthetic_query, row["quote"]]))



In [12]:
# Initialize a pre-trained sentence embedding model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)


In [13]:
# Create a DataLoader for training
dataloader = DataLoader(examples, shuffle=True, batch_size=16)


In [14]:
# Use MultipleNegativesRankingLoss for contrastive fine-tuning
train_loss = losses.MultipleNegativesRankingLoss(model)


In [15]:
# Fine-tune the model
model.fit(
    train_objectives=[(dataloader, train_loss)],
    epochs=1,  # Increase to 2–3 if training longer
    warmup_steps=100
)


                                                                     

Step,Training Loss


In [16]:
model.save("./model/fine_tuned_quote_model")
print("\n✅ Fine-tuned model saved at 'fine_tuned_quote_model'")



✅ Fine-tuned model saved at 'fine_tuned_quote_model'


In [20]:
pip freeze > requirements.txt


Note: you may need to restart the kernel to use updated packages.
