In [1]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
   ---------------------------------------- 0.0/491.5 kB ? eta -:--:--
   --- ------------------------------------ 41.0/491.5 kB ? eta -:--:--
   --------- ------------------------------ 112.6/491.5 kB 1.3 MB/s eta 0:00:01
   --------- ------------------------------ 112.6/491.5 kB 1.3 MB/s eta 0:00:01
   --------- ------------------------------ 112.6/491.5 kB 1.3 MB/s eta 0:00:01
   --------------- ---------------------- 194.6/491.5 kB 908.0 



In [1]:
from datasets import load_dataset

dataset = load_dataset("Abirate/english_quotes")

df = dataset['train'].to_pandas()

df.to_csv("english_quotes.csv", index=False)

print(df.head())


                                               quote                 author  \
0     “Be yourself; everyone else is already taken.”            Oscar Wilde   
1  “I'm selfish, impatient and a little insecure....         Marilyn Monroe   
2  “Two things are infinite: the universe and hum...        Albert Einstein   
3                   “So many books, so little time.”            Frank Zappa   
4  “A room without books is like a body without a...  Marcus Tullius Cicero   

                                                tags  
0  [be-yourself, gilbert-perreira, honesty, inspi...  
1  [best, life, love, mistakes, out-of-control, t...  
2  [human-nature, humor, infinity, philosophy, sc...  
3                                     [books, humor]  
4                              [books, simile, soul]  


In [2]:
df = df.dropna(subset=['quote', 'author', 'tags'])

def clean_quote(text):
    return text.strip('“”"').strip()

df['quote'] = df['quote'].apply(clean_quote)
df['quote'] = df['quote'].str.lower()
df['author'] = df['author'].str.strip()
df['tags'] = df['tags'].apply(lambda x: [tag.strip().lower() for tag in x] if isinstance(x, list) else [])

df = df[df['quote'].str.len() <= 250]

df.to_csv("english_quotes_cleaned.csv", index=False)

print("Cleaned data shape:", df.shape)
df.head()


Cleaned data shape: (2162, 3)


Unnamed: 0,quote,author,tags
0,be yourself; everyone else is already taken.,Oscar Wilde,[]
1,"i'm selfish, impatient and a little insecure. ...",Marilyn Monroe,[]
2,two things are infinite: the universe and huma...,Albert Einstein,[]
3,"so many books, so little time.",Frank Zappa,[]
4,a room without books is like a body without a ...,Marcus Tullius Cicero,[]


In [3]:
import pandas as pd

df = pd.read_csv("english_quotes_cleaned.csv")
quotes = df['quote'].tolist()
df.head()

Unnamed: 0,quote,author,tags
0,be yourself; everyone else is already taken.,Oscar Wilde,[]
1,"i'm selfish, impatient and a little insecure. ...",Marilyn Monroe,[]
2,two things are infinite: the universe and huma...,Albert Einstein,[]
3,"so many books, so little time.",Frank Zappa,[]
4,a room without books is like a body without a ...,Marcus Tullius Cicero,[]


In [17]:
df.tail()

Unnamed: 0,quote,author,tags
2157,"stepping onto a brand-new path is difficult, b...",Maya Angelou,[]
2158,morality is simply the attitude we adopt towar...,"Oscar Wilde,",[]
2159,"in life, finding a voice is speaking and livin...",John Grisham,[]
2160,"winter is the time for comfort, for good food ...",Edith Sitwell,[]
2161,silence is so freaking loud,"Sarah Dessen,",[]


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2162 entries, 0 to 2161
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   quote   2161 non-null   object
 1   author  2162 non-null   object
 2   tags    2162 non-null   object
dtypes: object(3)
memory usage: 50.8+ KB


In [20]:
pip install sentence-transformers faiss-cpu

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

In [5]:
quote_embeddings = model.encode(quotes, show_progress_bar=False)
import numpy as np
print("Embedding shape:", np.array(quote_embeddings).shape)


Embedding shape: (2162, 384)


In [6]:
import faiss

dimension = quote_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(quote_embeddings))
print("FAISS index created with", index.ntotal, "quotes")


FAISS index created with 2162 quotes


In [7]:
def search_quotes(query, author_filter=None, k=50):
    query_embedding = model.encode([query])
    D, I = index.search(np.array(query_embedding), k)
    results = []
    for i in I[0]:
        quote = df.iloc[i]['quote']
        author = df.iloc[i]['author']
        tags = df.iloc[i]['tags']
        if author_filter:
            if author_filter.lower() not in author.lower():
                continue

        results.append({
            "quote": quote,
            "author": author,
            "tags": tags
        })

        if len(results) >= 5:
            break

    if author_filter and len(results) == 0:
        print(f"Not found random - {author_filter}")
        fallback_df = df[df['author'].str.lower().str.contains(author_filter.lower())]
        if not fallback_df.empty:
            fallback = fallback_df.sample(min(5, len(fallback_df)))
            for _, row in fallback.iterrows():
                results.append({
                    "quote": row['quote'],
                    "author": row['author'],
                    "tags": row['tags']
                })
        else:
            print("No quotes author")

    return results


In [8]:
def evaluate_query(query, author=None):
    print(f"\n Query: {query}")
    if author:
        print(f" Author filter: {author}")
    
    results = search_quotes(query, author_filter=author)

    if not results:
        print(" No results found.")
    else:
        for i, res in enumerate(results, 1):
            print(f"\n Result {i}")
            print("Quote :", res['quote'])
            print("Author:", res['author'])
            print("Tags  :", res['tags'])


In [9]:
evaluate_query("hope and strength", author="Helen Keller")



 Query: hope and strength
 Author filter: Helen Keller
Not found random - Helen Keller

 Result 1
Quote : i would rather walk with a friend in the dark, than alone in the light.
Author: Helen Keller
Tags  : []

 Result 2
Quote : when one door of happiness closes, another opens; but often we look so long at the closed door that we do not see the one which has been opened for us.
Author: Helen Keller
Tags  : []

 Result 3
Quote : the best and most beautiful things in the world cannot be seen or even touched. they must be felt with the heart
Author: Helen Keller
Tags  : []

 Result 4
Quote : life is either a daring adventure or nothing at all.
Author: Helen Keller,
Tags  : []


In [10]:
evaluate_query("life is beautiful")




 Query: life is beautiful

 Result 1
Quote : life becomes easier and more beautiful when we can see the good in other people.
Author: Roy T. Bennett
Tags  : []

 Result 2
Quote : where there is love there is life.
Author: Mahatma Gandhi
Tags  : []

 Result 3
Quote : love the life you live.live the life you love.
Author: Bob Marley
Tags  : []

 Result 4
Quote : life is to be enjoyed, not endured
Author: Gordon B. Hinckley
Tags  : []

 Result 5
Quote : the most beautiful experience we can have is the mysterious. it is the fundamental emotion that stands at the cradle of true art and true science.
Author: Albert Einstein,
Tags  : []
