In [1]:
import pandas as pd

# Location of the quotes dataset: a JSON Lines file addressed through the hf:// scheme.
QUOTES_URL = "hf://datasets/Abirate/english_quotes/quotes.jsonl"

# lines=True parses one JSON record per line into one DataFrame row.
df = pd.read_json(QUOTES_URL, lines=True)


In [2]:
# Only the last expression of a cell is rendered automatically, so the
# preview must be displayed explicitly or its result is silently discarded.
display(df.head())

# Column dtypes and non-null counts (info() prints its report directly).
df.info()

# Missing values per column; rendered as the cell's result.
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2508 entries, 0 to 2507
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   quote   2508 non-null   object
 1   author  2508 non-null   object
 2   tags    2508 non-null   object
dtypes: object(3)
memory usage: 58.9+ KB


quote     0
author    0
tags      0
dtype: int64

In [3]:
import pandas as pd

# Drop rows with missing values. A new frame is bound instead of mutating in
# place, and the index is renumbered so that labels and positions coincide
# (the retrieval cell looks rows up by position with df.iloc).
df = df.dropna().reset_index(drop=True)

# Lowercasing for consistency
df['quote'] = df['quote'].str.lower()
df['author'] = df['author'].str.lower()
df['tags'] = df['tags'].apply(lambda tags: [tag.lower() for tag in tags])

# Combine text for embedding. Built in a single pass over the three columns
# rather than a row-wise DataFrame.apply(axis=1).
df['combined'] = [
    f"{quote} [AUTHOR: {author}] [TAGS: {', '.join(tags)}]"
    for quote, author, tags in zip(df['quote'], df['author'], df['tags'])
]


In [4]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Name of the pretrained sentence-embedding checkpoint that is fine-tuned below.
BASE_MODEL_NAME = 'all-MiniLM-L6-v2'

# Load model
model1 = SentenceTransformer(BASE_MODEL_NAME)







modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
from sentence_transformers import InputExample

# One training pair per quote: a synthetic search query built from the row's
# tags and author, paired with that row's combined text.
examples = [
    InputExample(texts=[f"quotes about {', '.join(tags)} by {author}", combined])
    for tags, author, combined in zip(df['tags'], df['author'], df['combined'])
]

# Ranking loss bound to the model, and shuffled mini-batches of 16 pairs.
train_loss = losses.MultipleNegativesRankingLoss(model1)
train_dataloader = DataLoader(examples, shuffle=True, batch_size=16)


In [6]:
# Fine-tune model1 on the single (dataloader, loss) objective.
model1.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,  # NOTE(review): this comment previously suggested 2-4 epochs, which contradicts the value 10 — confirm the intended count
    show_progress_bar=True
)

# Save the model
model1.save("fine_tuned_quote_model")


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1021
1000,0.0164
1500,0.0052


In [7]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-win_amd64.whl (15.0 MB)
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.0 MB 162.5 kB/s eta 0:01:33
   ---------------------------------------- 0.0/15.0 MB 186.2 kB/s eta 0:01:21
   ---------------------------------------- 0.0/15.0 MB 217.9 kB/s eta 0:01:09
   ---------------------------------------- 0.1/15.0 MB 479.3 kB/s eta 0:00:32
    --------------------------------------- 0.3/15.0 MB 962.4 kB/s eta 0:00:16
   - -------------------------------------- 0.5/15.0 MB 1.7 MB/s eta 0:00:09
   -- ------------------------------------- 0.9/15.0 MB 2.3 MB/s eta 0:00:07
   --- ---


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import faiss
import numpy as np

# Encode data: embed every combined quote text as a contiguous float32 array.
corpus_texts = df['combined'].tolist()
embeddings = np.ascontiguousarray(
    model1.encode(corpus_texts, show_progress_bar=True),
    dtype=np.float32,
)

# IndexFlatL2 sized to the embedding width; vectors are added in DataFrame row order.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

In [10]:
def retrieve(query, top_k=5):
    """Return the quotes whose embeddings are closest to ``query``.

    Args:
        query: Free-text search string; encoded with ``model1``.
        top_k: Maximum number of quotes to return. Values <= 0 yield ``[]``.

    Returns:
        A list of dicts with keys ``quote``, ``author``, ``tags`` and
        ``similarity_score``, in the order returned by ``index.search``.
        ``similarity_score`` is the raw distance reported by the index
        (the index is created as ``IndexFlatL2``, so smaller means closer).
        Fewer than ``top_k`` items are returned when the index cannot
        supply that many hits.
    """
    if top_k <= 0:
        return []

    # Give the index a contiguous float32 matrix, matching how the corpus
    # embeddings were prepared before being added.
    query_emb = np.ascontiguousarray(model1.encode([query]), dtype=np.float32)
    distances, indices = index.search(query_emb, top_k)

    results = []
    # Walk distances and row ids in lockstep instead of re-searching the id
    # list for every hit (quadratic, and wrong when an id repeats).
    for distance, row_pos in zip(distances[0], indices[0]):
        # FAISS fills missing result slots with -1; df.iloc[-1] would
        # silently return the last row, so those slots are skipped.
        if row_pos < 0:
            continue
        row = df.iloc[int(row_pos)]
        results.append({
            "quote": row['quote'],
            "author": row['author'],
            "tags": row['tags'],
            "similarity_score": float(distance),
        })
    return results


In [11]:
import os

import google.generativeai as genai

# Configure Gemini. The API key is read from the environment so that no
# credential is stored in the notebook or its version history.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as leaked and revoked.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=_gemini_api_key)

model = genai.GenerativeModel("gemini-1.5-flash")

def generate_answer(query, context_quotes):
    """Ask the Gemini model to answer ``query`` using the supplied quotes.

    Args:
        query: The user's question, inserted verbatim into the prompt.
        context_quotes: Iterable of dicts with ``quote``, ``author`` and
            ``tags`` keys; each one becomes a line of the prompt's context.

    Returns:
        The ``text`` attribute of the model's response.
    """
    context_lines = []
    for item in context_quotes:
        context_lines.append(
            f"Quote: {item['quote']} (Author: {item['author']}, Tags: {item['tags']})"
        )
    context = "\n".join(context_lines)

    # Prompt layout: instruction, query, context, then a trailing header
    # after which the model writes its output.
    prompt = (
        "You are a semantic quote assistant. Use the following quotes to answer the query.\n"
        "\n"
        f"QUERY: {query}\n"
        "\n"
        "CONTEXT:\n"
        f"{context}\n"
        "\n"
        "STRUCTURED JSON OUTPUT:\n"
    )

    return model.generate_content(prompt).text


In [None]:
!pip install google-generativeai



In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m91.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m119.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hIns

In [12]:
import streamlit as st

# Page header and the free-text search box.
st.title("Semantic Quote Retriever")
user_query = st.text_input("Enter your query (e.g., quotes about courage by women authors):")

# Nothing below runs until a non-empty query has been entered.
if user_query:
    # Show the raw retrieval hits first.
    hits = retrieve(user_query)
    st.json(hits)

    # Generation is only requested when the button is pressed.
    wants_answer = st.button("Generate Structured Answer")
    if wants_answer:
        structured_answer = generate_answer(user_query, hits)
        st.markdown("### Answer")
        st.markdown(structured_answer)

2025-05-30 16:41:32.134 
  command:

    streamlit run c:\Users\lovyv\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [14]:
# NOTE(review): the script passed to `streamlit run` is ipykernel's
# ipykernel_launcher.py, not a file containing the Streamlit UI code, and the
# path is an absolute, machine-specific location — confirm the intended target.
!streamlit run c:\Users\lovyv\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py

^C
