<a href="https://colab.research.google.com/github/tanvircr7/meh/blob/master/ualr_chatbot_indexing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-cpu nltk



In [None]:
import json

import nltk
from nltk.tokenize import sent_tokenize

# Fetch the NLTK tokenizer resources before sent_tokenize is first called.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Files used by the chunking step: the JSON that is read and the JSON that is written.
INPUT_PATH = "merged_data.json"
OUTPUT_PATH = "processed_chunks.json"

# Per-chunk budget, measured in estimated tokens (see estimate_tokens).
MAX_TOKENS_PER_CHUNK = 300

def estimate_tokens(text):
    """Return a crude token estimate for ``text``.

    The estimate is simply the number of whitespace-separated words,
    i.e. roughly one token per word.
    """
    words = text.split()
    return len(words)

def chunk_text_by_tokens(text, max_tokens=300):
    """Split ``text`` into chunks of whole sentences.

    Sentences (from ``sent_tokenize``) are packed greedily, in order, into
    the current chunk until adding the next one would push the estimated
    token count above ``max_tokens``; the chunk is then closed and a new one
    started. Sentences are never split, so a single sentence longer than
    ``max_tokens`` ends up as a chunk of its own that exceeds the budget.

    Args:
        text: The text to split.
        max_tokens: Estimated-token budget per chunk (see ``estimate_tokens``).

    Returns:
        A list of chunk strings, each the space-joined sentences of one chunk.
        Empty when ``sent_tokenize`` yields no sentences.
    """
    chunks = []
    pending = []          # sentences collected for the chunk being built
    pending_tokens = 0    # estimated tokens currently in `pending`

    for sentence in sent_tokenize(text):
        sentence_tokens = estimate_tokens(sentence)

        # Close the current chunk first if this sentence would overflow it.
        # The `pending` check prevents emitting an empty chunk.
        if pending and pending_tokens + sentence_tokens > max_tokens:
            chunks.append(" ".join(pending))
            pending, pending_tokens = [], 0

        pending.append(sentence)
        pending_tokens += sentence_tokens

    # Flush whatever is left after the last sentence.
    if pending:
        chunks.append(" ".join(pending))

    return chunks


# Load the source documents. Each entry is accessed with `.get`, so the file
# is expected to hold a JSON array of objects.
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

all_chunks = []

for doc_id, item in enumerate(raw_data, start=1):
    # `or ""` covers both a missing key and an explicit JSON null, which
    # would otherwise reach the sentence tokenizer as None and crash.
    text = item.get("cleaned_data") or ""
    chunks = chunk_text_by_tokens(text, MAX_TOKENS_PER_CHUNK)

    # One record per chunk; document and chunk numbers are both 1-based.
    for chunk_id, chunk in enumerate(chunks, start=1):
        all_chunks.append({
            "id": f"{doc_id}-{chunk_id}",
            "source_doc": doc_id,
            "chunk_id": chunk_id,
            "token_estimate": estimate_tokens(chunk),
            "content": chunk
        })


In [None]:
# Report how many chunks were produced and their average estimated size.
print(f"Processed {len(raw_data)} documents into {len(all_chunks)} chunks.")
if all_chunks:
    print(f"Avg tokens per chunk: {sum(c['token_estimate'] for c in all_chunks) // len(all_chunks)}")
else:
    # Avoid a ZeroDivisionError when the input produced no chunks at all.
    print("Avg tokens per chunk: n/a (no chunks produced)")

# Persist the chunk records; ensure_ascii=False keeps non-ASCII text readable.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, indent=2, ensure_ascii=False)


Processed 57 documents into 83 chunks.
Avg tokens per chunk: 199


In [None]:
import json

# Read back the chunk records stored in processed_chunks.json.
with open("processed_chunks.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# Keep only each record's text, renumbered 1..N in file order.
all_docs = [
    {"id": position, "content": record["content"]}
    for position, record in enumerate(raw_data, start=1)
]

print(f"Found {len(all_docs)} documents to embed.")

# Show the character length of every document.
for val in all_docs:
    content_length = len(val["content"])
    print(content_length)

Found 83 documents to embed.
548
583
1538
2176
2095
1164
2159
2193
1150
1072
2303
911
480
2038
1130
2045
1616
2015
1880
1081
2107
1354
2230
2078
1981
1223
1868
2042
239
1934
1125
274
2245
1903
1595
1702
2037
235
2377
1795
2426
1736
1788
1770
1360
471
2353
530
1488
862
1655
1615
1370
1819
1414
735
1909
2018
965
1320
869
2343
2084
2076
1283
2019
1633
1121
1364
636
1084
2142
832
229
619
1623
516
1225
567
973
2350
2216
1333


In [None]:
import pandas as pd

# Tabular view of the documents; the bare last expression renders the frame.
df = pd.DataFrame(data=all_docs)
df

Unnamed: 0,id,content
0,1,The University of Arkansas at Little Rock (UA ...
1,2,## UA Little Rock Graduate Certificate Program...
2,3,**Available Programs (Program | College | Coor...
3,4,Here's a summary of the UA Little Rock Early E...
4,5,* **Post-Baccalaureate GPA Requirement:** Mu...
...,...,...
78,79,The Faculty & Staff Resources section of the D...
79,80,This document serves as a comprehensive guide ...
80,81,"This document, **University of Arkansas at Lit..."
81,82,* Integrity and compliance issues (deception...


In [None]:
import json

# Read the records stored in grad-co.json.
with open("grad-co.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# Keep only each record's text, numbered 1..N in file order.
gradco_info = [
    {"id": position, "content": record["content"]}
    for position, record in enumerate(raw_data, start=1)
]

print(f"Found {len(gradco_info)} documents to embed.")

Found 201 documents to embed.


In [None]:
import pandas as pd

# Load the spreadsheet into a DataFrame and display it.
exc = pd.read_excel(io="gradco-excel.xlsx")
exc

Unnamed: 0,Program Code,Concentration,Field,Department,Program,Campus,College,Activity Indicator,Primary Coordinator,PC - Email,...,Backup Coordinator 2,B2 - Email,Backup Coordinator 3,B3 - Email,Backup Coordinator 4,B4 - Email,Assistant,Asst. - Email,Unnamed: 20,Unnamed: 21
0,ACCT-GC,,ACCX,ACCT,Accounting,Main,CB,Inactive,Sonya Premeaux,sfpremeaux@ualr.edu,...,,,,,,,,,,
1,ACCT-MACC,,ACCT,ACCT,Accounting-MACC,Main,CB,Inactive,Sonya Premeaux,sfpremeaux@ualr.edu,...,,,,,,,,,,
2,AEAX-GC,,AEAX,CHPR,Adult Education: Additional Licensure - GC,Main,BH,Inactive,Jennifer Holtz,jkholtz@ualr.edu,...,,,,,,,Alicia Williams,arwilliams@ualr.edu,,
3,AED-MED,,AED,CHPR,Adult Education - MED,Main,BH,Inactive,Jennifer Holtz,jkholtz@ualr.edu,...,April Chatham-Carpenter,axchathamca@ualr.edu,,,,,Alicia Williams,arwilliams@ualr.edu,,
4,APCS-MA,,APCS,APCS,Applied Communication Studies - MA,Main,CH,Active,Bailey Blackburn,boblackburn@ualr.edu,...,April Chatham-Carpenter,axchathamca@ualr.edu,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,,,,,,,,,,,...,,,,,,,,,,
999,,,,,,,,,,,...,,,,,,,,,,
1000,,,,,,,,,,,...,,,,,,,,,,
1001,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Drop rows in which every cell is empty, then renumber the remaining rows from 0.
exc_cleaned = exc.dropna(how="all").reset_index(drop=True)
exc_cleaned

Unnamed: 0,Program Code,Concentration,Field,Department,Program,Campus,College,Activity Indicator,Primary Coordinator,PC - Email,...,Backup Coordinator 2,B2 - Email,Backup Coordinator 3,B3 - Email,Backup Coordinator 4,B4 - Email,Assistant,Asst. - Email,Unnamed: 20,Unnamed: 21
0,ACCT-GC,,ACCX,ACCT,Accounting,Main,CB,Inactive,Sonya Premeaux,sfpremeaux@ualr.edu,...,,,,,,,,,,
1,ACCT-MACC,,ACCT,ACCT,Accounting-MACC,Main,CB,Inactive,Sonya Premeaux,sfpremeaux@ualr.edu,...,,,,,,,,,,
2,AEAX-GC,,AEAX,CHPR,Adult Education: Additional Licensure - GC,Main,BH,Inactive,Jennifer Holtz,jkholtz@ualr.edu,...,,,,,,,Alicia Williams,arwilliams@ualr.edu,,
3,AED-MED,,AED,CHPR,Adult Education - MED,Main,BH,Inactive,Jennifer Holtz,jkholtz@ualr.edu,...,April Chatham-Carpenter,axchathamca@ualr.edu,,,,,Alicia Williams,arwilliams@ualr.edu,,
4,APCS-MA,,APCS,APCS,Applied Communication Studies - MA,Main,CH,Active,Bailey Blackburn,boblackburn@ualr.edu,...,April Chatham-Carpenter,axchathamca@ualr.edu,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,TAXN-GC,,TAXN,ACCT,Taxation,Main,CB,Inactive,Sonya Premeaux,sfpremeaux@ualr.edu,...,,,,,,,,,,
198,TINV-GC,,TINX,IFSC,Technology Innovation - GC,Main,SS,Inactive,Jared Berleant,jdberleant@ualr.edu,...,Michelle Butler,mdbutler1@ualr.edu,,,,,,,,
199,UGOL-ND,,UNDG,GRAD,Undeclared Graduate Online,Online,GS,Active,Paula Baker,pebaker@ualr.edu,...,,,,,,,,,,
200,UNDG-ND,,UNDG,GRAD,Undeclared Graduate,Main,GS,Active,Paula Baker,pebaker@ualr.edu,...,,,,,,,,,,


In [None]:
import pandas as pd

# Tabular view of the grad-co records; the bare last expression renders the frame.
df = pd.DataFrame(data=gradco_info)
df

Unnamed: 0,id,content
0,1,Program Code: ACCT-GC; Field: ACCX; Department...
1,2,Program Code: ACCT-MACC; Field: ACCT; Departme...
2,3,Program Code: AEAX-GC; Field: AEAX; Department...
3,4,Program Code: AED-MED; Field: AED; Department:...
4,5,Program Code: APCS-MA; Field: APCS; Department...
...,...,...
196,197,Program Code: TAVP-GC; Field: TAVP; Department...
197,198,Program Code: TAXN-GC; Field: TAXN; Department...
198,199,Program Code: TINV-GC; Field: TINX; Department...
199,200,Program Code: UGOL-ND; Field: UNDG; Department...


In [None]:
# ─── index_and_metadata_builder_with_batching.py ──────────────────────────────

import os
import pickle
import numpy as np
import faiss
from google import genai
from google.genai import types

# ─── Config ────────────────────────────────────────────────────────────────────
# The API key is a secret: read it from the environment (export it before
# starting the kernel) instead of storing it in the notebook.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("Please set GEMINI_API_KEY in your environment.")

EMBED_MODEL       = "models/text-embedding-004"
MAX_BATCH_SIZE    = 100   # number of texts sent per embed_content request
OUTPUT_INDEX_PATH = "faiss_index.faiss"
OUTPUT_META_PATH  = "doc_metadata.pkl"

# ─── Prepare Client ───────────────────────────────────────────────────────────
client = genai.Client(api_key=GEMINI_API_KEY)

# ─── Load your docs ────────────────────────────────────────────────────────────
# all_docs should be a list of dicts, each with at least "id" and "content" keys.
# e.g. all_docs = [{"id": 1, "content": "First doc ..."}, {"id": 2, "content": "Second ..."}, ...]
print(f"Found {len(all_docs)} documents to embed.")

# ─── Generate embeddings in batches ────────────────────────────────────────────
embeddings: list[list[float]] = []
texts = [d["content"] for d in all_docs]

# Fail early with a clear message; otherwise the run dies later with an
# IndexError when the embedding dimension is read from an empty array.
if not texts:
    raise RuntimeError("all_docs is empty; there is nothing to embed.")

for start in range(0, len(texts), MAX_BATCH_SIZE):
    batch_texts = texts[start : start + MAX_BATCH_SIZE]
    print(f"Embedding batch {start // MAX_BATCH_SIZE + 1} "
          f"({len(batch_texts)} docs)…")

    resp = client.models.embed_content(
        model=EMBED_MODEL,
        contents=batch_texts,
        config=types.EmbedContentConfig(output_dimensionality=None)
    )
    # resp.embeddings is a list of ContentEmbedding objects
    batch_vectors = [e.values for e in (resp.embeddings or [])]

    # Index rows are matched to all_docs purely by position, so a short or
    # missing response must stop the run instead of misaligning the metadata.
    if len(batch_vectors) != len(batch_texts):
        raise RuntimeError(
            f"Expected {len(batch_texts)} embeddings for the batch starting at "
            f"{start}, got {len(batch_vectors)}."
        )
    embeddings.extend(batch_vectors)

# convert to NumPy array
emb_np = np.array(embeddings, dtype="float32")
print(f"→ Total embeddings shape: {emb_np.shape}")

# ─── Build and save FAISS index ─────────────────────────────────────────────────
# Vectors are added in all_docs order, so row i of the index belongs to all_docs[i].
dim   = emb_np.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(emb_np)
print(f"→ FAISS index contains {index.ntotal} vectors (dim={dim})")

# write index to disk
faiss.write_index(index, OUTPUT_INDEX_PATH)
print(f"✅ Wrote FAISS index to {OUTPUT_INDEX_PATH}")

# ─── Save metadata ─────────────────────────────────────────────────────────────
# The documents are pickled in the same order as the index rows so search
# results can be mapped back to their text.
with open(OUTPUT_META_PATH, "wb") as f:
    pickle.dump(all_docs, f)
print(f"✅ Wrote document metadata to {OUTPUT_META_PATH}")


Found 83 documents to embed.
548
583
1538
2176
2095
1164
2159
2193
1150
1072
2303
911
480
2038
1130
2045
1616
2015
1880
1081
2107
1354
2230
2078
1981
1223
1868
2042
239
1934
1125
274
2245
1903
1595
1702
2037
235
2377
1795
2426
1736
1788
1770
1360
471
2353
530
1488
862
1655
1615
1370
1819
1414
735
1909
2018
965
1320
869
2343
2084
2076
1283
2019
1633
1121
1364
636
1084
2142
832
229
619
1623
516
1225
567
973
2350
2216
1333
Embedding batch 1 (83 docs)…
→ Total embeddings shape: (83, 768)
→ FAISS index contains 83 vectors (dim=768)
✅ Wrote FAISS index to faiss_index.faiss
✅ Wrote document metadata to doc_metadata.pkl
