In [1]:
# SQL Database


# Vector database - is a specialized database for storing vectors, searching vectors, searching vectors

# Vector - simple numerical representyation of a text - [0, 1, 1.2, -1] 

# String > Vector (several ways are there) -> Vector database
# But we will use an Embedding Model

# Embedding, # Embedding Model
# Cosine Similarity Search

# Vector 1, vector 2, Vector 3

# Vector 4



In [2]:
resume_json = {
    "resume_id": 12,
    "extracted_data": {
        "name": "Charly Dolman",
        "email": "email@email.com",
        "phone": "890-555-0401",
        "summary": "High energy Personal Trainer with 8 years experience in personal and group fitness. Proven track record in motivating clients to safely achieve weight loss goals through detailed and effective diet plans. Certified ACE Instructor with CPR training and a talent for developing fitness programs that exceed revenue targets and increase memberships.",
        "skills": [
            "Knowledge in Nutrition",
            "Proficient at sales",
            "Self-motivated",
            "Organized",
            "Energetic and upbeat",
            "Effective communicator"
        ],
        "experience": [
            {
                "company": "Blink Fitness",
                "role": "Personal Trainer",
                "start_date": "October 2023",
                "end_date": "Present",
                "description": "Preparing comprehensive diet plans based on clients’ weight and health goals.\nAssisting with marketing efforts to retain existing clients.\nLeading fitness groups of 10-20 clients three times per week.\nDeveloping new fitness workshops. Increased new memberships by 32% by end of 2017.\nTracking client progress using specialized software to generate accurate reports."
            },
            {
                "company": "NYSC Flagship Astor Place",
                "role": "Personal Trainer",
                "start_date": "April 2022",
                "end_date": "September 2024",
                "description": "Designing and implementing tailored fitness programs for upscale clients.\nTraining 70 clients per month where 85% reached their fitness goals within six weeks.\nAdvising individual clients on health, nutrition, and lifestyle changes.\nProviding clients with safe and effective exercises they can perform at home.\nDemonstrating exercises for all clients to ensure proper technique."
            },
            {
                "company": "Crunch Fitness",
                "role": "Personal Fitness Trainer",
                "start_date": "February 2021",
                "end_date": "March 2022",
                "description": "Designing one-on-one nutritional plans for clients. More than 87% achieved their weight loss goals.\nIntroducing new gym members to personal training packages and conducting tours of the health club.\nEnsuring proper use and cleanliness of gym equipment.\nParticipating in PT Department meetings and promotional events."
            }
        ],
        "education": [
            {
                "institution": "Syracuse University",
                "degree": "Bachelor's Degree in Kinesiology",
                "start_year": "June 2018",
                "end_year": "January 2021"
            },
            {
                "institution": "",
                "degree": "Certification in First Aid and CPR",
                "start_year": "",
                "end_year": "November 2022"
            },
            {
                "institution": "",
                "degree": "ACE Certified Group Fitness Instructor",
                "start_year": "",
                "end_year": "March 2020"
            },
            {
                "institution": "",
                "degree": "ACE Certified Personal Trainer",
                "start_year": "",
                "end_year": "January 2019"
            }
        ]
    }
}

In [3]:
# question = personal experience of charly dolman

# First > Convert that input json into a vector and store it in a vector database
# Convert the question also to a vector and then take the question_vector and (cosine) search it in the vector database

In [4]:
# Qdrant is the vector database

In [5]:
import os
import uuid
from typing import List, Dict
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from google import genai   # google-genai SDK
import json

In [6]:
# --- CONFIG ---
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
COLLECTION_NAME = "resumes"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # set this env var with your key
EMBED_MODEL = "gemini-embedding-001"          # recommended model
# You may choose embedding_dim after inspecting the returned embedding length; default pick 1536 eventually.
DEFAULT_VECTOR_SIZE = 1536

In [8]:
# --- init clients ---
qclient = QdrantClient(url=QDRANT_URL)
# genai client; pass api_key explicitly
genai_client = genai.Client(api_key=GEMINI_API_KEY)

In [9]:
# Take the big nested dic -> break it down into small dict, and then convert each small dict into a vector
# > and then store those small vectors into the vector database

In [10]:
import uuid
import re

def clean_text(text: str) -> str:
    if not text:
        return ""
    # Normalize whitespace, remove weird bullets, collapse newlines
    text = re.sub(r"[•\u2022]", "-", text)   # convert bullet characters
    text = re.sub(r"\s+", " ", text).strip()
    return text



rid = resume_json["resume_id"]
data = resume_json["extracted_data"]
rname = data.get("name", "").strip()

chunks = []

# -------------------------
# Summary Section
# -------------------------
summary = clean_text(data.get("summary", ""))
if summary:
    chunks.append({
        "id": str(uuid.uuid4()),
        "text": f"[SUMMARY] {summary}",
        "metadata": {
            "resume_id": rid,
            "candidate_name": rname,
            "section": "summary"
        }
    })

# -------------------------
# Skills Section
# -------------------------
for s in data.get("skills", []):
    skill = clean_text(s)
    if not skill:
        continue
    chunks.append({
        "id": str(uuid.uuid4()),
        "text": f"[SKILL] {skill}",
        "metadata": {
            "resume_id": rid,
            "candidate_name": rname,
            "section": "skill",
            "skill": skill
        }
    })

# -------------------------
# Experience Section
# -------------------------
for exp in data.get("experience", []):
    role = clean_text(exp.get("role", ""))
    company = clean_text(exp.get("company", ""))
    start = exp.get("start_date")
    end = exp.get("end_date")
    desc = clean_text(exp.get("description", ""))

    text_parts = []
    if role or company:
        text_parts.append(f"{role} at {company}")
    if start or end:
        text_parts.append(f"({start} - {end})")
    if desc:
        text_parts.append(desc)

    full_text = " ".join(text_parts).strip()

    if full_text:
        chunks.append({
            "id": str(uuid.uuid4()),
            "text": f"[EXPERIENCE] {full_text}",
            "metadata": {
                "resume_id": rid,
                "candidate_name": rname,
                "section": "experience",
                "company": company,
                "role": role,
                "start_date": start,
                "end_date": end
            }
        })

# -------------------------
# Education Section
# -------------------------
for edu in data.get("education", []):
    degree = clean_text(edu.get("degree", ""))
    inst = clean_text(edu.get("institution", ""))
    start = edu.get("start_year")
    end = edu.get("end_year")

    text = f"{degree} at {inst}".strip()
    if start or end:
        text += f" ({start}-{end})"

    if text:
        chunks.append({
            "id": str(uuid.uuid4()),
            "text": f"[EDUCATION] {text}",
            "metadata": {
                "resume_id": rid,
                "candidate_name": rname,
                "section": "education",
                "institution": inst,
                "degree": degree
            }
        })

In [26]:
for chunk in chunks:
    print(chunk)

{'id': 'dd3bf9ac-10d9-4923-bde0-1efa4c77d958', 'text': '[SUMMARY] High energy Personal Trainer with 8 years experience in personal and group fitness. Proven track record in motivating clients to safely achieve weight loss goals through detailed and effective diet plans. Certified ACE Instructor with CPR training and a talent for developing fitness programs that exceed revenue targets and increase memberships.', 'metadata': {'resume_id': 12, 'candidate_name': 'Charly Dolman', 'section': 'summary'}}
{'id': '3411a729-65c4-4e74-a741-b5fbeed63409', 'text': '[SKILL] Knowledge in Nutrition', 'metadata': {'resume_id': 12, 'candidate_name': 'Charly Dolman', 'section': 'skill', 'skill': 'Knowledge in Nutrition'}}
{'id': '78315f05-ec64-40ce-aa5c-3f05fa3126d3', 'text': '[SKILL] Proficient at sales', 'metadata': {'resume_id': 12, 'candidate_name': 'Charly Dolman', 'section': 'skill', 'skill': 'Proficient at sales'}}
{'id': '2ee1c1a5-1d4f-4969-80f8-1381ad8a3d57', 'text': '[SKILL] Self-motivated', 'm

In [16]:
def embed_texts(texts):
    res = genai_client.models.embed_content(model=EMBED_MODEL, contents = texts)
    return res.embeddings

In [17]:
texts = [c["text"] for c in chunks]
texts

['[SUMMARY] High energy Personal Trainer with 8 years experience in personal and group fitness. Proven track record in motivating clients to safely achieve weight loss goals through detailed and effective diet plans. Certified ACE Instructor with CPR training and a talent for developing fitness programs that exceed revenue targets and increase memberships.',
 '[SKILL] Knowledge in Nutrition',
 '[SKILL] Proficient at sales',
 '[SKILL] Self-motivated',
 '[SKILL] Organized',
 '[SKILL] Energetic and upbeat',
 '[SKILL] Effective communicator',
 '[EXPERIENCE] Personal Trainer at Blink Fitness (October 2023 - Present) Preparing comprehensive diet plans based on clients’ weight and health goals. Assisting with marketing efforts to retain existing clients. Leading fitness groups of 10-20 clients three times per week. Developing new fitness workshops. Increased new memberships by 32% by end of 2017. Tracking client progress using specialized software to generate accurate reports.',
 '[EXPERIENCE

In [18]:
embeddings = embed_texts(texts)

In [19]:
embeddings

[ContentEmbedding(
   values=[
     -0.017790345,
     -0.003249489,
     0.031685192,
     -0.02794303,
     -0.007565486,
     <... 3067 more items ...>,
   ]
 ),
 ContentEmbedding(
   values=[
     0.011130208,
     -0.012043253,
     0.049660917,
     -0.031749263,
     -0.014531047,
     <... 3067 more items ...>,
   ]
 ),
 ContentEmbedding(
   values=[
     -0.009110915,
     0.0120729115,
     0.040417302,
     -0.08329942,
     -0.02145441,
     <... 3067 more items ...>,
   ]
 ),
 ContentEmbedding(
   values=[
     0.021294322,
     0.0035080328,
     0.041324373,
     -0.064470746,
     0.00973476,
     <... 3067 more items ...>,
   ]
 ),
 ContentEmbedding(
   values=[
     0.010358264,
     -0.022852026,
     0.016691564,
     -0.057521112,
     -0.0054352386,
     <... 3067 more items ...>,
   ]
 ),
 ContentEmbedding(
   values=[
     0.023145175,
     0.00322471,
     0.009106555,
     -0.0754084,
     -0.004346843,
     <... 3067 more items ...>,
   ]
 ),
 ContentEmbeddin

In [23]:
vec_len = len(embeddings[0].values)
vec_len

3072

In [24]:
qclient.recreate_collection(
    collection_name="resumes",
    vectors_config=qmodels.VectorParams(size=vec_len, distance=qmodels.Distance.COSINE),
)

  qclient.recreate_collection(


True

In [None]:
# Points -> an unique id + the vector + some additional metadata

In [25]:
l1 = [1,2,3]
l2 = ['a', 'b', 'c']
for e1, e2 in zip(l1, l2):
    print(e1, e2)

1 a
2 b
3 c


In [28]:
points = []
for c, vec in zip(chunks, embeddings):
    pid = str(uuid.uuid4())
    payload = c["metadata"]
    payload["text"] = c["text"]
    points.append(qmodels.PointStruct(id=pid, vector=vec.values, payload=payload))

In [30]:
# upsert
qclient.upsert(collection_name=COLLECTION_NAME, points=points)
print(f"Upserted {len(points)} points to {COLLECTION_NAME}.")

Upserted 14 points to resumes.


In [31]:
question = "experience of personal trainers"

In [32]:
input_question_vector = embed_texts([question])

In [34]:
ques_vector = input_question_vector[0].values

In [37]:
collection_name = "resumes"
res = qclient.query_points(
    collection_name = collection_name,
    query = ques_vector
)

In [38]:
results = []
for point in res.points:
    results.append({
        "id": point.id,
        "score": point.score,
        "payload": point.payload
    })

In [41]:
for r in results[0:3]:
    print(r["score"], r["payload"])

0.73127234 {'resume_id': 12, 'candidate_name': 'Charly Dolman', 'section': 'experience', 'company': 'Crunch Fitness', 'role': 'Personal Fitness Trainer', 'start_date': 'February 2021', 'end_date': 'March 2022', 'text': '[EXPERIENCE] Personal Fitness Trainer at Crunch Fitness (February 2021 - March 2022) Designing one-on-one nutritional plans for clients. More than 87% achieved their weight loss goals. Introducing new gym members to personal training packages and conducting tours of the health club. Ensuring proper use and cleanliness of gym equipment. Participating in PT Department meetings and promotional events.'}
0.7052334 {'resume_id': 12, 'candidate_name': 'Charly Dolman', 'section': 'experience', 'company': 'NYSC Flagship Astor Place', 'role': 'Personal Trainer', 'start_date': 'April 2022', 'end_date': 'September 2024', 'text': '[EXPERIENCE] Personal Trainer at NYSC Flagship Astor Place (April 2022 - September 2024) Designing and implementing tailored fitness programs for upscale