In [29]:
from github import Github
import base64
import os

def load_github_profile_repos(profile_url):
    token = os.getenv("GITHUB_TOKEN")
    g = Github(token)

    username = profile_url.rstrip("/").split("/")[-1]
    repos = g.get_user(username).get_repos()
    project_list = []

    for repo in repos:
        try:
            readme = repo.get_readme()
            readme_text = base64.b64decode(readme.content).decode("utf-8")
        except:
            readme_text = ""

        project_info = {
            "name": repo.name,
            "description": repo.description or "",
            "topics": repo.get_topics() or [],
            "url": repo.html_url,
            "readme": readme_text
        }

        project_list.append(project_info)

    return project_list

In [30]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
def chunk_all_projects(project_list, chunk_size=500, chunk_overlap=50):
    all_chunks = []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    for proj in project_list:
        text = f"""
Project Name: {proj['name']}
URL: {proj['url']}
Description: {proj['description']}
Topics: {", ".join(proj['topics'])}
README:
{proj['readme']}
"""
        chunks = splitter.split_text(text)

        for c in chunks:
            all_chunks.append({
                "text": c,
                "project_name": proj["name"],
                "url": proj["url"],
                "topics": proj["topics"],
            })

    return all_chunks

In [42]:
project_list = load_github_profile_repos("https://github.com/vainavinair")
print(f"Loaded {len(project_list)} projects")

  g = Github(token)


Loaded 21 projects


In [33]:
project_chunks= chunk_all_projects(project_list)

In [34]:
from sentence_transformers import SentenceTransformer,util
model = SentenceTransformer("all-MiniLM-L6-v2")
project_chunk_embeddings = model.encode(
    [chunk["text"] for chunk in project_chunks],
    convert_to_tensor=True
)

In [39]:
def retrieve_github_chunks(job_description, project_chunks, project_embeddings, k=4):
    job_emb = model.encode(job_description, convert_to_tensor=True)
    scores = util.cos_sim(job_emb, project_embeddings)[0]

    selected = []
    for score, chunk in zip(scores.cpu().numpy(), project_chunks):
        selected.append({
            "score": float(score),
            "text": chunk["text"],
            "project_name": chunk["project_name"],
            "url": chunk["url"],
            "topics": chunk["topics"],
        })

    # Sort by similarity score
    selected = sorted(selected, key=lambda x: x["score"], reverse=True)[:k]
    return selected

In [40]:
retrieved_chunks = retrieve_github_chunks(
    "Looking for a Python developer with experience in web scraping and data analysis.",
    project_chunks,
    project_chunk_embeddings
)

for item in retrieved_chunks:
    print(f"""
Score: {item['score']}
Project: {item['project_name']}
URL: {item['url']}
Topics: {item['topics']}

Chunk:
{item['text']}
""")


Score: 0.44081830978393555
Project: RAG-cold-email
URL: https://github.com/vainavinair/RAG-cold-email
Topics: []

Chunk:
### Next steps:
1. Need to modularize the code
2. Add error handling and logging
3. Go with the scrapping data? or use some API to get real job data
4. Add more data sources fo user context like linkedin profile, portfolio website, user preferences etc.
5. Prompt better, maybe use few shot prompting for email generation


Score: 0.41533350944519043
Project: FlashForge
URL: https://github.com/vainavinair/FlashForge
Topics: []

Chunk:
## Technology Stack

- **Backend**: Flask, Python 3.12
- **Database**: SQLite with persistent storage
- **AI/ML**: Google Gemini API, Hybrid Scheduler (Thompson Sampling + Knowledge Tracing)
- **OCR**: Tesseract OCR
- **Production**: Gunicorn WSGI server

## Documentation

- [Deployment Guide](DEPLOYMENT.md) - Complete Render.com deployment instructions
- [Evaluation Guide](EVALUATION_GUIDE.md) - Research evaluation metrics and methodolo