![Img](https://app.theheadstarter.com/static/hs-logo-opengraph.png)

# Headstarter Codebase RAG Project

![Screenshot 2024-11-25 at 7 12 58 PM](https://github.com/user-attachments/assets/48dd9de1-b4d2-4318-8f52-85ec209d8ebc)

# Install Necessary Libraries

In [1]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers

Collecting pygithub
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.3-py3-none-any.whl.metadata (1.3 kB)
Collecting pynacl>=1.4.0 (from pygithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone

# Clone a GitHub Repo locally

In [3]:
github_repo = "https://github.com/CoderAgent/SecureAgent"
github_repo.split("/")[-1]

'SecureAgent'

In [4]:
def clone_repository(repo_url):
    """Clone a GitHub repository into ``/content`` and return its local path.

    The target directory is named after the last path segment of
    ``repo_url``. A trailing slash on the URL is ignored, so
    ``https://github.com/org/repo/`` still resolves to ``repo``. If the
    target directory already contains a ``.git`` folder, the existing
    checkout is returned as-is instead of cloning again, so the cell can be
    re-run in the same session.

    Args:
        repo_url: The URL of the GitHub repository.

    Returns:
        The path to the cloned repository.

    Raises:
        ValueError: If no repository name can be derived from ``repo_url``.
    """
    repo_name = repo_url.rstrip("/").split("/")[-1]
    if not repo_name:
        raise ValueError(f"Cannot derive a repository name from URL: {repo_url!r}")
    repo_path = f"/content/{repo_name}"
    # A previous run of this cell may already have cloned the repository.
    if os.path.isdir(os.path.join(repo_path, ".git")):
        return repo_path
    Repo.clone_from(repo_url, repo_path)
    return repo_path

In [5]:
path = clone_repository(github_repo)

# Define which types of files to parse and which folders to ignore

In [6]:
# File extensions of the source files that should be parsed.
SUPPORTED_EXTENSIONS = {
    '.c', '.cpp', '.go', '.h', '.ipynb', '.java', '.js',
    '.jsx', '.py', '.rs', '.swift', '.ts', '.tsx', '.vue',
}

# Folder names to leave out when collecting files.
IGNORED_DIRS = {
    '.git', '.next', '.vscode', '__pycache__', 'build',
    'dist', 'env', 'node_modules', 'vendor', 'venv',
}

In [7]:
def get_file_content(file_path, repo_path):
    """
    Read one file as UTF-8 text and label it with its repo-relative path.

    Args:
        file_path (str): Path to the file to read
        repo_path (str): Repository root; the returned name is ``file_path``
            expressed relative to this directory

    Returns:
        Optional[Dict[str, str]]: ``{"name": <relative path>, "content": <text>}``,
        or None when reading fails (the error is printed, not raised)
    """
    try:
        with open(file_path, mode='r', encoding="utf-8") as handle:
            text = handle.read()
        return {
            "name": os.path.relpath(file_path, repo_path),
            "content": text,
        }
    except Exception as err:
        # Best effort: report the problem and let the caller skip this file.
        print(f"Error reading file {file_path}: {err}")
        return None



def _is_ignored_dir(dir_name):
    """Return True if ``dir_name`` (or its dot-stripped form, e.g. ``.venv``) is in IGNORED_DIRS."""
    return dir_name in IGNORED_DIRS or dir_name.lstrip(".") in IGNORED_DIRS


def get_main_files_content(repo_path: str):
    """
    Get content of supported code files from the local repository.

    Walks ``repo_path`` recursively and reads, via ``get_file_content``,
    every file whose extension is in ``SUPPORTED_EXTENSIONS``. Directories
    are skipped when their own name matches an entry of ``IGNORED_DIRS``
    (dot-prefixed variants such as ``.venv`` included). Matching is done per
    directory name below ``repo_path``, so a folder like ``environment`` is
    not skipped just because it contains ``env``, and the location or name
    of the repository itself never causes files to be excluded.

    Args:
        repo_path: Path to the local repository

    Returns:
        List of dictionaries containing file names and contents, or None if
        an unexpected error interrupts the walk (a message is printed).
    """
    files_content = []
    try:
        for root, dirs, files in os.walk(repo_path):
            # Prune in place so os.walk never descends into ignored folders.
            dirs[:] = [d for d in dirs if not _is_ignored_dir(d)]
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.splitext(file_path)[1] in SUPPORTED_EXTENSIONS:
                    file_content = get_file_content(file_path, repo_path)
                    if file_content:
                        files_content.append(file_content)
    except Exception:
        # Keep the best-effort contract, but let KeyboardInterrupt/SystemExit through.
        print(f"Error reading files in {repo_path}")
        return None
    return files_content

In [8]:
files_content = get_main_files_content(path)

In [9]:
files_content

[{'name': 'src/review-agent.ts',
  'content': 'import { Octokit } from "@octokit/rest";\nimport { WebhookEventMap } from "@octokit/webhooks-definitions/schema";\nimport { ChatCompletionMessageParam } from "groq-sdk/resources/chat/completions";\nimport * as xml2js from "xml2js";\nimport type {\n  BranchDetails,\n  BuilderResponse,\n  Builders,\n  CodeSuggestion,\n  PRFile,\n  PRSuggestion,\n} from "./constants";\nimport { PRSuggestionImpl } from "./data/PRSuggestionImpl";\nimport { generateChatCompletion } from "./llms/chat";\nimport {\n  PR_SUGGESTION_TEMPLATE,\n  buildPatchPrompt,\n  constructPrompt,\n  getReviewPrompt,\n  getTokenLength,\n  getXMLReviewPrompt,\n  isConversationWithinLimit,\n} from "./prompts";\nimport {\n  INLINE_FIX_FUNCTION,\n  getInlineFixPrompt,\n} from "./prompts/inline-prompt";\nimport { getGitFile } from "./reviews";\n\nexport const reviewDiff = async (messages: ChatCompletionMessageParam[]) => {\n  const message = await generateChatCompletion({\n    messages,

# Embeddings

In [10]:
# SentenceTransformer instances already created, keyed by model name, so that
# repeated calls reuse the loaded model instead of instantiating it again.
_EMBEDDING_MODELS = {}


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Encode ``text`` with a SentenceTransformer model.

    The model for ``model_name`` is created on first use and cached for
    later calls with the same name.

    Args:
        text: Input passed unchanged to ``SentenceTransformer.encode``.
        model_name: Name of the SentenceTransformer model to use.

    Returns:
        The result of ``model.encode(text)``.
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model.encode(text)

In [11]:
text = "i like coding"

embeddings = get_huggingface_embeddings(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
embeddings


array([-8.99214391e-03,  7.10570812e-02, -4.78132404e-02,  2.93951924e-03,
        1.87183879e-02,  6.25578165e-02, -7.40241334e-02, -1.79869793e-02,
       -2.33030953e-02,  2.52969563e-02,  1.87584516e-02,  3.26438658e-02,
        2.42187642e-02,  1.06377289e-01,  1.96239557e-02,  4.58664587e-03,
       -2.44689845e-02, -2.76926570e-02, -3.92995449e-03, -1.75862219e-02,
        7.31050968e-03,  3.00472043e-02, -3.48030999e-02, -1.51053397e-02,
        4.95744497e-03, -1.16299819e-02,  2.20015477e-02, -6.58035874e-02,
       -2.77131312e-02,  1.00825511e-01,  3.28472641e-04, -9.48085822e-03,
       -2.19271723e-02, -4.70387796e-03,  1.50509811e-06,  1.31359547e-02,
       -1.65854990e-02,  1.93589348e-02, -2.62506306e-02,  1.61743432e-04,
       -1.85249373e-02, -1.79329310e-02, -2.03854069e-02,  2.96985377e-02,
       -5.77551797e-02,  1.27474498e-02,  8.77798274e-02, -6.04865141e-02,
        9.92059615e-03, -2.96356127e-04,  1.08571432e-03, -1.75144020e-02,
       -3.35006081e-02, -

# Setting up Pinecone
**1. Create an account on [Pinecone.io](https://app.pinecone.io/)**

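**2. Create a new index called "codebase-rag" and set the dimensions to 768 (the size of the vectors produced by the `sentence-transformers/all-mpnet-base-v2` embedding model used in this notebook). Leave the rest of the settings as they are.**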

![Screenshot 2024-11-24 at 10 58 50 PM](https://github.com/user-attachments/assets/f5fda046-4087-432a-a8c2-86e061005238)



**3. Create an API Key for Pinecone**

![Screenshot 2024-11-24 at 10 44 37 PM](https://github.com/user-attachments/assets/e7feacc6-2bd1-472a-82e5-659f65624a88)


**4. Store your Pinecone API Key within Google Colab's secrets section, and then enable access to it (see the blue checkmark)**

![Screenshot 2024-11-24 at 10 45 25 PM](https://github.com/user-attachments/assets/eaf73083-0b5f-4d17-9e0c-eab84f91b0bc)



In [14]:
# Read the Pinecone key from Colab secrets once and reuse it below.
pinecone_api_key = userdata.get("PINECONE_API_KEY")

# Set the PINECONE_API_KEY as an environment variable
# NOTE(review): presumably so PineconeVectorStore can find the key without it
# being passed explicitly - confirm against the langchain_pinecone docs.
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)

# Connect to your Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [15]:
# Create a LangChain vector store for the "codebase-rag" index, using a
# default-configured HuggingFaceEmbeddings instance as the embedding function.
vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())

  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())
  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())


sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

In [20]:
# Insert the codebase embeddings into Pinecone

# One Document per file: the relative path is prepended to the file text, and
# also kept as the "source" metadata field.
documents = [
    Document(
        page_content=f"{entry['name']}\n\n{entry['content']}",
        metadata={"source": entry['name']},
    )
    for entry in files_content
]

# Store the documents in the "codebase-rag" index, under a namespace named
# after the repository URL.
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace="https://github.com/CoderAgent/SecureAgent",
)




  embedding = HuggingFaceEmbeddings(),


In [21]:
documents

[Document(metadata={'source': 'src/review-agent.ts', 'text': 'src/review-agent.ts\n\nimport { Octokit } from "@octokit/rest";\nimport { WebhookEventMap } from "@octokit/webhooks-definitions/schema";\nimport { ChatCompletionMessageParam } from "groq-sdk/resources/chat/completions";\nimport * as xml2js from "xml2js";\nimport type {\n  BranchDetails,\n  BuilderResponse,\n  Builders,\n  CodeSuggestion,\n  PRFile,\n  PRSuggestion,\n} from "./constants";\nimport { PRSuggestionImpl } from "./data/PRSuggestionImpl";\nimport { generateChatCompletion } from "./llms/chat";\nimport {\n  PR_SUGGESTION_TEMPLATE,\n  buildPatchPrompt,\n  constructPrompt,\n  getReviewPrompt,\n  getTokenLength,\n  getXMLReviewPrompt,\n  isConversationWithinLimit,\n} from "./prompts";\nimport {\n  INLINE_FIX_FUNCTION,\n  getInlineFixPrompt,\n} from "./prompts/inline-prompt";\nimport { getGitFile } from "./reviews";\n\nexport const reviewDiff = async (messages: ChatCompletionMessageParam[]) => {\n  const message = await g

# Perform RAG

1. Get your OpenRouter API Key [here](https://openrouter.ai/settings/keys)

2. Paste your OpenRouter Key into your Google Colab secrets under the name `OPEN` (the name the code below reads), and make sure to enable notebook access for it

![Image](https://github.com/user-attachments/assets/bd64c5aa-952e-4a1e-9ac0-01d8fe93aaa1)


In [22]:
# OpenAI client pointed at the OpenRouter API endpoint.
# The API key is read from the Colab secret named "OPEN".
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=userdata.get("OPEN")
)

In [23]:
query = "How is the python parser used?"

In [24]:
raw_query_embedding = get_huggingface_embeddings(query)

In [25]:
raw_query_embedding

array([ 5.57475090e-02, -4.89690378e-02, -8.03438481e-03,  1.77330337e-02,
       -3.26084830e-02,  6.38567805e-02,  2.73242127e-03, -1.15213385e-02,
       -1.75935291e-02, -7.93116316e-02,  3.00973523e-02,  1.57527365e-02,
        4.63028811e-02,  4.52013016e-02, -2.21529277e-03, -3.99203040e-02,
        2.28323080e-02, -9.56473779e-03,  3.77896354e-02,  3.99982855e-02,
       -3.65055352e-02,  3.92722636e-02, -1.14548346e-02,  4.60235067e-02,
       -6.21894328e-03,  2.18612459e-02, -9.04879440e-03,  8.45675357e-03,
       -3.39663066e-02, -6.26854897e-02, -2.80694552e-02, -2.23253071e-02,
        2.63326261e-02, -7.03580864e-03,  1.06159985e-06,  1.17442469e-04,
       -5.08942902e-02,  2.25566197e-02, -2.69679762e-02,  3.29341702e-02,
        7.58820176e-02, -6.14448339e-02,  4.27865312e-02, -2.23320257e-03,
       -3.98334116e-02, -3.09520941e-02,  5.83768897e-02, -4.87756953e-02,
        6.95667043e-02,  5.62432855e-02, -2.70433421e-03, -4.53165285e-02,
       -1.22045195e-02,  

In [27]:
# Query the index with the embedded question: ask for the top 5 matches,
# including their metadata, within the repository-URL namespace.
top_matches = pinecone_index.query(
    vector=raw_query_embedding.tolist(),  # converted from NumPy array to a plain list
    top_k=5,
    include_metadata=True,
    namespace="https://github.com/CoderAgent/SecureAgent"
)

In [28]:
top_matches

{'matches': [{'id': '1c94a3f0-f77c-4945-b591-2d17ce31e42e',
              'metadata': {'source': 'src/context/language/python-parser.ts',
                           'text': 'src/context/language/python-parser.ts\n'
                                   '\n'
                                   'import { AbstractParser, EnclosingContext '
                                   '} from "../../constants";\n'
                                   'export class PythonParser implements '
                                   'AbstractParser {\n'
                                   '  findEnclosingContext(\n'
                                   '    file: string,\n'
                                   '    lineStart: number,\n'
                                   '    lineEnd: number\n'
                                   '  ): EnclosingContext {\n'
                                   '    // TODO: Implement this method for '
                                   'Python\n'
                                   '    re

In [29]:
# Collect the stored text of every returned match, in result order.
context = [
    match['metadata']['text']
    for match in top_matches['matches']
]

In [30]:
context

['src/context/language/python-parser.ts\n\nimport { AbstractParser, EnclosingContext } from "../../constants";\nexport class PythonParser implements AbstractParser {\n  findEnclosingContext(\n    file: string,\n    lineStart: number,\n    lineEnd: number\n  ): EnclosingContext {\n    // TODO: Implement this method for Python\n    return null;\n  }\n  dryRun(file: string): { valid: boolean; error: string } {\n    // TODO: Implement this method for Python\n    return { valid: false, error: "Not implemented yet" };\n  }\n}\n',
 'src/context/language/javascript-parser.ts\n\nimport { AbstractParser, EnclosingContext } from "../../constants";\nimport * as parser from "@babel/parser";\nimport traverse, { NodePath, Node } from "@babel/traverse";\n\nconst processNode = (\n  path: NodePath<Node>,\n  lineStart: number,\n  lineEnd: number,\n  largestSize: number,\n  largestEnclosingContext: Node | null\n) => {\n  const { start, end } = path.node.loc;\n  if (start.line <= lineStart && lineEnd <= en

In [31]:
# Wrap the retrieved snippets in a closed <CONTEXT> block and label the
# question, so the prompt marks where the context ends and the question begins.
augmented_query = (
    "<CONTEXT>\n"
    + "\n\n---------\n\n".join(context)
    + "\n</CONTEXT>\n\nMY QUESTION:\n"
    + query
)

In [33]:
print(augmented_query)

<CONTEXT>
src/context/language/python-parser.ts

import { AbstractParser, EnclosingContext } from "../../constants";
export class PythonParser implements AbstractParser {
  findEnclosingContext(
    file: string,
    lineStart: number,
    lineEnd: number
  ): EnclosingContext {
    // TODO: Implement this method for Python
    return null;
  }
  dryRun(file: string): { valid: boolean; error: string } {
    // TODO: Implement this method for Python
    return { valid: false, error: "Not implemented yet" };
  }
}


---------

src/context/language/javascript-parser.ts

import { AbstractParser, EnclosingContext } from "../../constants";
import * as parser from "@babel/parser";
import traverse, { NodePath, Node } from "@babel/traverse";

const processNode = (
  path: NodePath<Node>,
  lineStart: number,
  lineEnd: number,
  largestSize: number,
  largestEnclosingContext: Node | null
) => {
  const { start, end } = path.node.loc;
  if (start.line <= lineStart && lineEnd <= end.line) {
    c

In [34]:
# System message: sets the assistant's role and tells it to answer from the
# supplied context.
system_promt = """You are a senior software engineer, who is an expert in
TypeScript.

Answer the question I have about the codebase based on the context provided.
Always consider all of the context provided to answer my question.

"""

# Send the system message followed by the context-augmented question.
llm_response = client.chat.completions.create(
    model="deepseek/deepseek-r1:free",
    messages=[
        dict(role="system", content=system_promt),
        dict(role="user", content=augmented_query),
    ],
)

In [35]:
response = llm_response.choices[0].message.content
response

"The Python parser is currently a placeholder and not fully implemented, but here's how it fits into the architecture:\n\n1. **Implementation Status**: The `PythonParser` class in `python-parser.ts` implements the `AbstractParser` interface with stubs for:\n   - `findEnclosingContext()` - Not implemented (returns null)\n   - `dryRun()` - Not implemented (returns default error)\n\n2. **Integration Point**: The parser would be selected for Python files via the `smarterContextPatchStrategy` in `review.ts`, which calls `getParserForExtension()` (presumably mapped to Python file extensions like .py)\n\n3. **Behavior for Python Files**:\n   - Current functionality falls back to `expandedPatchStrategy`\n   - Unlike JavaScript's AST analysis, uses simple line-based context expansion\n   - Adds ±5 lines around diff hunks instead of finding enclosing functions/classes\n\n4. **Key Difference from JS**:\n   ```typescript\n   // JavaScript uses AST traversal from @babel/parser:\n   findEnclosingCon