In [1]:
# Install required dependencies
!pip install -qU langchain openai chromadb tiktoken gitpython transformers sentence-transformers torch faiss-cpu tree-sitter tree-sitter-python
!apt-get update && apt-get install -y git

# For AST parsing
!pip install -qU libclang
!python -m pip install 'git+https://github.com/tree-sitter/tree-sitter-python'

# For code analysis
!pip install -qU pygments

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m787.8/787.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m634.6/634.6 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.3/112.3 kB[0m [31m6.9 MB/s[0m eta [36m0

In [4]:
import os
import tempfile
import requests
import subprocess
import ast
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import hashlib

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.schema import BaseRetriever

# Code analysis imports
import tree_sitter
from tree_sitter import Language, Parser
import pygments
from pygments.lexers import get_lexer_for_filename
from pygments.token import Token

# Hugging Face imports
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig
)

import torch
from huggingface_hub import login

In [4]:
# @title **Step 1: Install and Import Dependencies**
print("Installing required libraries...")
!pip install -qU langchain chromadb sentence-transformers gitpython transformers accelerate bitsandbytes

print("Libraries installed! Now importing...")
import os
import subprocess
import ast
from pathlib import Path
from typing import List, Dict, Any
import hashlib

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

# Hugging Face imports
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch

# @title **Step 2: Define Configuration and Helper Functions**
class Config:
    EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # Good general-purpose model
    LLM_MODEL_NAME = "microsoft/DialoGPT-medium"  # Lightweight model for Colab
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 50
    CACHE_DIR = "./repository_cache"
    PERSIST_DIRECTORY = "./chroma_db"

config = Config()

def clone_repository(repo_url: str, local_path: str) -> bool:
    """Clones a GitHub repository."""
    try:
        if os.path.exists(local_path):
            subprocess.run(["rm", "-rf", local_path], check=True)
        result = subprocess.run(["git", "clone", repo_url, local_path],
                               capture_output=True, text=True, timeout=300)
        return result.returncode == 0
    except Exception as e:
        print(f"Error cloning: {e}")
        return False

def analyze_repository_structure(repo_path: str) -> Dict[str, Any]:
    """Analyzes the repo to find frameworks, entry points, etc."""
    structure = {"languages": {}, "files_by_type": {}, "entry_points": [], "framework": None, "package_manager": None}
    framework_indicators = {
        "react": ["package.json", "src/App.js", "src/App.jsx", "src/index.js"],
        "django": ["manage.py", "requirements.txt", "wsgi.py"],
        "flask": ["app.py", "application.py", "requirements.txt", "wsgi.py"],
        "spring": ["pom.xml", "src/main/java", "application.properties"],
    }
    package_managers = {"npm": "package.json", "pip": "requirements.txt", "maven": "pom.xml"}

    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for file in files:
            file_path = os.path.join(root, file)
            rel_path = os.path.relpath(file_path, repo_path)
            _, ext = os.path.splitext(file)
            if ext:
                if ext not in structure["files_by_type"]:
                    structure["files_by_type"][ext] = []
                structure["files_by_type"][ext].append(rel_path)
            for framework, indicators in framework_indicators.items():
                for indicator in indicators:
                    if rel_path.endswith(indicator):
                        structure["framework"] = framework
            for manager, file_name in package_managers.items():
                if file == file_name:
                    structure["package_manager"] = manager
            if file in ["main.py", "app.py", "index.js", "server.js", "Main.java"]:
                structure["entry_points"].append(rel_path)
    return structure

# @title **Step 3: Process the Repository and Create the AI Agent**
def process_repository(repo_path: str) -> List[Document]:
    """Reads all files in the repo and splits them into chunks."""
    documents = []
    ignored_dirs = ['.git', '__pycache__', 'node_modules']
    text_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=config.CHUNK_SIZE, chunk_overlap=config.CHUNK_OVERLAP
    )

    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if d not in ignored_dirs]
        for file in files:
            file_path = os.path.join(root, file)
            rel_path = os.path.relpath(file_path, repo_path)
            _, ext = os.path.splitext(file)
            if ext not in ['.py', '.js', '.java', '.md', '.txt']:  # Process only these file types for simplicity
                continue
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                if not content.strip():
                    continue
                chunks = text_splitter.split_text(content)
                for i, chunk in enumerate(chunks):
                    doc = Document(
                        page_content=chunk,
                        metadata={"source": rel_path, "chunk": i, "total_chunks": len(chunks)}
                    )
                    documents.append(doc)
            except Exception as e:
                print(f"Error with {file_path}: {e}")
                continue
    return documents

def initialize_llm():
    """Initializes a lightweight LLM for Colab."""
    print("Loading the language model...")
    tokenizer = AutoTokenizer.from_pretrained(config.LLM_MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        config.LLM_MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    text_gen_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
        repetition_penalty=1.15
    )
    return HuggingFacePipeline(pipeline=text_gen_pipeline)

def create_qa_system(documents):
    """Creates the full question-answering system."""
    print("Creating vector database...")
    embeddings = HuggingFaceEmbeddings(model_name=config.EMBEDDING_MODEL_NAME)
    vector_store = Chroma.from_documents(documents, embeddings, persist_directory=config.PERSIST_DIRECTORY)
    vector_store.persist()

    print("Setting up the AI agent...")
    llm = initialize_llm()

    prompt_template = """You are an expert programmer analyzing a codebase. Use the context below to answer the question.

Context:
{context}

Question: {question}

Provide a detailed, technical answer. If unsure, say so.
Answer:"""
    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 4}),
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True
    )
    return qa_chain

# @title **Step 4: MAIN - Setup the Codebase Explainer**
# @markdown ### **Run this cell to start the setup!**
# @markdown Paste the URL of any public GitHub repository below.

repository_url = "https://github.com/trungrockyngo/EVChargerReg" # @param {type:"string"}

print("🚀 Starting Codebase Explainer Setup...")
print(f"📦 Repository: {repository_url}")

# 1. Clone the repo
repo_name = repository_url.split("/")[-1].replace(".git", "")
repo_hash = hashlib.md5(repository_url.encode()).hexdigest()[:8]
local_path = os.path.join(config.CACHE_DIR, f"{repo_name}_{repo_hash}")
os.makedirs(config.CACHE_DIR, exist_ok=True)

if not clone_repository(repository_url, local_path):
    print("❌ Failed to clone the repository. Please check the URL and try again.")
else:
    print("✅ Repository cloned successfully!")

    # 2. Analyze its structure
    print("🔍 Analyzing repository structure...")
    repo_structure = analyze_repository_structure(local_path)
    print(f"   Detected Framework: {repo_structure.get('framework', 'Unknown')}")
    print(f"   Package Manager: {repo_structure.get('package_manager', 'Unknown')}")
    if repo_structure['entry_points']:
        print(f"   Possible Entry Points: {', '.join(repo_structure['entry_points'][:3])}")

    # 3. Process all files
    print("📂 Processing files and creating knowledge base...")
    all_docs = process_repository(local_path)
    print(f"   Created {len(all_docs)} chunks of knowledge.")

    # 4. Create the AI agent
    qa_system = create_qa_system(all_docs)
    print("✅ Setup complete! You can now ask questions in the next cell.")

# @title **Step 5: Ask Questions About the Codebase**
# @markdown ### **Run this cell after the setup is complete.**
your_question = "What is the main purpose of this codebase and how is it structured?" # @param {type:"string"}

print(f"❓ Your Question: {your_question}")
print("🤖 Thinking...")
try:
    result = qa_system({"query": your_question})
    print("\n" + "="*50)
    print("💡 Answer:\n")
    print(result["result"])
    print("\n📚 Sources used:")
    for doc in result['source_documents']:
        print(f"   - {doc.metadata['source']} (Chunk {doc.metadata['chunk']+1})")
    print("="*50)
except Exception as e:
    print(f"An error occurred: {e}")
    print("Please ensure the previous setup cell ran successfully.")

Installing required libraries...
Libraries installed! Now importing...
🚀 Starting Codebase Explainer Setup...
📦 Repository: https://github.com/trungrockyngo/EVChargerReg
✅ Repository cloned successfully!
🔍 Analyzing repository structure...
   Detected Framework: react
   Package Manager: npm
   Possible Entry Points: backend/routes/index.js, chaincode/index.js, frontend/src/index.js
📂 Processing files and creating knowledge base...
   Created 222 chunks of knowledge.
Creating vector database...
Setting up the AI agent...
Loading the language model...


  vector_store.persist()


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0
  return HuggingFacePipeline(pipeline=text_gen_pipeline)
  result = qa_system({"query": your_question})


✅ Setup complete! You can now ask questions in the next cell.
❓ Your Question: What is the main purpose of this codebase and how is it structured?
🤖 Thinking...

💡 Answer:

You are an expert programmer analyzing a codebase. Use the context below to answer the question.

Context:
| Device        	| An EV Charger Device.                                     	| Read access to device data.                                  	|
| Super System  	| Main terminal that carries out administrative functions.  	| Read and write access to controllers.                        	|
| User          	| EV Charger user                                           	| Read access to a device.                                     	|
<br>
---
<h2> Blockchain Architectural Diagrams </h2>

---

<h2> Other Functions Details </h2>

* Agreement on the accuracy and latest information about the controllers and their related devices on the blockchain network  
* Updates to the codes. Deliberation on features or initiatives f

In [2]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.27-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (