In [17]:
from git import Repo

# Loading the repo 
from langchain.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.git import GitLoader

# Parsing through the documents to create chunks
from langchain.text_splitter import Language
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter


from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain
import os


In [9]:
# Local checkout location and the remote it is cloned from.
repo_path = "test_repo/"
repo_link = "https://github.com/tarang1998/bodyScienceLLM"

# Repo.clone_from fails when the target directory already holds a checkout, which made
# this cell break on every re-run. Reuse the existing clone when there is one.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_link, to_path=repo_path)

In [26]:
# Programming languages handled with a language-aware splitter, keyed by the
# LangChain `Language` member and listing the file extensions assigned to it.
SUPPORTED_LANGUAGES = {
    Language.PYTHON: [".py", ".pyx", ".pyi"],
    Language.JS: [".js", ".jsx", ".ts", ".tsx"],  # TypeScript is grouped with JavaScript
    Language.JAVA: [".java"],
    Language.CPP: [".cpp", ".cc", ".cxx", ".h", ".hpp"],  # headers are treated as C++
    Language.CSHARP: [".cs"],
    Language.PHP: [".php"],
    Language.RUBY: [".rb"],
    Language.RUST: [".rs"],
    Language.GO: [".go"],
    Language.SCALA: [".scala"],
    Language.KOTLIN: [".kt", ".kts"],
    Language.LUA: [".lua"],
    Language.PERL: [".pl", ".pm"],
    Language.ELIXIR: [".ex", ".exs"],
    Language.COBOL: [".cob", ".cbl"],
}



In [27]:
# Extensions treated as text/document files (these use default chunking).
# Every entry is listed exactly once, grouped by category.
TEXT_FILES = {
    # Plain text and documentation
    ".txt", ".text", ".md", ".markdown", ".mdown", ".mdx",
    ".rst", ".adoc", ".asciidoc",
    ".readme", ".license", ".changelog", ".contributing",
    # Structured data and configuration
    ".json", ".jsonc", ".yaml", ".yml", ".toml", ".lock",
    ".ini", ".cfg", ".conf", ".config",
    ".csv", ".tsv", ".tab",
    # Markup, stylesheets and feeds
    ".html", ".htm", ".xhtml", ".css", ".scss", ".sass", ".less",
    ".xml", ".xsd", ".xslt", ".rss", ".atom", ".feed",
    # Logs
    ".log", ".out", ".err",
    # Shell scripts and SQL
    ".sh", ".bash", ".zsh", ".fish",
    ".sql", ".psql", ".mysql",
    # Docker, Git and environment files
    ".dockerfile", ".dockerignore", ".docker-compose.yml", ".docker-compose.yaml",
    ".gitignore", ".gitattributes", ".gitmodules",
    ".env", ".env.example", ".env.local", ".env.production",
}


In [32]:
def get_file_extension(file_path):
    """Return the lower-cased extension of ``file_path``, including the leading dot.

    ``os.path.splitext`` reports no extension for dotfiles such as ``.gitignore``
    or ``.env``. For those, the whole lower-cased file name is returned so that
    they can match the dotfile entries listed in ``TEXT_FILES``.

    Args:
        file_path: Path or bare file name.

    Returns:
        The extension (e.g. ``'.py'``), the dotfile name (e.g. ``'.gitignore'``),
        or ``''`` when the name has neither.
    """
    extension = os.path.splitext(file_path)[1]
    if not extension:
        file_name = os.path.basename(file_path)
        # strip('.') rejects names made only of dots, such as '.' and '..'
        if file_name.startswith('.') and file_name.strip('.'):
            extension = file_name
    return extension.lower()

def create_language_filter(languages=None):
    """Build a predicate that accepts file paths whose extension is allowed.

    Args:
        languages: Iterable of ``SUPPORTED_LANGUAGES`` keys and/or the string
            ``"TEXT"`` (which selects ``TEXT_FILES``). ``None`` selects every
            supported language plus all text extensions. A single language or
            ``"TEXT"`` may also be passed on its own. Entries that match
            neither are ignored.

    Returns:
        A function ``file_filter(file_path)`` returning ``True`` when
        ``get_file_extension(file_path)`` is one of the allowed extensions.
    """
    # A lone string (this includes `Language` members, which are str enums) would
    # otherwise be iterated character by character and yield a reject-all filter.
    if isinstance(languages, str):
        languages = [languages]

    collected = set()
    if languages is None:
        # Include all supported extensions
        for extensions in SUPPORTED_LANGUAGES.values():
            collected.update(extensions)
        collected.update(TEXT_FILES)
    else:
        # Include only specified languages
        for lang in languages:
            if lang in SUPPORTED_LANGUAGES:
                collected.update(SUPPORTED_LANGUAGES[lang])
            elif lang == "TEXT":
                collected.update(TEXT_FILES)

    # Frozen set: constant-time membership test per file, and no duplicates.
    allowed_extensions = frozenset(collected)

    def file_filter(file_path):
        return get_file_extension(file_path) in allowed_extensions

    return file_filter

In [None]:
# Load the Python and HTML files of the cloned repository with LangChain's GenericLoader.
loader = GenericLoader.from_filesystem(
    repo_path,  # The directory path where the repository is cloned
    glob="**/*",  # Searches recursively through all subdirectories and files
    suffixes=[".py", ".html"],  # Keeps only .py and .html files
    parser=LanguageParser(
        # The language is deliberately not pinned: the cell previously forced
        # Language.PYTHON, which also pushed the .html files through the Python
        # parser. Left unset, LanguageParser infers the language from each file.
        # parser_threshold is the minimum number of lines a file needs before
        # language-specific parsing is activated (see the LanguageParser docs).
        parser_threshold=500,
    ),
)

documents = loader.load()
documents



[Document(metadata={'source': 'test_repo\\ test.py', 'language': <Language.JS: 'js'>}, page_content=''),
 Document(metadata={'source': 'test_repo\\app.py', 'language': <Language.JS: 'js'>}, page_content='from flask import Flask, render_template, jsonify, request\nfrom src.helper import download_hugging_face_embeddings\nfrom langchain_pinecone import PineconeVectorStore\nfrom langchain_openai import OpenAI\nfrom langchain.chains import create_retrieval_chain\nfrom langchain.chains.combine_documents import create_stuff_documents_chain\nfrom langchain_core.prompts import ChatPromptTemplate\nfrom dotenv import load_dotenv\nfrom src.prompt import *\nimport os\n\napp = Flask(__name__)\n\nload_dotenv()\n\nPINECONE_API_KEY=os.environ.get(\'PINECONE_API_KEY\')\nOPENAI_API_KEY=os.environ.get(\'OPENAI_API_KEY\')\n\nos.environ["PINECONE_API_KEY"] = PINECONE_API_KEY\nos.environ["OPENAI_API_KEY"] = OPENAI_API_KEY\n\nembeddings = download_hugging_face_embeddings()\n\n\nindex_name = "bodybuildingchatb

In [92]:
# Read the repository through GitLoader, keeping only files accepted by the filter.
git_loader = GitLoader(
    repo_path=repo_path,
    clone_url=repo_link,
    branch="main",
    # Called without arguments: every supported language plus the text extensions
    file_filter=create_language_filter(),
)

# Fetch the documents and show them as the cell output
documents = git_loader.load()
documents



[Document(metadata={'source': ' test.py', 'file_path': ' test.py', 'file_name': ' test.py', 'file_type': '.py'}, page_content=''),
 Document(metadata={'source': 'README.md', 'file_path': 'README.md', 'file_name': 'README.md', 'file_type': '.md'}, page_content='# BodyScienceLLM\r\n\r\n![alt text](./screenshots/image.png)\r\n\r\n### Techology and Tools used\r\n- Python (version 3.9)\r\n- Lang Chain \r\n- Hugging Face Embedding Model \r\n- PineCone\r\n- OpenAI \r\n- Flask\r\n\r\n### Running the application\r\n\r\n- Clone the repository\r\n```\r\ngit clone https://github.com/tarang1998/bodyScienceLLM.git\r\n```\r\n- Create and activate the python environment\r\n\r\n```\r\nconda create -n bodyScience python=3.10 -y\r\nconda activate bodyScience\r\n```\r\n\r\n- Install the requirements \r\n```\r\npip install -r requirements.txt\r\n```\r\n\r\n- Add the OPENAI_API_KEY and PINECONE_API_KEY in the .env file \r\n\r\n- Save the vector embeddings to Pinecone\r\n```\r\npython store_index.py\r\n```\r

In [93]:
# Split every loaded document into chunks.
# Files whose extension maps to a supported language go through a language-aware
# splitter, which tries to break at that language's own separators rather than at
# arbitrary positions. Everything else goes through the default splitter.

# Chunking parameters shared by every splitter in this cell
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

# Reverse lookup built once: file extension -> Language.
# setdefault keeps the first language that claims an extension, matching a
# first-match scan over SUPPORTED_LANGUAGES.
EXTENSION_TO_LANGUAGE = {}
for lang, extensions in SUPPORTED_LANGUAGES.items():
    for extension in extensions:
        EXTENSION_TO_LANGUAGE.setdefault(extension, lang)

# Default splitter for non-programming files; also the fallback when splitting fails
default_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# Language-specific splitters are created on first use and then reused
language_splitters = {}

processed_docs = []

for doc in documents:
    file_ext = get_file_extension(doc.metadata.get('source', ''))

    # Determine language from file extension (None for text/unknown files)
    language = EXTENSION_TO_LANGUAGE.get(file_ext)

    try:
        if language is not None:
            if language not in language_splitters:
                # from_language may raise for a language the installed LangChain
                # version does not support; the except below then falls back.
                language_splitters[language] = RecursiveCharacterTextSplitter.from_language(
                    language=language,
                    chunk_size=CHUNK_SIZE,
                    chunk_overlap=CHUNK_OVERLAP
                )
            text_splitter = language_splitters[language]
            print("Using language-specific text splitter : ", doc.metadata)
        else:
            text_splitter = default_splitter
            print("Using default text splitter : ", doc.metadata)

        chunks = text_splitter.split_documents([doc])

    except Exception as e:
        print(f"Text splitting failed for {file_ext}, using default: {e}")
        # Fallback to default text splitting
        chunks = default_splitter.split_documents([doc])

    processed_docs.extend(chunks)

print(f"\nTotal documents loaded: {len(documents)}")
print(f"Total chunks created: {len(processed_docs)}")
print("Files processed:")
for doc in documents:
    file_path = doc.metadata.get('source', 'Unknown')
    file_ext = get_file_extension(file_path)
    print(f"  - {file_path} ({file_ext})")


Using language-specific text splitter :  {'source': ' test.py', 'file_path': ' test.py', 'file_name': ' test.py', 'file_type': '.py'}
Using default text splitter :  {'source': 'README.md', 'file_path': 'README.md', 'file_name': 'README.md', 'file_type': '.md'}
Using language-specific text splitter :  {'source': 'app.py', 'file_path': 'app.py', 'file_name': 'app.py', 'file_type': '.py'}
Using default text splitter :  {'source': 'requirements.txt', 'file_path': 'requirements.txt', 'file_name': 'requirements.txt', 'file_type': '.txt'}
Using language-specific text splitter :  {'source': 'setup.py', 'file_path': 'setup.py', 'file_name': 'setup.py', 'file_type': '.py'}
Using language-specific text splitter :  {'source': 'store_index.py', 'file_path': 'store_index.py', 'file_name': 'store_index.py', 'file_type': '.py'}
Using language-specific text splitter :  {'source': 'template.py', 'file_path': 'template.py', 'file_name': 'template.py', 'file_type': '.py'}
Using language-specific text spli