In [1]:
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [None]:
import os
import dotenv
from git import Repo

In [5]:
# Local directory (relative to the notebook's working directory) that the
# cells below use as the checkout location of the repository to index.
repo_path = "./test_repo"

In [6]:
# Obtain a checkout of the LangChain repository at `repo_path`.
#
# `Repo.clone_from` fails when the target directory already exists and is not
# empty, which made this cell break on every re-run (e.g. Restart & Run All).
# If a git checkout is already present, open it instead of cloning again.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(
        "https://github.com/langchain-ai/langchain", to_path=repo_path
    )

In [None]:
# Load every Python source file of the `langchain_core` package from the
# cloned repository.
#
# Fixes relative to the previous version of this cell:
#   * the package path was missing its leading "/" and was misspelled
#     ("lib/core/langchain/_core"), so it pointed at a non-existent directory;
#   * the glob was "++/+", which matches nothing; "**/*" walks the tree
#     recursively;
#   * the exclude pattern was misspelled; upstream's file name is
#     "non-utf8-encoding.py".
loader = GenericLoader.from_filesystem(
    repo_path + "/libs/core/langchain_core",
    glob="**/*",
    suffixes=[".py"],
    exclude=["**/non-utf8-encoding.py"],
    # NOTE(review): parser_threshold=500 is taken to be the minimum size a
    # file needs before the language-aware parser splits it; confirm against
    # the LanguageParser documentation.
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

documents = loader.load()
len(documents)

In [None]:
# Break the loaded documents into Python-aware chunks: at most 2000
# characters each, with 200 characters of overlap between neighbours.
python_spliter = RecursiveCharacterTextSplitter.from_language(
    chunk_size=2000,
    chunk_overlap=200,
    language=Language.PYTHON,
)

# Chunk every loaded document; the trailing expression shows the chunk count.
texts = python_spliter.split_documents(documents)
len(texts)

In [None]:
# Register the `dotenv` IPython extension, then invoke its `%dotenv` magic.
# NOTE(review): presumably this loads a local `.env` file that provides the
# OpenAI credentials needed by the embedding cell below; confirm.
# Keep comments on their own lines: text after a line magic is passed to it
# as arguments.
%load_ext dotenv
%dotenv

In [None]:
# Build a Chroma vector store from the chunks, embedding them with OpenAI.
# NOTE(review): `disallowed_special=()` presumably stops the tokenizer from
# rejecting special-token text that appears in source code; confirm.
db = Chroma.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(disallowed_special=()),
)

# Retriever over the store: "mmr" search type, 8 results per query.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 8})