In [1]:
from langchain import HuggingFaceHub
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Qdrant
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Shell escape: clone the repository to analyse into a local 'codebase' directory.
!git clone https://github.com/tsnAnh/flutter-bloc-base-source-code codebase

Cloning into 'codebase'...
remote: Enumerating objects: 218, done.[K
remote: Counting objects: 100% (218/218), done.[K
remote: Compressing objects: 100% (160/160), done.[K
remote: Total 218 (delta 26), reused 209 (delta 19), pack-reused 0[K
Receiving objects: 100% (218/218), 82.55 KiB | 1.10 MiB/s, done.
Resolving deltas: 100% (26/26), done.


In [2]:
# Shell escape: print the current working directory; the './...' paths used in
# this notebook are relative to it.
!pwd

/home/tsnanh/PycharmProjects/project-estimation-creation-using-retrieval-augmented-generator


In [3]:
import os

def _read_text_file(file_path):
    """Return the text of ``file_path``, decoding as UTF-8 with a Latin-1 fallback."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a code point, so this second attempt cannot
        # raise a decode error; it keeps non-UTF-8 files convertible as before.
        with open(file_path, 'r', encoding='latin-1') as f:
            return f.read()


def convert_files_to_txts(src_dir, dst_dir):
    """Mirror ``src_dir`` into ``dst_dir`` as UTF-8 text files named ``<name>.txt``.

    Every file found by walking ``src_dir`` is read as text and written to the
    same relative location under ``dst_dir`` with ``.txt`` appended to its file
    name (``lib/main.dart`` -> ``lib/main.dart.txt``).

    Files are decoded as UTF-8 first so that non-ASCII characters survive the
    round trip; only files that are not valid UTF-8 fall back to Latin-1.
    Decoding everything as Latin-1 and re-encoding as UTF-8 would turn each
    multi-byte UTF-8 character into mojibake.

    Args:
        src_dir: Root directory to read from.
        dst_dir: Root directory to write to; created if it does not exist.

    Returns:
        None. Files that cannot be read are reported with ``print`` and skipped.
    """
    os.makedirs(dst_dir, exist_ok=True)

    for root, dirs, files in os.walk(src_dir):
        for file in files:
            file_path = os.path.join(root, file)
            rel_path = os.path.relpath(file_path, src_dir)
            new_root = os.path.join(dst_dir, os.path.dirname(rel_path))
            os.makedirs(new_root, exist_ok=True)

            try:
                data = _read_text_file(file_path)
            except OSError:
                # Best effort: report unreadable files (permissions, broken
                # links, ...) and carry on, without swallowing KeyboardInterrupt.
                print(f'Failed to read the file: {file_path}')
                continue

            new_file_path = os.path.join(new_root, file + '.txt')
            with open(new_file_path, 'w', encoding='utf-8') as f:
                f.write(data)

# Mirror './codebase' into './converted_codebase', appending '.txt' to every file name.
convert_files_to_txts('./codebase', './converted_codebase')
            

In [2]:
# Load the converted files with DirectoryLoader, reading each one with TextLoader.
src_dir = './converted_codebase'
loader = DirectoryLoader(src_dir, show_progress=True, loader_cls=TextLoader)
repo_files = loader.load()

print(f'Number of files loaded:> {len(repo_files)}')

# Split the loaded files into overlapping chunks (chunk_size=1500, chunk_overlap=150).
# `documents` is the list that later cells embed and index.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
documents = text_splitter.split_documents(documents=repo_files)
print(f'Number of documents: {len(documents)}')

 47%|███████████████▌                 | 110/233 [00:00<00:00, 5033.37it/s]

Number of files loaded:> 110
Number of documents: 180





In [3]:
# The conversion step appended '.txt' to every file name. Strip exactly that one
# trailing suffix so each chunk's 'source' names the original file again.
# A blanket str.replace('.txt', '') would also delete '.txt' occurring elsewhere
# in the path, e.g. 'linux/CMakeLists.txt.txt' -> 'linux/CMakeLists'.
# NOTE: run this once per freshly split `documents`; running it again would
# strip a genuine trailing '.txt' (e.g. from 'CMakeLists.txt').
txt_suffix = '.txt'
for doc in documents:
    old_path_with_txt_extension = doc.metadata['source']
    if old_path_with_txt_extension.endswith(txt_suffix):
        new_path_without_txt_extension = old_path_with_txt_extension[:-len(txt_suffix)]
        doc.metadata.update({'source': new_path_without_txt_extension})

In [4]:
documents[10:15]

[Document(page_content='buildscript {\n    ext.kotlin_version = \'1.7.21\'\n    repositories {\n        google()\n        mavenCentral()\n    }\n\n    dependencies {\n        classpath \'com.android.tools.build:gradle:7.3.1\'\n        classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version"\n    }\n}\n\nallprojects {\n    repositories {\n        google()\n        mavenCentral()\n    }\n}\n\nrootProject.buildDir = \'../build\'\nsubprojects {\n    project.buildDir = "${rootProject.buildDir}/${project.name}"\n}\nsubprojects {\n    project.evaluationDependsOn(\':app\')\n}\n\ntask clean(type: Delete) {\n    delete rootProject.buildDir\n}', metadata={'source': 'converted_codebase/android/build.gradle'}),
 Document(page_content='org.gradle.jvmargs=-Xmx1536M\nandroid.useAndroidX=true\nandroid.enableJetifier=true', metadata={'source': 'converted_codebase/android/gradle.properties'}),
 Document(page_content='include \':app\'\n\ndef localPropertiesFile = new File(rootProject.projectD

In [5]:
# Embeddings are produced by OllamaEmbeddings using the model named below.
model_name = "mixtral"
# NOTE(review): model_kwargs and encode_kwargs are defined here but are not passed
# to OllamaEmbeddings in this cell — confirm whether anything else still uses them.
model_kwargs = {'device': "cuda"}
encode_kwargs = {"normalize_embeddings": True }
embeddings = OllamaEmbeddings(model=model_name)

In [7]:
# LLM handed to the question-answering chain; uses the 'mixtral' model via the Ollama wrapper.
llm = Ollama(model='mixtral')

In [8]:
# Build the vector store: index `documents` using `embeddings` in the
# 'source_code_documents' collection, with storage path './local_qdrant'.
# NOTE(review): presumably an on-disk local store; re-running this cell against an
# existing './local_qdrant' may add the same chunks again — confirm.
qdrant = Qdrant.from_documents(documents, embeddings,
                               path='./local_qdrant',
                               collection_name='source_code_documents')

In [11]:
def pretty_print_docs(documents):
    """Print each document's metadata, a dashed divider line, then its content.

    Args:
        documents: Iterable of objects exposing ``metadata`` and ``page_content``.

    Returns:
        None. Output goes to standard output, three ``print`` calls per document.
    """
    divider = ' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '
    for item in documents:
        meta = item.metadata
        print(meta)
        print(divider)
        body = item.page_content
        print(body)

In [15]:
# Retrieval sanity check: run a similarity search for a single-word query
# against the vector store and print the documents it returns.
query = 'repository'
found_docs = qdrant.similarity_search(query)
pretty_print_docs(found_docs)

{'source': 'converted_codebase/lib/screens/login/login.dart', '_id': '7e7980addba94fec9a1f6b624c0e8766', '_collection_name': 'source_code_documents'}
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
export 'login_screen.dart';
{'source': 'converted_codebase/lib/domain/domain.dart', '_id': '5378ad0200394e7a89342aeaf16210c1', '_collection_name': 'source_code_documents'}
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
export './repositories/city_repository.dart';
{'source': 'converted_codebase/lib/common/extensions/extensions.dart', '_id': 'bd85eaeadead42ecb45ee790fe48adad', '_collection_name': 'source_code_documents'}
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
export 'context.dart';
{'source': 'converted_codebase/ios/Runner.xcodeproj/project.pbxproj', '_id': '3a1c9b4a58a84652af85d05195633f73', '_collection_name': 'source_code_documents'}
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
4A8A4D2DDB56F31A39AA2C4E /* Pods-Run

In [28]:
# Question-answering chain built from `llm` and a retriever over `qdrant`.
# The retriever is configured with k=10, and return_source_documents=True makes
# the response include the retrieved documents alongside the answer.
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(llm=llm,
 chain_type="stuff",
 retriever=qdrant.as_retriever(search_kwargs={"k":10}),
 return_source_documents=True)

In [32]:
# Ask the chain a question. The response is a dict: 'result' holds the answer
# text, and the full dict also carries 'query' and 'source_documents'.
response = qa.invoke("""List all the screens in this project""")
print(response)
print(response['result'])

{'query': 'List all the screens in this project', 'result': " Based on the provided code, here are the named screens/widgets I can find:\n\n1. `login_screen.dart`\n2. The screen or widget related to `HomeBloc` and `HomeCubit`, which might be the home screen of the application, but there's no specific name provided in the code.\n\nPlease note that this answer is based on the given context. There might be other screens in the project that are not included in the provided code snippets.", 'source_documents': [Document(page_content="export './repositories/city_repository.dart';", metadata={'source': 'converted_codebase/lib/domain/domain.dart', '_id': '5378ad0200394e7a89342aeaf16210c1', '_collection_name': 'source_code_documents'}), Document(page_content="export 'login_screen.dart';", metadata={'source': 'converted_codebase/lib/screens/login/login.dart', '_id': '7e7980addba94fec9a1f6b624c0e8766', '_collection_name': 'source_code_documents'}), Document(page_content='// Use enum for simple ev