In [6]:
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

In [7]:
# Show the kernel's current working directory; the relative paths used later
# in this notebook ("test_dir/", './db') are resolved against it.
%pwd

'c:\\Users\\Suraj\\Desktop\\Python\\source-code-analysis-gen-ai\\research'

In [8]:
# Create the clone target from Python instead of the shell: this is portable
# across operating systems and, thanks to exist_ok=True, does not complain when
# the directory is already there (e.g. on Restart Kernel -> Run All).
os.makedirs("test_dir", exist_ok=True)

In [9]:
repo_path = "test_dir/"
repo_url = "https://github.com/suryanshp1/Gem-Price-Prediction-end-to-end-pipeline"

# Cloning into a directory that already holds a checkout fails, which breaks
# every run after the first. Re-use the existing working copy when one is
# present and only hit the network when it is missing.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_url, to_path=repo_path)

In [10]:
# Language-aware parser for Python sources.
# NOTE(review): parser_threshold=500 is presumably the minimum file length
# before the parser splits a file by syntax -- confirm against the LangChain docs.
python_parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)

# Match every path under repo_path, keeping only files with a ".py" suffix.
loader = GenericLoader.from_filesystem(
    repo_path,
    glob="**/*",
    suffixes=[".py"],
    parser=python_parser,
)

In [11]:
# Run the loader and collect the resulting Document objects in a list.
documents = loader.load()

In [12]:
# Preview only the first few documents: echoing the whole list prints the full
# source of every loaded file and buries the rest of the notebook.
documents[:3]

[Document(metadata={'source': 'test_dir\\app.py', 'language': <Language.PYTHON: 'python'>}, page_content='from flask import Flask, render_template, request\nfrom src.pipeline.prediction_pipeline import PredictPipeline, CustomData\nfrom src.exception.exception import CustomException\nfrom src.logger.logger import logging\nimport sys\n\napp=Flask(__name__)\n\n@app.route("/")\ndef home_page():\n    try:\n        return render_template(\'index.html\')\n    except Exception as e:\n        raise CustomException(e,sys)\n    \n@app.route("/predict", methods=[\'POST\', "GET"])\ndef predict_datapoint():\n    try:\n        if request.method=="GET":\n            return render_template(\'form.html\')\n        else:\n            data=CustomData(\n                carat=float(request.form.get("carat")),\n                depth=float(request.form.get("depth")),\n                table=float(request.form.get("table")),\n                x=float(request.form.get("x")),\n                y=float(request.form.

In [13]:
# Number of documents produced by the loader.
len(documents)

24

In [14]:
# Inspect the first loaded document (its metadata and page_content).
documents[0]

Document(metadata={'source': 'test_dir\\app.py', 'language': <Language.PYTHON: 'python'>}, page_content='from flask import Flask, render_template, request\nfrom src.pipeline.prediction_pipeline import PredictPipeline, CustomData\nfrom src.exception.exception import CustomException\nfrom src.logger.logger import logging\nimport sys\n\napp=Flask(__name__)\n\n@app.route("/")\ndef home_page():\n    try:\n        return render_template(\'index.html\')\n    except Exception as e:\n        raise CustomException(e,sys)\n    \n@app.route("/predict", methods=[\'POST\', "GET"])\ndef predict_datapoint():\n    try:\n        if request.method=="GET":\n            return render_template(\'form.html\')\n        else:\n            data=CustomData(\n                carat=float(request.form.get("carat")),\n                depth=float(request.form.get("depth")),\n                table=float(request.form.get("table")),\n                x=float(request.form.get("x")),\n                y=float(request.form.g

In [15]:
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

# Splitter configured for Python source text.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [16]:
# Break the loaded documents into smaller chunks with the splitter configured above.
texts = documents_splitter.split_documents(documents)

In [17]:
# Preview only the first few chunks rather than echoing the entire list, which
# floods the output with every chunk's source text.
texts[:3]

[Document(metadata={'source': 'test_dir\\app.py', 'language': <Language.PYTHON: 'python'>}, page_content='from flask import Flask, render_template, request\nfrom src.pipeline.prediction_pipeline import PredictPipeline, CustomData\nfrom src.exception.exception import CustomException\nfrom src.logger.logger import logging\nimport sys\n\napp=Flask(__name__)\n\n@app.route("/")\ndef home_page():\n    try:\n        return render_template(\'index.html\')\n    except Exception as e:\n        raise CustomException(e,sys)\n    \n@app.route("/predict", methods=[\'POST\', "GET"])'),
 Document(metadata={'source': 'test_dir\\app.py', 'language': <Language.PYTHON: 'python'>}, page_content='def predict_datapoint():\n    try:\n        if request.method=="GET":\n            return render_template(\'form.html\')\n        else:\n            data=CustomData(\n                carat=float(request.form.get("carat")),\n                depth=float(request.form.get("depth")),\n                table=float(request

In [18]:
# Number of chunks produced by the splitter.
len(texts)

79

In [19]:
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast when the key is missing: os.getenv returns None in that case, and
# the problem would otherwise only surface later, inside the chat client, with
# a much less obvious error.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to the environment or to a .env file "
        "before running the cells below."
    )

In [None]:
# Embedding model that is handed to the vector store to embed the code chunks.
# NOTE(review): the saved output of this cell shows a warning raised at this
# line -- presumably a deprecation notice for this import path; confirm and
# migrate if so.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [21]:
persist_directory = "./db"

# Building the index with from_documents on every run adds the same chunks to
# the persisted collection again, so re-running the notebook fills retrieval
# results with duplicates. Re-open the existing index when one is already on
# disk and only embed the chunks when it is missing.
# Delete ./db to force a rebuild (e.g. after the repository, the splitter
# settings or the embedding model change).
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(texts, embedding=embeddings, persist_directory=persist_directory)

In [22]:
# Ask the vector store to write its contents to the persist_directory.
# NOTE(review): the saved output shows a warning pointing at this call --
# presumably Chroma's notice that persist() is deprecated because newer
# versions persist automatically; confirm and drop the call if so.
vectordb.persist()

  vectordb.persist()


In [28]:
# The OpenAI chat client is pointed at Groq's OpenAI-style endpoint and
# authenticated with the Groq key loaded earlier.
# NOTE(review): hosted model IDs get retired over time -- confirm this one is
# still served before relying on the notebook.
groq_chat_kwargs = dict(
    model_name="llama-3.1-70b-versatile",
    api_key=GROQ_API_KEY,
    base_url="https://api.groq.com/openai/v1",
)
llm = ChatOpenAI(**groq_chat_kwargs)

In [29]:
# Summary-style conversation memory driven by the same LLM; the chain reads it
# under the "chat_history" key.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [30]:
# Retriever over the vector store: MMR search with k=8.
retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 8})

# Conversational QA chain wiring together the LLM, the retriever and the memory.
qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

In [31]:
# Question that is passed to the retrieval chain in the next cell.
question = "what is data_transformation function ?"

In [32]:
# Run the chain on the question; the returned mapping carries the reply under "answer".
# NOTE(review): newer LangChain releases presumably prefer qa.invoke(...) over
# calling the chain directly -- confirm against the installed version.
result = qa(question)

# print() is used so the answer's line breaks render as text rather than as an escaped repr.
print(result["answer"])

The `data_transformation` function is not explicitly defined in the provided code snippet. However, there is a class named `DataTransformation` and a method `get_data_transformation` which seems to be related to data transformation.

The `get_data_transformation` method is used to perform data transformation. It creates a `ColumnTransformer` object that applies different transformations to different columns in the data. 

The `ColumnTransformer` is composed of two pipelines:
- `num_pipeline` for numerical columns (`carat`, `depth`, `table`, `x`, `y`, `z`)
- `cat_pipeline` for categorical columns (`cut`, `color`, `clarity`)

The transformed data is then returned by the `get_data_transformation` method.

It's likely that the `data_transformation` function is supposed to call this `get_data_transformation` method or perform a similar task. However, without the actual definition of `data_transformation`, it's impossible to be certain.
