# Indexing

In [260]:
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.document_loaders import GitLoader
import re
from langchain.schema import Document
import copy


## Load the Repo

In [360]:
# Load only the Python files from the `dev` branch of the data-ingestion repo.
# NOTE(review): with both `clone_url` and `repo_path` given, GitLoader
# presumably clones into ./src/ when no checkout exists there — confirm.
loader = GitLoader(
    repo_path="./src/",
    branch="dev",
    clone_url="https://bitbucket.org/pwrlab/data-ingestion",
    # Keep a file only when its path ends in ".py".
    file_filter=lambda file_path: file_path.endswith(".py"),
)

In [361]:
# Run the loader. Each returned item exposes `page_content` and `metadata`,
# which the splitting step further down relies on.
data = loader.load()

In [362]:
# Sanity check: how many files the loader returned.
len(data)

58

## Splitting the Code
Split the code in a context-aware way (large files are cut at function boundaries) and attach an LLM-generated summary to each chunk's metadata.

In [364]:
from langchain.chat_models import ChatOpenAI
import os

In [365]:
from dotenv import load_dotenv, find_dotenv
# Locate a .env file (find_dotenv) and load it (load_dotenv) before the API
# key is read from the environment. The result is bound to `_`, presumably
# just to keep it out of the cell output.
_ = load_dotenv(find_dotenv())

In [366]:
# Read the key from the environment; the value is None when the variable is
# unset. The name is not referenced again in the visible cells.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [367]:
# Chat model used to summarize each code chunk; temperature=0 requests the
# least random sampling the API offers.
llm = ChatOpenAI(model='gpt-4', temperature=0)

In [368]:
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [369]:
# System prompt sent with every chunk: asks for a one-sentence summary of at
# most 25 words.
system_message = '''Summarize code contained in the document in a single sentence with no more than 25 words. 
'''

In [370]:
import re

def parse_content(content):
    """Split Python source text into import, variable and function chunks.

    The text is cut at the start of every line that begins a ``def`` or
    ``async def`` statement (at any indentation). Each piece is stripped and
    classified by the keyword it starts with:

    * ``'imports'``   - starts with ``from `` or ``import `` (this is usually
      the module preamble, which may also hold other statements that precede
      the first ``def``),
    * ``'functions'`` - starts with ``def ``, ``async def `` or ``class ``,
    * ``'variables'`` - any other non-empty piece.

    Args:
        content: Source code as a single string.

    Returns:
        dict with the keys ``'imports'``, ``'variables'`` and ``'functions'``,
        each mapping to a list of stripped chunks in source order.
    """
    # Anchor the split to line starts. An unanchored ``(?=def )`` also fires
    # on "def " inside identifiers, strings or comments (e.g. "undef value")
    # and cuts the statement in half; it also separates "async" from "def".
    parts = re.split(
        r'^(?=[ \t]*(?:async )?def )',
        content,
        flags=re.MULTILINE,
    )

    imports = []
    variables = []
    functions = []

    for part in parts:
        part = part.strip()
        if not part:
            # Empty pieces appear when the text starts with a definition.
            continue
        if part.startswith(('from ', 'import ')):
            imports.append(part)
        elif part.startswith(('def ', 'async def ', 'class ')):
            functions.append(part)
        else:
            variables.append(part)

    return {
        'imports': imports,
        'variables': variables,
        'functions': functions
    }

In [371]:
import copy

# Files with more whitespace-separated words than this are split into chunks
# before being summarized.
word_count = 1_000
# Collects the summarized Document objects built by the loop in this cell.
documents = list()

def generate_summary(content, metadata):
    """Ask the chat model for a summary of one code chunk.

    The module-level ``system_message`` is sent as the system prompt. The
    human message is the chunk text followed by its metadata, separated by
    a comma.

    Args:
        content: Text of the chunk to summarize.
        metadata: Metadata of the file the chunk came from; it is formatted
            into the prompt as-is.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    prompt = f'{content}, {metadata}'
    reply = llm([
        SystemMessage(content=system_message),
        HumanMessage(content=prompt),
    ])
    return reply.content

def create_document(content, metadata, summary):
    """Build a Document for a chunk with the summary stored in its metadata.

    The metadata is deep-copied first, so the caller's mapping (shared by
    all chunks of one file) is never modified.

    Args:
        content: Text used as the document's ``page_content``.
        metadata: Source metadata to copy.
        summary: Summary text stored under the ``'summary'`` key.

    Returns:
        A new ``Document`` carrying the copied metadata plus the summary.
    """
    enriched = copy.deepcopy(metadata)
    enriched['summary'] = summary
    return Document(page_content=content, metadata=enriched)

# Produce one summarized Document per chunk. Files above the word threshold
# are split with parse_content; smaller files are kept whole.
for file in data:
    source_text = file.page_content
    if len(source_text.split()) > word_count:
        # Flatten the sections in dict order: imports, variables, functions.
        chunks = [
            chunk
            for group in parse_content(source_text).values()
            for chunk in group
        ]
    else:
        chunks = [source_text]

    for chunk in chunks:
        summary = generate_summary(chunk, file.metadata)
        doc = create_document(chunk, file.metadata, summary)
        documents.append(doc)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-REDACTED on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-REDACTED on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-REDACTED on tokens per min. Limit: 10000 / m

In [372]:
# Total number of summarized chunks produced by the loop.
len(documents)

190

In [374]:
# Spot-check one document to inspect its content and generated summary;
# nothing in the visible code gives index 140 a special meaning.
documents[140]

Document(page_content="def _uv_dose(workout, samples):\n    for field in ['corrected', 'clearsky']:\n        workout[f'uv_dose_{field}'] = (samples[f'uv_intensity_{field}'] * samples['elapsed_time_delta']).sum()\n        workout[f'uv_dose_{field}'].fillna(0)", metadata={'source': 'domain/silver/workouts.py', 'file_path': 'domain/silver/workouts.py', 'file_name': 'workouts.py', 'file_type': '.py', 'summary': "The code calculates the UV dose for 'corrected' and 'clearsky' fields in a workout by multiplying UV intensity with elapsed time."})

## Embed and Load in a Vector Store

In [379]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [377]:
# Embedding model used to vectorize the documents, constructed with library
# defaults. NOTE(review): credentials presumably come from the environment —
# confirm.
embeddings = OpenAIEmbeddings()

In [380]:
# Embed every summarized document and build the FAISS index from the result.
try:
    vectordb = FAISS.from_documents(documents, embeddings)
except ConnectionError as e:
    # The built-in ConnectionError has no `status_code` or `response_body`
    # attribute, so reading them made the handler itself fail with
    # AttributeError. Report the exception as-is and re-raise: swallowing it
    # would leave `vectordb` undefined for the cells that save and query it.
    print(f"Connection error while building the FAISS index: {e!r}")
    raise

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')).


In [381]:
# Persist the index under the relative path "faiss_index".
# NOTE(review): presumably so it can be reloaded later without re-embedding —
# confirm how it is consumed.
vectordb.save_local("faiss_index")

In [382]:
# Example search: retrieve the documents most similar to a natural-language
# question. No result count is passed; the recorded run returned 4 hits.
query = "How is UV Dose calculated"
docs = vectordb.similarity_search(query)

In [383]:
# Number of hits returned for the example query.
len(docs)

4

In [385]:
# First hit returned for the example query.
docs[0]

Document(page_content="def _uv_dose(workout, samples):\n    for field in ['corrected', 'clearsky']:\n        workout[f'uv_dose_{field}'] = (samples[f'uv_intensity_{field}'] * samples['elapsed_time_delta']).sum()\n        workout[f'uv_dose_{field}'].fillna(0)", metadata={'source': 'domain/silver/workouts.py', 'file_path': 'domain/silver/workouts.py', 'file_name': 'workouts.py', 'file_type': '.py', 'summary': "The code calculates the UV dose for 'corrected' and 'clearsky' fields in a workout by multiplying UV intensity with elapsed time."})

In [386]:
# Second hit returned for the example query.
docs[1]

Document(page_content="def _average_uvi(workout):\n    for field in ['corrected', 'clearsky']:\n        workout[f'average_uvi_{field}'] = (workout[f'uv_dose_{field}'] * 40) / workout['elapsed_duration']\n        workout[f'average_uvi_{field}'] = workout[f'average_uvi_{field}'].replace([np.inf, -np.inf], np.nan)", metadata={'source': 'domain/silver/workouts.py', 'file_path': 'domain/silver/workouts.py', 'file_name': 'workouts.py', 'file_type': '.py', 'summary': "The code calculates the average UV index for two fields ('corrected', 'clearsky') in a workout dataset and replaces infinite values with NaN."})

In [387]:
# Third hit returned for the example query.
docs[2]

Document(page_content="def _running_uv_dose(history, snapshots, start):\n    if history.empty:\n        return\n\n    # Put workouts into daily buckets and aggregate the number of workouts per day.\n    history['day'] = history['local_start_time'].dt.date\n    workouts_by_day = history.groupby('day').agg({\n        'uv_dose_corrected': 'sum',\n    })\n    start = min(\n        history['local_start_time'].min(),\n        start - timedelta(weeks=26)\n    )\n    workouts_by_day = workouts_by_day.reindex(\n        pd.date_range(start, datetime.now(), normalize=True)\n    )\n    workouts_by_day['uv_dose_corrected'].fillna(value=0, inplace=True)\n    for period in (1, 7, 28):\n        workouts_by_day[f'running_uv_dose_{period}d'] = \\\n            workouts_by_day['uv_dose_corrected'].rolling(window=f'{period} D', min_periods=0).sum()\n        # Merge running_uv_dose back into snapshots\n    return snapshots.merge(\n        workouts_by_day, how='left', left_on='timestamp', right_index=True, s

In [388]:
# Fourth hit returned for the example query.
docs[3]

Document(page_content="def _uv_intensity(samples):\n    samples['uv_intensity_clearsky'] = samples['clear_sky_uv_index'] / 40\n    samples['uv_intensity_corrected'] = samples['corrected_uv_index'] / 40", metadata={'source': 'domain/silver/workouts.py', 'file_path': 'domain/silver/workouts.py', 'file_name': 'workouts.py', 'file_type': '.py', 'summary': 'The code calculates the UV intensity for clear sky and corrected UV index by dividing respective values by 40.'})