In [1]:
# LangChain

from langchain.schema import HumanMessage, SystemMessage
from langchain.schema.document import Document

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.text_splitter import Language

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

import time
from typing import List
from pydantic import BaseModel


https://python.langchain.com/docs/modules/data_connection/document_transformers/code_splitter/

In [2]:
import requests
import nbformat
import json

# Extracts the python code from an .ipynb file from github
def extract_python_code_from_ipynb(github_url,cell_type = "code"):
    raw_url = github_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

    response = requests.get(raw_url)
    response.raise_for_status()  # Check for any request errors

    notebook_content = response.text

    notebook = nbformat.reads(notebook_content, as_version=nbformat.NO_CONVERT)

    python_code = None

    for cell in notebook.cells:
        if cell.cell_type == cell_type:
          if not python_code:
            python_code = cell.source
          else:
            python_code += "\n" + cell.source

    return python_code

# Extracts the python code from an .py file from github
def extract_python_code_from_py(github_url):
    raw_url = github_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

    response = requests.get(raw_url)
    response.raise_for_status()  # Check for any request errors

    python_code = response.text

    return python_code

with open('./code_files_urls.txt') as f:
    code_files_urls = f.read().splitlines()

In [3]:
code_strings = []

for i in range(0, len (code_files_urls)):
    if code_files_urls[i].endswith(".ipynb"):
        content = extract_python_code_from_ipynb(code_files_urls[i],"code")
        doc = Document(page_content=content, metadata= {"url": code_files_urls[i], "file_index":i})
        code_strings.append(doc)
code_strings[0]

Document(page_content='!pip install huggingface-hub -Uqq\n!pip install -Uqq sagemaker\nfrom huggingface_hub import snapshot_download\nfrom pathlib import Path\n\nlocal_model_path = Path("./bge-m3-model")\nlocal_model_path.mkdir(exist_ok=True)\nmodel_name = "BAAI/bge-m3"\ncommit_hash = "4277867103fc67328e2033176de4387b85e9960f"\nsnapshot_download(repo_id=model_name, revision=commit_hash, cache_dir=local_model_path)\n!pip install modelscope -i https://pypi.tuna.tsinghua.edu.cn/simple -Uqq\nfrom modelscope.hub.snapshot_download import snapshot_download\nfrom pathlib import Path\n\nlocal_model_path = Path("./bge-zh-model")\n\nlocal_model_path.mkdir(exist_ok=True)\nmodel_name = "Xorbits/bge-large-zh-v1.5"\ncommit_hash = "v0.0.1"\n\nsnapshot_download(model_name, revision=commit_hash, cache_dir=local_model_path)\nimport sagemaker\nfrom sagemaker import image_uris\nimport boto3\nimport os\nimport time\nimport json\n\nrole = sagemaker.get_execution_role()  # execution role for the endpoint\nses

In [4]:
import requests, time

#Crawls a GitHub repository and returns a list of all ipynb files in the repository
def crawl_github_repo(url,is_sub_dir,access_token = f"{GITHUB_TOKEN}"):

    ignore_list = ['__init__.py']

    if not is_sub_dir:
        api_url = f"https://api.github.com/repos/{url}/contents"
    else:
        api_url = url

    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"Bearer {access_token}" 
                   }

    response = requests.get(api_url, headers=headers)
    response.raise_for_status()  # Check for any request errors

    files = []

    contents = response.json()

    for item in contents:
        if item['type'] == 'file' and item['name'] not in ignore_list and (item['name'].endswith('.py') or item['name'].endswith('.ipynb')):
            files.append(item['html_url'])
        elif item['type'] == 'dir' and not item['name'].startswith("."):
            sub_files = crawl_github_repo(item['url'],True)
            time.sleep(.1)
            files.extend(sub_files)

    return files

NameError: name 'GITHUB_TOKEN' is not defined

In [9]:
from langchain.embeddings import BedrockEmbeddings
from langchain_community.chat_models import BedrockChat
from langchain_core.messages import HumanMessage,AIMessage
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder,HumanMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser


In [10]:
embeddings = BedrockEmbeddings(
    credentials_profile_name="default", region_name="us-east-1"
)

In [11]:
len(embeddings.embed_query("This is a content of the document"))



1536

In [12]:
# Chunk code strings
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,chunk_size=2000, chunk_overlap=200
)
texts = text_splitter.split_documents(code_strings)

In [80]:
from langchain_community.vectorstores import Chroma
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough


In [14]:
db = Chroma.from_documents(texts, embeddings)



In [15]:
retriever = db.as_retriever()



In [69]:
rag_template = \
"""you are a professional programmer,
please use the below reference code to response to user's request. 
<reference>
{context}
</reference>

The user question can be a code completion request.
Here is user's request:
{question}
"""

In [70]:

llm = BedrockChat(model_id="anthropic.claude-3-sonnet-20240229-v1:0",
                  model_kwargs={"temperature": 0.2,
                                "top_k":250,
                                "max_tokens": 1024,
                                "top_p":0.95,
                                # "stop_sequences":['</response>']
                               },
                  streaming=False,callbacks=[StreamingStdOutCallbackHandler()])

In [71]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [72]:
# retriever.get_relevant_documents('how deploy baichuan2 in sagemaker?')

In [73]:
prompt= ChatPromptTemplate.from_template(rag_template)


In [83]:
chain = ({"context":retriever | format_docs,"question":RunnablePassthrough()}) | prompt |llm|StrOutputParser()

In [84]:
chain.invoke('how deploy baichuan2 in sagemaker?')

'To deploy the Baichuan2 model on SageMaker, you can follow these steps:\n\n1. **Download the model checkpoint**\n```python\nfrom huggingface_hub import snapshot_download\nfrom pathlib import Path\n\nlocal_model_path_name = "./CSDC_buffer_baichuan2_13B_rag_4bits"\nmodel_hf_name = "csdc-atl/buffer-baichuan2-13B-rag-4bits"\nmodel_name = model_hf_name.split(\'/\')[-1]\n\nlocal_model_path = Path(local_model_path_name)\nlocal_model_path.mkdir(exist_ok=True)\ncommit_hash = \'107d6ef2ab9f77efc5d53ddab3d4a1621e531627\'\nsnapshot_download(repo_id=model_hf_name, revision=commit_hash, cache_dir=local_model_path)\n```\n\n2. **Upload the model to S3**\n```python\nimport sagemaker\nsess = sagemaker.session.Session()\nbucket = sess.default_bucket()\nregion = sess._region_name\n\ns3_model_prefix = f"aigc-llm-models/{model_name}"\nmodel_snapshot_path = list(local_model_path.glob("**/snapshots/*"))[0]\n\n!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}\n```\n\n3. **Set up the