In [1]:
import requests
import json
import os
import glob
import fnmatch
import pandas as pd
import markdown
from html import unescape
from bs4 import BeautifulSoup
from haystack import Document
from haystack.nodes import PreProcessor

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-22 14:10:41,459] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
# Collect from GTN

# Directory tree that the collection loop walks looking for Markdown files.
directory_path = "../../../gtn-data/"
# Shared accumulator: the collection cells append their haystack Documents here.
docs = list()

def read_md_file_1(path):
    """Return the raw, unprocessed text of the file at ``path``.

    The file is decoded as UTF-8 explicitly (matching ``read_md_file``) so the
    result does not depend on the platform's default locale encoding.

    Args:
        path: Location of the file to read.

    Returns:
        The complete file content as a single string.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()

def read_md_file(file_path):
    """Load a UTF-8 Markdown file and return its content as plain text.

    Args:
        file_path: Location of the Markdown file.

    Returns:
        Whatever ``extract_plain_text_from_md`` produces for the file's text.
    """
    with open(file_path, mode='r', encoding='utf-8') as md_handle:
        raw_markdown = md_handle.read()
    plain_text = extract_plain_text_from_md(raw_markdown)
    return plain_text

def extract_plain_text_from_md(md_content):
    """Convert Markdown source into plain text without any markup.

    The Markdown is rendered to HTML, the HTML is parsed, and all of its
    string nodes are concatenated in document order.

    Args:
        md_content: Markdown source text.

    Returns:
        The concatenated text with leading and trailing whitespace removed.
    """
    html_content = markdown.markdown(md_content)
    soup = BeautifulSoup(html_content, "html.parser")
    # `findAll(text=True)` is the deprecated spelling and triggers a warning on
    # current BeautifulSoup releases; `find_all(string=True)` selects the same
    # string nodes.
    plain_text = ''.join(soup.find_all(string=True))
    return plain_text.strip()
    
# Path fragment a Markdown file must contain to be indexed. It can be narrowed
# to a single tutorial, e.g.
# "/topics/statistics/tutorials/intro_deep_learning/"
included_content = "/topics/"

for root, dirs, files in os.walk(directory_path):
    for filename in fnmatch.filter(files, '*.md'):
        path = os.path.join(root, filename)
        if included_content not in path:
            continue
        # Name each document after the last three components of its path.
        tutorial_name = "_".join(path.split("/")[-3:])
        md_plain_text = read_md_file(path)
        pr_dict = {"content": md_plain_text, "meta": {"name": tutorial_name}}
        docs.append(Document.from_json(json.dumps(pr_dict)))

  plain_text = ''.join(BeautifulSoup(html_content, "html.parser").findAll(text=True))


In [4]:
# Collect from PRs

# Each PR body is cut at the "## How to test the changes" heading and only the
# text above it is indexed. PRs without a body, without the heading, or whose
# body starts with the heading are skipped.
for json_file in glob.glob("../out/github_pr_page_*.json"):
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(json_file, "r", encoding="utf-8") as fin:
        doc_json = json.load(fin)
    # The file is already closed here; only the parsed data is needed below.
    for pr in doc_json:
        pr_text = pr["body"]
        if pr_text is None:
            continue
        useful_text_limit = pr_text.find("## How to test the changes")
        if useful_text_limit <= 0:
            # -1: heading absent; 0: nothing precedes the heading.
            continue
        pr_text = pr_text[:useful_text_limit].strip()
        if not pr_text:
            # Only whitespace preceded the heading; an empty document adds
            # nothing to the index.
            continue
        pr_dict = {"content": pr_text, "meta": {"name": pr["number"]}}
        doc = Document.from_json(json.dumps(pr_dict))
        docs.append(doc)

In [5]:
# Preprocessing options: the three clean-up switches are enabled and splitting
# is word-based (split_length=200, no overlap, sentence boundaries respected).
preprocessing_options = dict(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
    language="en",
)
processor = PreProcessor(**preprocessing_options)

# Run the preprocessor over everything collected in `docs`.
preprocessed_docs = processor.process(docs)

Preprocessing:   0%|                                                                                                                         | 0/5018 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Document 765879b873e5da1f6844b2b004354251 is 15953 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing:  11%|████████████▌                                                                                                 | 572/5018 [00:02<00:15, 279.66docs/s]Document bd56adb5222bde92e6e6a446ac4125db is 18398 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000

In [6]:
from haystack.document_stores import InMemoryDocumentStore

# In-memory document store created with BM25 enabled (use_bm25=True); the
# BM25 retriever built later in the notebook reads from it.
document_store = InMemoryDocumentStore(use_bm25=True)

In [7]:
# Write the preprocessed documents into the document store.
document_store.write_documents(preprocessed_docs)

Updating BM25 representation...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 11966/11966 [00:00<00:00, 15995.83 docs/s]


In [8]:
from haystack import Pipeline
from haystack.nodes import BM25Retriever, PromptNode, PromptTemplate

In [9]:
# BM25 retriever over the document store. NOTE(review): top_k=4 presumably caps
# the number of documents returned per query — confirm against the haystack docs.
retriever = BM25Retriever(document_store, top_k=4)

In [10]:
# a good Question Answering template, adapted for the instruction format
# (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
from haystack.nodes import PromptNode
from getpass import getpass

# Read the Hugging Face token interactively with getpass rather than
# hard-coding it in the notebook.
HF_TOKEN = getpass("<<>>")

# Prompt text wrapped in [INST] ... [/INST]; `{join(documents)}` and `{query}`
# are the placeholders left for the retrieved context and the question.
qa_prompt_text = """<s>[INST] Using the information contained in the context, answer the question (using a maximum of two sentences).
  If the answer cannot be deduced from the context, answer \"I don't know.\"
  Context: {join(documents)};
  Question: {query}
  [/INST]"""

qa_template = PromptTemplate(prompt=qa_prompt_text)

<<>> ········


In [19]:
# Alternative model id (currently disabled): "NousResearch/Llama-2-7b-chat-hf"
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Extra keyword arguments handed to PromptNode through `model_kwargs`.
prompt_model_kwargs = {"model_max_length": 8000}

# Prompt node using the QA template as its default prompt and the token read
# earlier as its API key.
prompt_node = PromptNode(
    model_name_or_path=model_name,
    default_prompt_template=qa_template,
    api_key=HF_TOKEN,
    model_kwargs=prompt_model_kwargs,
    max_length=5500,
)

In [20]:
# Two-stage pipeline: the query feeds the retriever, whose output feeds the
# prompt node.
rag_pipeline = Pipeline()
for node, node_name, node_inputs in (
    (retriever, "retriever", ["Query"]),
    (prompt_node, "prompt_node", ["retriever"]),
):
    rag_pipeline.add_node(component=node, name=node_name, inputs=node_inputs)

In [21]:
from pprint import pprint


def print_answer(out):
    """Pretty-print the first result of a pipeline run.

    Args:
        out: Mapping with a "results" key holding a sequence of strings.

    Returns:
        None; the first result, stripped of surrounding whitespace, is
        written with ``pprint``.

    Raises:
        KeyError: If ``out`` has no "results" key.
        IndexError: If ``out["results"]`` is empty.
    """
    # A named function instead of a lambda bound to a name: same call
    # interface, but it carries a real name in tracebacks and a docstring.
    first_result = out["results"][0]
    pprint(first_result.strip())

In [22]:
# Example query. The adjacent string literals are concatenated into a single
# query string.
question = (
    "I would suggest installing the refseq_masher package. I checked earlier, and found it in the toolshed. "
    "Please, this package will help a lot."
)
pipeline_output = rag_pipeline.run(query=question)
print_answer(pipeline_output)

('The context provides instructions on how to create a pull request for a new '
 'recipe on bioconda. It also provides information on how to install external '
 'libraries in a conda environment and how to create a tool wrapper using '
 "Planemo. The context does not mention the refseq_masher package, so I don't "
 'know if it should be installed.')
