In [1]:
import os
import torch

In [2]:
# Save model to the Hugging Face Hub
#https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/mixtral-8x7b-for-web-qa.ipynb

In [3]:
# Helpful links

# https://docs.haystack.deepset.ai/docs/custom-components
# https://docs.haystack.deepset.ai/docs/promptbuilder
# https://github.com/deepset-ai/haystack/blob/main/examples/retrievers/in_memory_bm25_documentsearch.py
# https://github.com/deepset-ai/haystack-tutorials/tree/main/tutorials

In [4]:
import requests
import json
import os
import glob
import fnmatch
import pandas as pd
import markdown
from html import unescape
from bs4 import BeautifulSoup
import secrets

from haystack import Document, Pipeline
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.builders.prompt_builder import PromptBuilder

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Collect from GTN

# Root folder that is walked for markdown files.
directory_path = "../gtn-data/"
# Shared accumulator: the collection loops in this notebook append their Documents here.
docs = list()

def read_md_file_1(path):
    """Return the raw text of the file at ``path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default locale encoding.

    Args:
        path: Location of the file to read.

    Returns:
        The complete file content as a string, with any markup left untouched.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()

def read_md_file(file_path):
    """Load a Markdown file and hand back its plain-text rendering.

    Args:
        file_path: Path of the Markdown file; it is decoded as UTF-8.

    Returns:
        Whatever ``extract_plain_text_from_md`` returns for the file's content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_markdown = handle.read()
    return extract_plain_text_from_md(raw_markdown)

def extract_plain_text_from_md(md_content):
    """Convert Markdown source into plain text.

    The Markdown is rendered to HTML, then every text node of that HTML is
    concatenated, so tags are dropped while the text they wrap is kept.

    Args:
        md_content: Markdown source as a string.

    Returns:
        The concatenated text with leading and trailing whitespace removed.
    """
    html_content = markdown.markdown(md_content)
    soup = BeautifulSoup(html_content, "html.parser")
    # find_all(string=True) is the supported spelling of the deprecated
    # findAll(text=True): it selects the same text nodes without the warning.
    plain_text = ''.join(soup.find_all(string=True))
    return plain_text.strip()

def generate_hex_id(length):
    """Return a random lowercase hexadecimal string of exactly ``length`` characters.

    Args:
        length: Number of hex characters wanted (0 yields an empty string).

    Returns:
        A string of ``length`` characters drawn from ``0-9a-f``, generated
        with the ``secrets`` module.

    Raises:
        ValueError: If ``length`` is negative enough to request a negative
            number of random bytes.
    """
    # One random byte produces two hex digits. Round the byte count up and
    # trim, so an odd length is not returned one character short.
    n_bytes = (length + 1) // 2
    return secrets.token_hex(n_bytes)[:length]
    
# Narrower alternative filter (single tutorial), kept for reference:
#included_content = "/topics/statistics/tutorials/intro_deep_learning/"

# Only markdown files whose path contains this fragment are indexed.
included_content = "/topics/"

for root, dirs, files in os.walk(directory_path):
    for filename in files:
        if not fnmatch.fnmatch(filename, '*.md'):
            continue
        path = os.path.join(root, filename)
        # The filter is written with forward slashes; normalise the separator
        # so it also matches on platforms where os.sep is not "/".
        if included_content not in path.replace(os.sep, "/"):
            continue
        md_plain_text = read_md_file(path)
        # Each document gets a random 10-character hex id.
        doc = Document(content=md_plain_text, id=generate_hex_id(10))
        docs.append(doc)

  plain_text = ''.join(BeautifulSoup(html_content, "html.parser").findAll(text=True))


In [17]:
# Sanity check: number of documents collected so far.
len(docs)

1390

In [6]:
# Collect from PRs
# process PRs
# Each JSON page holds a list of pull requests; only the part of a PR body that
# precedes the "## How to test the changes" heading is indexed.
# sorted() keeps the document order reproducible, since glob order is arbitrary.
for json_file in sorted(glob.glob("../out/github_pr_page_*.json")):
    with open(json_file, "r", encoding="utf-8") as fin:
        doc_json = json.load(fin)
    for pr in doc_json:
        pr_text = pr["body"]
        if pr_text is None:
            continue
        useful_text_limit = pr_text.find("## How to test the changes")
        # find() gives -1 when the heading is absent and 0 when it opens the
        # body; in both cases the PR is skipped.
        # NOTE(review): PRs without the heading are dropped entirely - confirm this is intended.
        if useful_text_limit > 0:
            pr_text = pr_text[:useful_text_limit].strip()
            doc = Document(content=pr_text, id=generate_hex_id(10))
            docs.append(doc)

In [7]:
# Preview the first five collected documents.
docs[:5]

[Document(id=bddc50e887, content: 'layout: topic
 topic_name: metabolomics'),
 Document(id=7725f8cb36, content: 'Topic name
 Please refer to the CONTRIBUTING.md before adding or updating any material'),
 Document(id=b965667306, content: 'layout: tutorial_hands_on
 title: 'Mass spectrometry imaging: Finding differential analytes'
 zenodo_l...'),
 Document(id=2e0e2d59d7, content: 'layout: workflow-list'),
 Document(id=735f04b2df, content: 'layout: faq-page')]

5044

In [10]:
from haystack import Document, Pipeline
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.builders.prompt_builder import PromptBuilder


# In-memory store that will hold every collected document.
document_store = InMemoryDocumentStore()
# BM25 retriever over that store; returns 5 documents unless a call passes its own top_k.
doc_retriever = InMemoryBM25Retriever(document_store=document_store, top_k=5)
# Kept as the last expression so the value returned by write_documents is displayed.
document_store.write_documents(docs)

5044

In [38]:
# Smoke-test the retriever: ask for the single best match for a one-word query.
doc_retriever.run(query="Bioinformatics", top_k=1)

Ranking by BM25...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5044/5044 [00:00<00:00, 9606.80 docs/s]


{'documents': [Document(id=48f39e54e5, content: 'Teaching and Hosting Galaxy trainings
  Galaxy is a great solution to train the bioinformatics concept...', score: 8.223189013271442)]}

In [14]:
# PromptBuilder renders its template with Jinja2, so placeholders must be
# written as {{ ... }} / {% ... %}. Single-brace text such as {query} is emitted
# literally and gives the builder no `documents` or `query` input to fill.
qa_template = """<s>[INST] Using the information contained in the context, answer the question (using a maximum of two sentences).
  If the answer cannot be deduced from the context, answer "I don't know."
  Context: {% for document in documents %}{{ document.content }} {% endfor %};
  Question: {{ query }}
  [/INST]"""


prompt_builder = PromptBuilder(template=qa_template)

In [33]:
from getpass import getpass

# Prefer the HF_TOKEN environment variable when it is set, so the notebook can
# run unattended; otherwise prompt interactively without echoing the token.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("<<>>")

<<>> ········


In [34]:
from haystack.components.generators import HuggingFaceTGIGenerator
from haystack.utils import Secret

# Single source of truth for the model served by the generator. The value keeps
# the model that was previously hard-coded in the constructor call.
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # alternative: "mistralai/Mistral-7B-Instruct-v0.1"

llm = HuggingFaceTGIGenerator(model_name, token=Secret.from_token(HF_TOKEN))
llm.warm_up()

OSError: mistralai/Mixtral-8x7B-Instruct-v0.1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

NameError: name 'PromptNode' is not defined

In [None]:
rag_pipeline = Pipeline()
rag_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
rag_pipeline.add_component(instance=retriever, name="retriever", inputs=["Query"])
rag_pipeline.add_component(instance=prompt_node, name="prompt_node", inputs=["retriever"])

In [None]:
from pprint import pprint
print_answer = lambda out: pprint(out["results"][0].strip())

In [None]:
print_answer(rag_pipeline.run(query="I would suggest installing the refseq_masher package. I checked earlier, and found it in the toolshed. \
Please, this package will help a lot."))