In [1]:
import json
import os

import requests

In [6]:
# Download a consecutive range of pull-request listing pages for
# galaxyproject/galaxy from the GitHub REST API and dump them into one JSON file.
offset = 21    # first page number to request
n_pages = 30   # how many consecutive pages to request
path = "../out/"

# Never hardcode credentials in a notebook: take the token from the environment.
# When GITHUB_TOKEN is not set the requests are sent unauthenticated.
headers = {'Accept': 'application/json'}
github_token = os.environ.get("GITHUB_TOKEN")
if github_token:
    headers['Authorization'] = 'Bearer {}'.format(github_token)

prs = []  # PR objects accumulated over all requested pages
for i in range(offset, offset + n_pages):
    print(i)
    page_path = "https://api.github.com/repos/galaxyproject/galaxy/pulls?page={}&state=all&per_page=100".format(i)
    # A timeout keeps a stalled connection from hanging the cell forever.
    r = requests.get(page_path, allow_redirects=True, headers=headers, timeout=60)
    # Fail loudly on HTTP errors (bad credentials, rate limiting, ...) instead of
    # silently mixing an error payload into the PR list.
    r.raise_for_status()
    page_prs = r.json()
    if not isinstance(page_prs, list):
        raise ValueError("Unexpected response for page {}: {!r}".format(i, page_prs))
    prs.extend(page_prs)
    if i % 5 == 0 and i > 0:
        print("Pages finished: {}".format(i))
print("Writing PRs to file...")
# The upper bound in the file name is exclusive: the last page fetched is
# offset + n_pages - 1.
out_file = path + "github_pr_page_{}-{}.json".format(offset, offset + n_pages)
with open(out_file, 'w') as fout:
    n_chars_written = fout.write(json.dumps(prs))
# Keep the number of characters written as the cell's displayed result.
n_chars_written

21
22
23
24
25
Pages finished: 25
26
27
28
29
30
Pages finished: 30
31
32
33
34
35
Pages finished: 35
36
37
38
39
40
Pages finished: 40
41
42
43
44
45
Pages finished: 45
46
47
48
49
50
Pages finished: 50
Writing PRs to file...


58365621

In [18]:
# Build Haystack Documents from the downloaded PR JSON pages
import glob,json
from haystack import Document
from haystack.nodes import PreProcessor

# Heading of the PR description template; only the text before it is kept.
PR_TEMPLATE_MARKER = "## How to test the changes"

docs = []  # one Haystack Document per PR whose body contains the marker

print("Creating Haystack document store...")

# glob order depends on the file system; sort so that the order of `docs`
# (and everything derived from it) is the same on every run.
for json_file in sorted(glob.glob("../out/github_pr_page_*.json")):
    # Only the parsing needs the file handle; close it before processing.
    with open(json_file, "r") as fin:
        doc_json = json.load(fin)
    for pr in doc_json:
        pr_text = pr["body"]
        if pr_text is None:
            # PR without a description
            continue
        useful_text_limit = pr_text.find(PR_TEMPLATE_MARKER)
        if useful_text_limit <= 0:
            # -1: marker not present; 0: nothing precedes the marker
            continue
        pr_dict = {
            "content": pr_text[:useful_text_limit].strip(),
            "meta": {"name": pr["number"]},
        }
        # Build the Document straight from the dict; serialising it to a JSON
        # string only to parse it back is unnecessary.
        docs.append(Document.from_dict(pr_dict))

Creating Haystack document store...


In [19]:
# Sanity check: how many Documents were collected into `docs`.
len(docs)

3654

In [20]:
# Cleaning and chunking configuration applied to the PR texts.
processor = PreProcessor(
    language="en",
    # cleaning options
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
    # splitting options: word-based splits of length 200, no overlap,
    # sentence boundaries respected
    split_by="word",
    split_length=200,
    split_overlap=0,
    split_respect_sentence_boundary=True,
)
# Run the preprocessor over every Document collected in `docs`.
preprocessed_docs = processor.process(docs)

Preprocessing:   8%|████████▎                                                                                                    | 278/3654 [00:00<00:02, 1417.47docs/s]We found one or more sentences whose split count is higher than the split length.
Preprocessing:  55%|███████████████████████████████████████████████████████████▎                                                | 2005/3654 [00:01<00:01, 1104.01docs/s]Document a54b46bd20f7cd243598211e91981972 is 27560 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document 5298a0ed77b8b25aea6acf3858f36223 is 17560 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000

In [21]:
# Inspect the first preprocessed Document (content, meta and split id).
preprocessed_docs[0]

<Document: {'content': 'Follow up to #17553\nIgnore autogenerated CSS from the visualization build', 'content_type': 'text', 'score': None, 'meta': {'name': 17559, '_split_id': 0}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '507bc9e8922085954a7928cb73983ec8'}>

In [23]:
from haystack.document_stores import InMemoryDocumentStore

# In-memory document store with BM25 enabled; the BM25Retriever created later queries it.
document_store = InMemoryDocumentStore(use_bm25=True)

In [24]:
# Add the preprocessed Documents to the store (this also updates its BM25 representation).
document_store.write_documents(preprocessed_docs)

Updating BM25 representation...: 100%|████████████████████████████████████████████████████████████████████████████████████████| 3690/3690 [00:00<00:00, 22446.77 docs/s]


In [25]:
from haystack import Pipeline
from haystack.nodes import BM25Retriever, PromptNode, PromptTemplate

In [26]:
# BM25 retriever over `document_store`.
# NOTE(review): top_k=4 is presumably the number of Documents returned per query — confirm against the Haystack docs.
retriever = BM25Retriever(document_store, top_k=4)

In [29]:
# a good Question Answering template, adapted for the instruction format
# (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
from haystack.nodes import PromptNode
from getpass import getpass

# Prefer a token from the environment so the notebook can run unattended;
# otherwise ask for it interactively.
# The argument to getpass() is the PROMPT text and is echoed into the cell
# output, so it must never contain the token itself.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

# Question-answering prompt in the [INST] ... [/INST] instruction format:
# the retrieved documents are joined into the context and the user query is
# appended as the question.
qa_template = PromptTemplate(prompt=
  """<s>[INST] Using the information contained in the context, answer the question (using a maximum of two sentences).
  If the answer cannot be deduced from the context, answer \"I don't know.\"
  Context: {join(documents)};
  Question: {query}
  [/INST]""")

<<>> ········


In [30]:
# LLM node for the pipeline: Mistral-7B-Instruct, authenticated with HF_TOKEN,
# using `qa_template` as its default prompt.
prompt_node = PromptNode(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
    default_prompt_template=qa_template,
    api_key=HF_TOKEN,
    model_kwargs={"model_max_length": 8000},
    max_length=5500,
)

In [31]:
# Two-stage pipeline: Query -> retriever -> prompt_node.
rag_pipeline = Pipeline()
rag_pipeline.add_node(
    name="retriever",
    component=retriever,
    inputs=["Query"],
)
rag_pipeline.add_node(
    name="prompt_node",
    component=prompt_node,
    inputs=["retriever"],
)

In [32]:
from pprint import pprint
def print_answer(out):
    """Pretty-print the first generated answer of a pipeline result.

    Args:
        out: Mapping with a "results" key whose first element is the answer
            string; surrounding whitespace is stripped before printing.
    """
    pprint(out["results"][0].strip())

In [34]:
# Ask the pipeline a sample question and print the first answer it returns.
print_answer(rag_pipeline.run(query="How is autogenerated CSS created in Galaxy server?"))

"I don't know."
