In [1]:
import os
from typing import List

import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
from transformers import pipeline

### Snippets with LLM


In [94]:
# The stock OpenAI client is pointed at the Blablador endpoint via `base_url`
# and wrapped with instructor so that `chat.completions.create` accepts a
# `response_model` (see get_snippets). Mode.MD_JSON is the instructor mode
# used for this endpoint.
api_key = os.getenv("BLABLADOR_API_KEY")
if not api_key:
    # Fail fast with a clear message. With api_key=None the OpenAI client
    # would fall back to OPENAI_API_KEY (sending that key to the Blablador
    # server) or fail later with a less specific error.
    raise RuntimeError(
        "BLABLADOR_API_KEY is not set; export it before running this notebook."
    )
# Plain OpenAI variant, kept for reference:
# client = instructor.patch(OpenAI(api_key=api_key))
client = instructor.patch(
    OpenAI(
        api_key=api_key,
        base_url="https://helmholtz-blablador.fz-juelich.de:8000/v1/",
    ),
    mode=instructor.Mode.MD_JSON,
)

In [91]:
# Few-shot variant of the prompt: three worked examples followed by the real
# inputs. Placeholders filled via str.format: {title}, {abstract}, {question}.
# NOTE: a later cell rebinds SNIPPET_TEMPLATE to a minimal template, so when
# the notebook is run top to bottom this version is replaced before it is used.
# Instruction text this variant was written for:
# Extract from the title and abstract ONLY directly relevant sentences or phrases relevant to the given question. If no such sentences or phrases found, return an empty list.
SNIPPET_TEMPLATE = """
Here are 3 examples:
[Title]: Rethinking ramoplanin: the role of substrate binding in inhibition of peptidoglycan biosynthesis
[Abstract]: Ramoplanin is a cyclicdepsipeptide antibiotic that inhibits peptidoglycan biosynthesis. It was proposed in 1990 to block the MurG step of peptidoglycan synthesis by binding to the substrate of MurG, Lipid I. The proposed mechanism of MurG inhibition has become widely accepted even though it was never directly tested. In this paper, we disprove the accepted mechanism for how ramoplanin functions, and we present an alternative mechanism. This work has implications for the design of ramoplanin derivatives and may influence how other proposed substrate binding antibiotics are studied.
[Question]: Which was the first adeno-associated virus vector gene therapy product approved in the United States?
[Extracted]:
Title sentences: [empty list] (no sentences or phrases that directly answer the question)
Abstract sentences: [empty list] (no sentences or phrases that directly answer the question)
******************
[Title]: Rethinking ramoplanin: the role of substrate binding in inhibition of peptidoglycan biosynthesis
[Abstract]: Ramoplanin is a cyclicdepsipeptide antibiotic that inhibits peptidoglycan biosynthesis. It was proposed in 1990 to block the MurG step of peptidoglycan synthesis by binding to the substrate of MurG, Lipid I. The proposed mechanism of MurG inhibition has become widely accepted even though it was never directly tested. In this paper, we disprove the accepted mechanism for how ramoplanin functions, and we present an alternative mechanism. This work has implications for the design of ramoplanin derivatives and may influence how other proposed substrate binding antibiotics are studied.
[Question]: Which antibiotics target peptidoglycan biosynthesis?
[Extracted]: 
Title sentences: ["Rethinking ramoplanin: the role of substrate binding in inhibition of peptidoglycan biosynthesis."]
Abstract sentences: ["Ramoplanin is a cyclicdepsipeptide antibiotic that inhibits peptidoglycan biosynthesis."]
******************
[Title]: Mycobacterium Avium Complex (MAC) Lung Disease in Two Inner City Community Hospitals: Recognition, Prevalence, Co-Infection with Mycobacterium Tuberculosis (MTB) and Pulmonary Function (PF) Improvements After Treatment.
[Abstract]: Over 4 years, we evaluated patients who had positive MAC cultures, MAC infection and coinfection with MTB. Lung disease was related/likely related to MAC in 21 patients (50%) and not related in 21 (50%). In patients with MAC-related lung disease, the primary physician did not consider the diagnosis except when that physician was a pulmonologist. Half of those with MAC-related lung disease were smokers, white and US-born. There were 12 immunocompetent patients with MTB and NTM cultures. Eleven were non-white and all were foreign-born. Presentation and clinical course were consistent with MTB. All 8 patients with abnormal PF improved. The prevalence of MAC lung infection in two inner city hospitals was four times higher than that of TB. The indication for treatment of MAC infection should also rely heavily on clinical and radiological evidence when there is only one positive sputum culture. The diagnosis was considered only when the admitting physician was a pulmonologist. Most patients with combined infection were clinically consistent with MTB and responded to anti MTB treatment alone. Treatment with anti-MAC therapy improved PF in those patients whose PF was abnormal to begin with.
[Question]: Is Mycobacterium avium less susceptible to antibiotics than Mycobacterium tuberculosis?
[Extracted]:
Title sentences: [empty list] (no sentences or phrases that directly answer the question) 
Abstract sentences: ["The prevalence of MAC lung infection in two inner city hospitals was four times higher than that of TB.", "Most patients with combined infection were clinically consistent with MTB and responded to anti MTB treatment alone."]
******************

Here is the data:

[Title]: {title} 
[Abstract]: {abstract}
[Question]: {question}
[Extracted]:

"""

In [801]:
# SNIPPET_TEMPLATE = """
# Here is the data:
# ******************
# [Title]: {title}
# [Abstract]: {abstract}
# Extract from the [Title] and [Abstract] complete sentences or parts of sentences (phrases) that contain the answer to the question: ```{question}```.
# If no such sentences or phrases found, return an empty list.
# """

In [96]:
# Minimal prompt body: only the three inputs, one per line. The extraction
# instructions themselves are sent as separate messages in get_snippets.
SNIPPET_TEMPLATE = (
    "\n"
    "Question: {question}\n"
    "Title: {title}\n"
    "Abstract: {abstract}\n"
)

In [604]:
# SNIPPET_TEMPLATE = """
# Extract from the title and abstract complete sentences or parts of sentences (phrases) that contain the answer to the given question. If no relevant sentences or phrases found, return an empty list.

# [Begin Examples]
# ******************
# [Title]: Rethinking ramoplanin: the role of substrate binding in inhibition of peptidoglycan biosynthesis
# [Abstract]: Ramoplanin is a cyclicdepsipeptide antibiotic that inhibits peptidoglycan biosynthesis. It was proposed in 1990 to block the MurG step of peptidoglycan synthesis by binding to the substrate of MurG, Lipid I. The proposed mechanism of MurG inhibition has become widely accepted even though it was never directly tested. In this paper, we disprove the accepted mechanism for how ramoplanin functions, and we present an alternative mechanism. This work has implications for the design of ramoplanin derivatives and may influence how other proposed substrate binding antibiotics are studied.
# [Extracted]:
# Title sentences that answer the question Which antibiotics target peptidoglycan biosynthesis?: ["Rethinking ramoplanin: the role of substrate binding in inhibition of peptidoglycan biosynthesis."]
# Abstract sentences that answer the question: ["Ramoplanin is a cyclicdepsipeptide antibiotic that inhibits peptidoglycan biosynthesis."]
# ******************
# [Title]: Mycobacterium Avium Complex (MAC) Lung Disease in Two Inner City Community Hospitals: Recognition, Prevalence, Co-Infection with Mycobacterium Tuberculosis (MTB) and Pulmonary Function (PF) Improvements After Treatment.
# [Abstract]: Over 4 years, we evaluated patients who had positive MAC cultures, MAC infection and coinfection with MTB. Lung disease was related/likely related to MAC in 21 patients (50%) and not related in 21 (50%). In patients with MAC-related lung disease, the primary physician did not consider the diagnosis except when that physician was a pulmonologist. Half of those with MAC-related lung disease were smokers, white and US-born. There were 12 immunocompetent patients with MTB and NTM cultures. Eleven were non-white and all were foreign-born. Presentation and clinical course were consistent with MTB. All 8 patients with abnormal PF improved. The prevalence of MAC lung infection in two inner city hospitals was four times higher than that of TB. The indication for treatment of MAC infection should also rely heavily on clinical and radiological evidence when there is only one positive sputum culture. The diagnosis was considered only when the admitting physician was a pulmonologist. Most patients with combined infection were clinically consistent with MTB and responded to anti MTB treatment alone. Treatment with anti-MAC therapy improved PF in those patients whose PF was abnormal to begin with.
# [Extracted]:
# Title sentences that answer the question Is Mycobacterium avium less susceptible to antibiotics than Mycobacterium tuberculosis?: []
# Abstract sentences the question Is Mycobacterium avium less susceptible to antibiotics than Mycobacterium tuberculosis?: ["The prevalence of MAC lung infection in two inner city hospitals was four times higher than that of TB.", "Most patients with combined infection were clinically consistent with MTB and responded to anti MTB treatment alone."]
# ******************
# [Title]: Rethinking ramoplanin: the role of substrate binding in inhibition of peptidoglycan biosynthesis
# [Abstract]: Ramoplanin is a cyclicdepsipeptide antibiotic that inhibits peptidoglycan biosynthesis. It was proposed in 1990 to block the MurG step of peptidoglycan synthesis by binding to the substrate of MurG, Lipid I. The proposed mechanism of MurG inhibition has become widely accepted even though it was never directly tested. In this paper, we disprove the accepted mechanism for how ramoplanin functions, and we present an alternative mechanism. This work has implications for the design of ramoplanin derivatives and may influence how other proposed substrate binding antibiotics are studied.
# [Extracted]:
# Title sentences that answer the question Which was the first adeno-associated virus vector gene therapy product approved in the United States?: []
# Abstract sentences the question Which was the first adeno-associated virus vector gene therapy product approved in the United States?: []
# ******************
# [End Examples]

# Here is the data:
# ******************
# [Title]: {title}
# [Abstract]: {abstract}
# Title sentences that answer the question {question}:
# Abstract sentences the question {question}:
# """

In [9]:
# Structured answer requested from the LLM (passed as `response_model` in
# get_snippets). The Field descriptions are part of the schema shown to the
# model, so they double as per-field instructions.
# NOTE: documented with comments rather than a class docstring on purpose; a
# docstring would be copied into the model's JSON schema description and so
# would change what is sent to the LLM.
class Snippets(BaseModel):
    # Sentences/phrases are plain strings; typing the items as `str` puts the
    # item type into the schema and rejects non-string items on validation.
    title_sentences: List[str] = Field(
        ...,
        description="ONLY sentences or phrases from title that directly answer the given question, or empty list if no answer present",
    )
    abstract_sentences: List[str] = Field(
        ...,
        description="ONLY sentences or phrases from abstract that directly answer the question, or empty list if no answer present",
    )
    # Self-reported confidence; the 0..1 range is requested via the
    # description only and is not enforced by validation.
    score: float = Field(
        ..., description="how confident are you: score between 0 and 1"
    )
    # Free-text reasoning the model is asked to provide alongside the answer.
    chain_of_thought: str = Field(
        ...,
        description="Think step by step to make a good decision. Are there extracted sentences that directly answer the question?",
    )


# Wrapper model that nests a Snippets payload under a single `snippets` key.
# NOTE(review): get_snippets passes response_model=Snippets, so this wrapper is
# not the model used for parsing in the code visible here.
class ExtractedSnippets(BaseModel):
    snippets: Snippets

In [473]:
# class Snippets(BaseModel):
#     title_sentences: List
#     abstract_sentences: List


# class ExtractedSnippets(BaseModel):
#     snippets: Snippets

In [95]:
def get_snippets(
    question: str,
    title: str,
    abstract: str,
    model: str = "Mixtral-8x7B-Instruct-v0.1",
) -> Snippets:
    """Ask the LLM for the title/abstract sentences that directly answer a question.

    The three inputs are rendered into ``SNIPPET_TEMPLATE`` and sent, after a
    system prompt and a short instruction message, through the module-level
    instructor-patched ``client``.

    Args:
        question: The question to be answered.
        title: Title of the article to extract from.
        abstract: Abstract of the article to extract from.
        model: Name of the chat model served by the endpoint. Defaults to the
            Mixtral model used so far; other names tried in this notebook
            include "Mistral-7B-Instruct-v0.2" and "gpt-3.5-turbo".

    Returns:
        The response parsed into ``Snippets`` (``response_model=Snippets``):
        ``title_sentences``, ``abstract_sentences``, ``score`` and
        ``chain_of_thought``.
    """
    submission = SNIPPET_TEMPLATE.format(
        question=question, title=title, abstract=abstract
    )
    messages = [
        {
            "role": "system",
            "content": "You are a world class system to extract relevant sentences from titles and abstracts answering questions",
        },
        {
            "role": "user",
            # Typo fix: the instruction previously read "sentences of phrases".
            "content": "Extract from the title and abstract ONLY sentences or phrases that directly answer the question",
        },
        {
            "role": "user",
            "content": submission,
        },
    ]
    return client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,  # zero temperature, as in the original call
        response_model=Snippets,
        max_tokens=1000,
    )

In [893]:
# Example inputs: HIV-1 Vif article. Alternative questions tried with it:
# question = "Is Mycobacterium avium less susceptible to antibiotics than Mycobacterium tuberculosis?"
# question = "Is London a safe city to live in?"
# question = "Please list 2 human diseases caused by a coronavirus."
question = "What is the mechanism by which HIV-1-encoded Vif protein allows virus replication?"
title = (
    "HIV-1 subtype variability in Vif derived from molecular clones "
    "affects APOBEC3G-mediated host restriction"
)
abstract = "Background: The host protein APOBEC3G (A3G) can limit HIV-1 replication. Its protective effect is overcome by the HIV-1 'viral infectivity factor' (Vif), which targets A3G for proteosomal degradation. Although Vif is considered to be essential for HIV-1 replication, the effect of Vif variability among commonly used HIV-1 molecular clones of different genetic backgrounds on viral infectiousness and pathogenesis has not been fully determined. Methods: We cloned the intact Vif coding regions of available molecular clones of different subtypes into expression vectors. Δvif full-length HIV-1 clonal variants were generated from corresponding subtype-specific full-length molecular clones. Replication-competent viruses were produced in 293T cells in the presence or absence of A3G, with Vif being supplied by the full-length HIV-1 clone or in trans. The extent of A3G-mediated restriction was then determined in a viral replication assay using a reporter cell line. Results and conclusions: In the absence of A3G, Vif subtype origin did not impact viral replication. In the presence of A3G the subtype origin of Vif had a differential effect on viral replication. Vif derived from a subtype C molecular clone was less effective at overcoming A3G-mediated inhibition than Vif derived from either subtype B or CRF02_AG molecular clones."

In [11]:
# Example inputs: coronavirus review article. These rebind question/title/
# abstract, so they are what the get_snippets call below receives.
question, title = (
    "Please list 2 human diseases caused by a coronavirus.",
    "Host Factors in Coronavirus Replication",
)
abstract = "Coronaviruses are pathogens with a serious impact on human and animal health. They mostly cause enteric or respiratory disease, which can be severe and life threatening, e.g., in the case of the zoonotic coronaviruses causing severe acute respiratory syndrome (SARS) and Middle East Respiratory Syndrome (MERS) in humans. Despite the economic and societal impact of such coronavirus infections, and the likelihood of future outbreaks of additional pathogenic coronaviruses, our options to prevent or treat coronavirus infections remain very limited. This highlights the importance of advancing our knowledge on the replication of these viruses and their interactions with the host. Compared to other +RNA viruses, coronaviruses have an exceptionally large genome and employ a complex genome expression strategy. Next to a role in basic virus replication or virus assembly, many of the coronavirus proteins expressed in the infected cell contribute to the coronavirus-host interplay. For example, by interacting with the host cell to create an optimal environment for coronavirus replication, by altering host gene expression or by counteracting the host's antiviral defenses. These coronavirus-host interactions are key to viral pathogenesis and will ultimately determine the outcome of infection. Due to the complexity of the coronavirus proteome and replication cycle, our knowledge of host factors involved in coronavirus replication is still in an early stage compared to what is known for some other +RNA viruses. This review summarizes our current understanding of coronavirus-host interactions at the level of the infected cell, with special attention for the assembly and function of the viral RNA-synthesising machinery and the evasion of cellular innate immune responses."

In [97]:
# Run the LLM extraction on the currently bound question/title/abstract.
resp = get_snippets(question, title, abstract)

In [98]:
# Print the four parsed fields, one per line.
print(
    resp.title_sentences,
    resp.abstract_sentences,
    resp.score,
    resp.chain_of_thought,
    sep="\n",
)

[]
['They mostly cause enteric or respiratory disease, which can be severe and life threatening, e.g., in the case of the zoonotic coronaviruses causing severe acute respiratory syndrome (SARS) and Middle East Respiratory Syndrome (MERS) in humans.']
0.9
The title does not provide any direct answers to the question. In the abstract, I found one sentence mentioning two human diseases caused by coronaviruses: SARS and MERS. I am confident in this answer, but I did not find any matching sentences in the title.


In [844]:
# correct
# title: []
# abstract:
# They mostly cause enteric or respiratory disease, which can be severe and life threatening, e.g., in the case of the zoonotic coronaviruses causing
# severe acute respiratory syndrome (SARS) and Middle East Respiratory Syndrome (MERS) in humans.

In [None]:
# correct
# title: []
# abstract:
# The host protein APOBEC3G (A3G) can limit HIV-1 replication. Its protective effect is overcome by the HIV-1 'viral infectivity factor' (Vif), which targets A3G for proteosomal degradation

In [899]:
# print(SNIPPET_TEMPLATE.format(question=question, title=title, abstract=abstract))

### Snippets with extractive QA models


In [11]:
# Inputs for the extractive-QA section: the HIV-1 Vif example, bound again
# because the coronavirus example was assigned to the same names earlier.
question = "What is the mechanism by which HIV-1-encoded Vif protein allows virus replication?"
title = (
    "HIV-1 subtype variability in Vif derived from molecular clones "
    "affects APOBEC3G-mediated host restriction"
)
abstract = "Background: The host protein APOBEC3G (A3G) can limit HIV-1 replication. Its protective effect is overcome by the HIV-1 'viral infectivity factor' (Vif), which targets A3G for proteosomal degradation. Although Vif is considered to be essential for HIV-1 replication, the effect of Vif variability among commonly used HIV-1 molecular clones of different genetic backgrounds on viral infectiousness and pathogenesis has not been fully determined. Methods: We cloned the intact Vif coding regions of available molecular clones of different subtypes into expression vectors. Δvif full-length HIV-1 clonal variants were generated from corresponding subtype-specific full-length molecular clones. Replication-competent viruses were produced in 293T cells in the presence or absence of A3G, with Vif being supplied by the full-length HIV-1 clone or in trans. The extent of A3G-mediated restriction was then determined in a viral replication assay using a reporter cell line. Results and conclusions: In the absence of A3G, Vif subtype origin did not impact viral replication. In the presence of A3G the subtype origin of Vif had a differential effect on viral replication. Vif derived from a subtype C molecular clone was less effective at overcoming A3G-mediated inhibition than Vif derived from either subtype B or CRF02_AG molecular clones."

In [12]:
# Extractive-QA baseline: a Hugging Face "question-answering" pipeline run on
# the current question against the abstract. `pipeline` is imported in the
# notebook's top import cell.

# Checkpoints tried so far; exactly one model/tokenizer pair is active.
# model = "deutsche-telekom/bert-multi-english-german-squad2"
# tokenizer = "deutsche-telekom/bert-multi-english-german-squad2"
# model = "deepset/gelectra-large-germanquad"
# tokenizer = "deepset/gelectra-large-germanquad"
# model = "Abdullah22/my_finetuned_german_gelectra3"
# tokenizer = "Abdullah22/my_finetuned_german_gelectra3"
# model = "themariolinml/roberta-base-sqaud2-on-medical_meadow_medqa-v1"
# tokenizer = "themariolinml/roberta-base-sqaud2-on-medical_meadow_medqa-v1"
model = "bigwiz83/sapbert-from-pubmedbert-squad2"
tokenizer = "bigwiz83/sapbert-from-pubmedbert-squad2"

# top_k=5 asks for the five best candidate spans.
# handle_impossible_answer=False keeps the empty "no answer" span out of the
# candidates.
# NOTE(review): max_question_len=15 is small; confirm that longer questions
# are not truncated for the tokenizer in use.
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    top_k=5,
    max_seq_len=512,
    max_question_len=15,
    max_answer_len=512,
    handle_impossible_answer=False,
    # torch_dtype=torch.bfloat16
)

# Adversarial check used during experiments:
# question = "Is London good?"
response = qa_pipeline(question, abstract)

In [976]:
# ideal answer
# The HIV-1 Vif protein counteracts the antiviral activity of the APOBEC3 family by targeting the proteins for degradation through
# the ubiquitin-proteasome pathway. More specifically, Vif, serving as a substrate receptor, facilitates ubiquitination of APOBEC3 proteins by forming
# a Cullin5-based E3 ubiquitin ligase complex, which targets APOBEC3 proteins for rapid proteasomal degradation.'

# correct span:
# The host protein APOBEC3G (A3G) can limit HIV-1 replication. Its protective effect is overcome by the HIV-1 'viral infectivity factor' (Vif),
# which targets A3G for proteosomal degradation

In [977]:
# These 2 are quite good for this example: they catch a shorter span inside the correct span.
# model = "deepset/gelectra-large-germanquad"
# model = "Abdullah22/my_finetuned_german_gelectra3"
# They would still fall for an adversarial question like "Is London good?", but the probabilities are much lower.
# With handle_impossible_answer=True, the second model would even return
# {'score': 0.019482115283608437, 'start': 0, 'end': 0, 'answer': ''}. Setting it to True might be tricky,
# since the model can place the no-answer at top-1 with a very high probability.
# bigwiz83/sapbert-from-pubmedbert-squad2 is also okay; it seems to be more robust to adversarial non-medical questions.

In [4]:
# Display the question that was passed to the QA pipeline.
question

'What is the mechanism by which HIV-1-encoded Vif protein allows virus replication?'

In [13]:
# Display the pipeline result: candidate answers with score and character
# offsets (start/end) into the abstract.
response

[{'score': 0.6542913317680359,
  'start': 160,
  'end': 199,
  'answer': 'targets A3G for proteosomal degradation'},
 {'score': 0.13188529014587402,
  'start': 176,
  'end': 199,
  'answer': 'proteosomal degradation'},
 {'score': 0.04295399412512779,
  'start': 121,
  'end': 199,
  'answer': "viral infectivity factor' (Vif), which targets A3G for proteosomal degradation"},
 {'score': 0.0422237403690815,
  'start': 160,
  'end': 200,
  'answer': 'targets A3G for proteosomal degradation.'},
 {'score': 0.031905144453048706,
  'start': 160,
  'end': 171,
  'answer': 'targets A3G'}]