In [None]:
%pip install transformers
%pip install accelerate

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. Called without
# arguments, so the access token is presumably requested interactively
# (widget/prompt) -- TODO confirm against the installed huggingface_hub.
login()

In [None]:
# Hugging Face Hub repository id of the model; the same id is reused below
# for the tokenizer and for the text-generation pipelines.
model = "Xingxian123/VaxLLM"
# Load the tokenizer associated with that model id.
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
# Target the first CUDA GPU when torch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Build a text-generation pipeline for the model id chosen above, with the
# weights requested in half precision.
# NOTE(review): float16 is requested even when `device` is "cpu" -- confirm
# that the installed torch build supports half-precision generation on CPU.
pipeline = transformers.pipeline(
    "text-generation",
    device=device,
    model=model,
    torch_dtype=torch.float16,
)

In [None]:
def read_and_split_file(file_path):
    """Read a UTF-8 text file and return its non-empty sections.

    The file content is split on every occurrence of a run of 80 ``=``
    characters. Each piece is stripped of surrounding whitespace, and
    pieces that are empty after stripping are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of stripped section strings, in file order.
    """
    separator = "=" * 80

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    sections = []
    for chunk in raw_text.split(separator):
        trimmed = chunk.strip()
        if trimmed:
            sections.append(trimmed)
    return sections

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM


# Set up the text-generation pipeline used for both classification and
# annotation. `device` is the value computed in the device-selection cell
# above ("cuda:0" or "cpu"), so this cell also runs on machines without a GPU
# instead of hard-coding GPU index 0.
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Step 1 prompt: yes/no question deciding whether the article is relevant.
classification_instruction = """
Task: Using the following data, is this article about a brucella vaccine? To classify an article as being about a brucella vaccine, you must successfully extract at least some information about the vaccine formulation. This includes details such as the antigen, protein, gene, adjuvant, or vaccine platform mentioned in the abstract.
"""

# Step 2 prompt: field extraction, only run for articles classified as relevant.
annotation_instruction = """
Task: Extract the following details using the given data: Vaccine Introduction,Vaccine Antigen, Vaccine Type, Vaccine Formulation, Host Species Used as Laboratory Animal Model, Experiment Used to investigate the vaccine Ensure each response is based solely on the provided data. Ensure the response is formatted as follows:
"""

# Skeleton appended to the annotation prompt so the model fills in each field.
response_template = """
Response:
Vaccine Introduction:
Vaccine Type:
Vaccine Antigen:
Vaccine Formulation:
Host Species Used as Laboratory Animal Model:
Experiment Used to investigate the vaccine:
"""

# File that receives the prompts and model outputs for every section.
output_file_path = "/content/classification_results.txt"

# The input is read as UTF-8, so write UTF-8 explicitly rather than relying on
# the platform default encoding.
with open(output_file_path, "w", encoding="utf-8") as file:
    # One iteration per separator-delimited section of the input file.
    for section in read_and_split_file("/content/brucellosis_vaccine_papers.txt"):
        # Step 1: Classification
        classification_prompt = f"""
        {classification_instruction}

        Data: {section}
        """

        classification_result = pipeline(
            classification_prompt,
            do_sample=False,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            truncation=True,
            max_new_tokens=10,  # Limit tokens since we only need 'Yes' or 'No'
            # Return only the newly generated text. By default the pipeline
            # echoes the prompt (instruction + article data) in front of the
            # completion, which would make the "yes" test below fire on any
            # article whose text merely contains "yes" (e.g. "eyes",
            # "Bayesian"), regardless of what the model answered.
            return_full_text=False,
        )
        classification_answer = classification_result[0]['generated_text']

        # Decide relevance from the model's answer alone.
        is_brucella = "yes" in classification_answer.lower()
        # Log prompt + answer so the output file keeps the same content as
        # when the pipeline returned the full text.
        file.write(classification_prompt + classification_answer + "\n")
        print("complete classfication")

        if is_brucella:
            # Step 2: Annotation
            annotation_prompt = f"""
            {annotation_instruction}

            Data: {section}

            {response_template}
            """

            annotation_result = pipeline(
                annotation_prompt,
                do_sample=False,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                truncation=True,
                max_new_tokens=256,
            )

            # The full text (prompt followed by the model's annotation) is saved.
            file.write(annotation_result[0]['generated_text'] + "\n")
            file.write("=" * 80 + "\n")
            print("complete annotation")
        else:
            # Record the skipped article together with the reason.
            file.write(f"Data: {section}\n")
            file.write("Article is not about a Brucella vaccine.\n")
            file.write("=" * 80 + "\n")

print(f"Classification results saved to {output_file_path}")

