# In [1]:
import re
import pandas as pd
from openpyxl.utils.exceptions import IllegalCharacterError

# Define a function to extract fields using regex patterns
def extract_field(data, field_name, stop_field=None):
    """
    Return the text that follows ``field_name:`` in *data*.

    The lookup is case-insensitive. Because the pattern runs with
    ``re.DOTALL``, the capture extends from the first ``field_name:`` to the
    end of *data*; surrounding whitespace is then stripped.

    When *stop_field* is truthy, the captured text is cut just before the
    first position where the stop text occurs twice in succession (with
    optional whitespace before and between the two occurrences) and stripped
    again. If that doubled terminator is absent, the full capture is returned.

    Both *field_name* and *stop_field* are interpolated into the regular
    expression unescaped, so any regex metacharacters they contain are
    interpreted as regex syntax.

    Returns None when ``field_name:`` is not present in *data*.
    """
    search_flags = re.DOTALL | re.IGNORECASE
    found = re.search(rf"{field_name}:\s*(.*)", data, search_flags)
    if found is None:
        return None

    value = found.group(1).strip()

    # Without a stop marker the whole remainder is the field value.
    if not stop_field:
        return value

    # NOTE(review): the terminator must appear twice back-to-back; a single
    # occurrence of stop_field does not truncate. Confirm this is intended.
    terminator = re.search(rf"(\s*{stop_field}\s*{stop_field})", value, re.IGNORECASE)
    if terminator is None:
        return value

    return value[:terminator.start()].strip()

def stop_field(data, stop_field):
    """
    Truncate *data* just before the first occurrence of *stop_field*.

    The match is case-insensitive and includes any whitespace immediately
    preceding the stop text, so the returned prefix never ends with that
    whitespace. The result is always stripped. When the stop text is absent,
    the whole of *data* is returned, stripped.

    *stop_field* is interpolated into the regular expression unescaped, so
    regex metacharacters in it are interpreted as regex syntax.

    Returns None when *data* is None, which lets callers chain this directly
    onto a lookup that may have found nothing.
    """
    # Identity check is the idiomatic None test; do it before any regex work.
    if data is None:
        return None

    stop_match = re.search(rf"(\s*{stop_field}\s*)", data, re.IGNORECASE)
    if stop_match is None:
        return data.strip()

    return data[:stop_match.start()].strip()
    
        
def clean_illegal_characters(text):
    """
    Strip control characters from *text* before it is written to Excel.

    The characters removed are those in the ranges ``\\x00-\\x08``,
    ``\\x0B``, ``\\x0C`` and ``\\x0E-\\x1F`` (tab, newline and carriage return
    are kept). When any are present, the original text and the list of
    offending characters are printed so they can be inspected manually.

    Falsy input (None or an empty string) is returned unchanged.
    """
    if not text:
        return text

    scrubber = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')

    offenders = scrubber.findall(text)
    if offenders:
        # Diagnostic output only; the cleaned value is returned either way.
        print("Found illegal characters in the text:")
        print(f"Original text: {text}")
        print(f"Illegal characters: {repr(offenders)}")

    return scrubber.sub('', text)



# Define the function to parse each chunk of output data
def parse_chunk(chunk):
    """
    Parse one record of classifier output text into a dict of cleaned fields.

    Each labelled field is located with ``extract_field`` and cut at the label
    of the field expected to follow it with ``stop_field``. Every value is
    passed through ``clean_illegal_characters`` before being returned. Fields
    that cannot be found are returned as None.

    ``Classification`` is "no" when the chunk contains the sentence
    "Article is not about a Brucella vaccine." and "yes" otherwise; that same
    word is used as the stop marker when extracting the abstract.

    ``Citation`` is the cleaned title, authors and DOI concatenated with no
    separator. Missing parts are skipped, so a record lacking any of them
    yields a shorter (possibly empty) citation instead of raising TypeError.

    Returns a dict keyed by the output column names.
    """
    # Bibliographic header: each value ends where the next label begins.
    pmid = stop_field(extract_field(chunk, "PMID"), "Title")
    title = stop_field(extract_field(chunk, "Title"), "Authors")
    authors = stop_field(extract_field(chunk, "Authors"), "DOI")
    doi = stop_field(extract_field(chunk, "DOI"), "Abstract")

    classification = "no" if "Article is not about a Brucella vaccine." in chunk else "yes"
    # The classification word doubles as the abstract's stop marker.
    abstract = extract_field(chunk, "Abstract", classification)

    # Everything from the (indented) "Vaccine Introduction" label onwards;
    # the vaccine sub-fields are searched for inside this section only.
    allchunk = extract_field(chunk, "             Vaccine Introduction")
    # An empty section makes every sub-field lookup return None, without
    # searching the literal text "None" as str(None) would.
    details = "" if allchunk is None else allchunk

    vaccine_introduction = stop_field(allchunk, "Vaccine Type")
    vaccine_type = stop_field(extract_field(details, "Vaccine Type"), "Vaccine Antigen")
    vaccine_antigen = stop_field(extract_field(details, "Vaccine Antigen"), "Vaccine Formulation")
    vaccine_formulation = stop_field(
        extract_field(details, "Vaccine Formulation"),
        "Host Species Used as Laboratory Animal Model",
    )
    host_species = stop_field(
        extract_field(details, "Host Species Used as Laboratory Animal Model"),
        "Experiment Used",
    )
    experiment_used = stop_field(extract_field(chunk, "Experiment Used"), 'Data: PMID')

    # Clean the citation parts once and reuse them for the Citation column.
    clean_title = clean_illegal_characters(title)
    clean_authors = clean_illegal_characters(authors)
    clean_doi = clean_illegal_characters(doi)
    # Skip None/empty parts so a missing field cannot trigger None + str.
    citation = "".join(part for part in (clean_title, clean_authors, clean_doi) if part)

    return {
        'PMID': clean_illegal_characters(pmid),
        'Title': clean_title,
        'Authors': clean_authors,
        'DOI': clean_doi,
        'Citation': citation,
        'Abstract': clean_illegal_characters(abstract),
        'Classification': clean_illegal_characters(classification),
        'Vaccine Introduction': clean_illegal_characters(vaccine_introduction),
        'Vaccine Type': clean_illegal_characters(vaccine_type),
        'Vaccine Antigen': clean_illegal_characters(vaccine_antigen),
        'Vaccine Formulation': clean_illegal_characters(vaccine_formulation),
        'Host Species Used as Laboratory Animal Model': clean_illegal_characters(host_species),
        'Experiment Used to investigate the vaccine': clean_illegal_characters(experiment_used),
    }

# Function to process the entire text file with multiple chunks
def process_file(file_path, output_file="output_vaccine_data_test.xlsx"):
    """
    Parse a text file of separator-delimited records and export them to Excel.

    The file is read as UTF-8 with undecodable bytes silently dropped
    (``errors='ignore'``), split on the line of ``=`` characters that
    separates records, and each non-blank chunk is parsed with
    ``parse_chunk``. The resulting rows are written to *output_file* without
    an index column.

    Parameters:
        file_path: path of the text file to read.
        output_file: path of the Excel workbook to write; defaults to the
            filename that was previously hard-coded.

    Returns None. An ``IllegalCharacterError`` raised while writing is caught
    and reported with a printed message; other exceptions propagate.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        data = file.read()

    separator = "================================================================================"
    # Strip each chunk once, dropping the blank ones left around separators.
    stripped_chunks = (chunk.strip() for chunk in data.split(separator))
    parsed_data = [parse_chunk(chunk) for chunk in stripped_chunks if chunk]

    # Explicit column list fixes the column order of the workbook.
    columns = [
        'PMID', 'Title', 'Authors', 'DOI', 'Citation', 'Abstract', 'Classification',
        'Vaccine Introduction', 'Vaccine Type', 'Vaccine Antigen', 'Vaccine Formulation',
        'Host Species Used as Laboratory Animal Model', 'Experiment Used to investigate the vaccine',
    ]
    df = pd.DataFrame(parsed_data, columns=columns)

    try:
        df.to_excel(output_file, index=False)
    except IllegalCharacterError as e:
        print(f"Error writing to Excel file: {e}")
    else:
        print(f"Excel file '{output_file}' created successfully.")

# Example usage: run the pipeline on a classification results file.
# NOTE: the path below is specific to one machine, and the call executes as
# soon as this code is run, reading that file and writing an Excel workbook.
file_path = r"/Users/laurel/Downloads/classification_results.txt"
process_file(file_path)

# Output: Excel file 'output_vaccine_data_test.xlsx' created successfully.
