Data transformed into question-answer pairs.py

In [55]:
import pandas as pd
import ast

def generate_qa_pairs(row):
    """Function to generate QA pairs for a given dataframe row, with handling for missing data."""
    qa_pairs = []
    # Default answer for missing information
    default_answer = "I'm sorry. Perhaps you can seek other avenues to obtain more accurate information."

    # Adding symptom question
    if pd.notna(row["Symptom"]):
        symptoms = ', '.join(ast.literal_eval(row["Symptom"]))  # Cleans up the symptom list
        symptom_answer = f'You may experience {symptoms}.'
    else:
        symptom_answer = default_answer
    qa_pairs.append({
        'question': f'What are the symptoms of {row["Disease"]}?',
        'answer': symptom_answer
    })

    # Add question and answer pairs, checking for missing data
    description_answer = row['Description'] if pd.notna(row["Description"]) else default_answer
    qa_pairs.append({
        'question': f'What is {row["Disease"]}?','answer': description_answer
    })

    qa_pairs.append({
        'question': f'Tell me about {row["Disease"]}.','answer': description_answer
    })

    if pd.notna(row["Precaution"]):
        precautions = ', '.join(ast.literal_eval(row["Precaution"]))
        precaution_answer = f'To prevent {row["Disease"]}, it is recommended to {precautions}.'
    else:
        precaution_answer = default_answer
    qa_pairs.append({How can I prevent {row["Disease"]}?
        'question': f'',
        'answer': precaution_answer
    })

    # Check if drug_rating is not NaN and add drug question
    if pd.notna(row['drug_rating']):
        #drugs = format_drug_list(row['drug_rating'])
        drug_answer = f'Based on user ratings, recommended medications for this condition, sorted by their ratings, are: {row["drug_rating"]}.'
    else:
        drug_answer = default_answer
    qa_pairs.append({
        'question': f'What medications should I take for {row["Disease"]}?','answer': drug_answer
    })

    return qa_pairs

# Load the dataset
df = pd.read_csv('./processed_merged_output.csv')

# Apply the function to each row and expand list of lists into a flat list
qa_data = df.apply(generate_qa_pairs, axis=1).explode().tolist()

# Convert list of dictionaries to DataFrame
qa_df = pd.DataFrame(qa_data)

# Optionally, save the DataFrame to a new CSV file
qa_df.to_csv('./qa_dataset.csv', index=False)


In [50]:
import pandas as pd
import ast

def format_drug_list(drug_rating_str):
    try:
        # Convert the string representation into a list of tuples using literal_eval
        drugs = ast.literal_eval(drug_rating_str)
        # Format each tuple into a readable string and sort them by the rating
        return ', '.join(f"{drug[0]} (rated {drug[1]}/10)" for drug in sorted(drugs, key=lambda x: x[1], reverse=True))
    except (ValueError, SyntaxError):
        # Return a default message if there is a parsing error
        return "No medication information available."

def generate_qa_pairs(row):
    """Function to generate QA pairs for a given dataframe row, with handling for missing data."""
    qa_pairs = []
    default_answer = "I'm sorry, I don't have that information."

    # Handling symptoms
    if pd.notna(row["Symptom"]):
        symptoms = ', '.join(ast.literal_eval(row["Symptom"]))
        symptom_answer = f'You may experience {symptoms}.'
    else:
        symptom_answer = default_answer

    qa_pairs.append({
        'question': f'What are the symptoms of {row["Disease"]}',
        'answer': symptom_answer
    })

    # Handling description
    description_answer = row['Description'] if pd.notna(row["Description"]) else default_answer
    qa_pairs.append({
        'question': f'What is {row["Disease"]}',
        'answer': description_answer
    })

    # Handling drug recommendations
    if pd.notna(row['drug_rating']):
        formatted_drugs = format_drug_list(row['drug_rating'])
        print(row['drug_rating'])
        print(formatted_drugs)
        drug_answer = f'Based on user ratings, recommended medications for this condition, sorted by their ratings, are: {formatted_drugs}.'
    else:
        drug_answer = default_answer

    qa_pairs.append({
        'question': f'What medications should I take for {row["Disease"]}',
        'answer': drug_answer
    })

    return qa_pairs

# Load the dataset
df = pd.read_csv('./processed_merged_output.csv')

# Apply the function to each row and expand list of lists into a flat list
qa_data = df.apply(generate_qa_pairs, axis=1).explode().tolist()

# Convert list of dictionaries to DataFrame
qa_df = pd.DataFrame(qa_data)

# Optionally, save the DataFrame to a new CSV file
qa_df.to_csv('./qa_dataset.csv', index=False)


["('cephalexin', 8.4)", "('isotretinoin', 8.0)", "('benzoyl peroxide', 7.8)", "('tretinoin', 7.7)", "('minocycline', 7.533333333333334)", "('tetracycline', 7.1000000000000005)", "('clindamycin', 6.6)", "('doxycycline', 6.6)", "('spironolactone', 6.333333333333333)", "('adapalene', 6.3)", "('sulfamethoxazole / trimethoprim', 5.475)"]
( (rated '/10), ( (rated '/10), ( (rated '/10), ( (rated '/10), ( (rated '/10), ( (rated '/10), ( (rated '/10), ( (rated '/10), ( (rated '/10), ( (rated '/10), ( (rated '/10)
["('ampicillin', 10.0)", "('ceftriaxone', 7.966666666666667)", "('doxycycline', 6.6)"]
( (rated '/10), ( (rated '/10), ( (rated '/10)
["('lamivudine', 6.5)"]
( (rated '/10)
["('clindamycin', 6.6)", "('doxycycline', 6.6)", "('hydroxychloroquine', 6.1)"]
( (rated '/10), ( (rated '/10), ( (rated '/10)
["('aspirin / butalbital / caffeine', 9.2)", "('almotriptan', 9.0)", "('naratriptan', 8.4)", "('eletriptan', 8.3)", "('rizatriptan', 8.3)", "('cyclobenzaprine', 7.7)", "('sumatriptan', 7.6)"

In [38]:
import pandas as pd
import ast


def generate_qa_pairs(row):
    qa_pairs = []
    default_answer = "I'm sorry, I don't have that information."

    symptoms = ', '.join(ast.literal_eval(row["Symptom"])) if pd.notna(row["Symptom"]) else "symptoms data not available"
    qa_pairs.append({
        'question': f'What are the symptoms of {row["Disease"]}',
        'answer': f'You may experience {symptoms}.'
    })

    description = row['Description'] if pd.notna(row["Description"]) else default_answer
    qa_pairs.append({
        'question': f'What is {row["Disease"]}',
        'answer': description
    })

    qa_pairs.append({
        'question': f'Tell me about {row["Disease"]}',
        'answer': description
    })

    precaution = ', '.join(ast.literal_eval(row["Precaution"])) if pd.notna(row["Precaution"]) else "precaution data not available"
    qa_pairs.append({
        'question': f'How can I prevent {row["Disease"]}',
        'answer': f'To prevent {row["Disease"]}, it is recommended to {precaution}.'
    })

    if pd.notna(row['drug_rating']):
        formatted_drugs = format_drug_list(row['drug_rating'])
        qa_pairs.append({
            'question': f'What medications should I take for {row["Disease"]}',
            'answer': f'Based on user ratings, recommended medications for this condition, sorted by their ratings, are: {formatted_drugs}.'
        })
    else:
        qa_pairs.append({
            'question': f'What medications should I take for {row["Disease"]}',
            'answer': default_answer
        })

    return qa_pairs

# Assuming your DataFrame is already loaded correctly
qa_data = df.apply(generate_qa_pairs, axis=1).explode().tolist()

qa_df = pd.DataFrame(qa_data)
qa_df.to_csv('./qa_dataset.csv', index=False)

In [13]:
import pandas as pd
import ast
import re

def parse_list_string(list_string):
    """Parse a string representation of a list into a list."""
    try:
        # Safely evaluate the string as a list
        return ast.literal_eval(list_string)
    except (ValueError, SyntaxError):
        # If there's an error in evaluation, return an empty list
        return []

def list_to_natural_language(lst):
    """Convert a list into a natural language string."""
    if not lst:
        return "no specific information is available"
    if len(lst) == 1:
        return lst[0]
    return ', '.join(lst[:-1]) + ', and ' + lst[-1]

def parse_drug_ratings(drug_rating_string):
    """Parse and format drug ratings into a natural language string."""
    try:
        # Convert the string representation of a list into an actual list of tuples
        drugs_with_ratings = ast.literal_eval(drug_rating_string)
        # Formatting the string output
        formatted_drugs = ", ".join([f"{drug[0]} (rated {drug[1]}/10)" for drug in drugs_with_ratings])
        return f"Based on user ratings, recommended medications for this condition, sorted by their ratings, are: {formatted_drugs}."
    except (ValueError, SyntaxError) as e:
        # In case of evaluation error, return a message indicating the problem
        return f"An error occurred while processing the medication data: {e}"


def generate_qa_pairs(row):
    """Function to generate QA pairs for a given dataframe row, with handling for missing data."""
    qa_pairs = []
    default_answer = "I'm sorry. Perhaps you can seek other avenues to obtain more accurate information."

    # Description
    description = row['Description'] if pd.notna(row['Description']) else default_answer

    # Symptoms
    symptoms = parse_list_string(row['Symptom'])
    symptoms_text = list_to_natural_language(symptoms)

    # Precautions
    precautions = parse_list_string(row['Precaution'])
    precautions_text = list_to_natural_language(precautions)

    # Drug ratings
    drugs_text = parse_drug_ratings(row['drug_rating']) if pd.notna(row['drug_rating']) else default_answer

    # Add QA pairs
    qa_pairs.extend([
        {'question': f'What is {row["Disease"]}?', 'answer': description},
        {'question': f'What are the symptoms of {row["Disease"]}?','answer': f'Common symptoms may include {symptoms_text}.'},
        {'question': f'How can I prevent {row["Disease"]}?','answer': f'To prevent {row["Disease"]}, it is recommended to {precautions_text}.'},
        #{'question': f'What medications should I take for {row["Disease"]}?','answer': f'For {row["Disease"]}, the following medications are suggested: {drugs_text}.'},
    ])
    # Question and answer for medications
    if pd.notna(row['drug_rating']):
        drug_answer = parse_drug_ratings(row['drug_rating'])
        if drug_answer:
            qa_pairs.append({
                'question': f'What medications should I take for {row["Disease"]}?','answer': drug_answer
            })
        else:
            qa_pairs.append({
                'question': f'What medications should I take for {row["Disease"]}?','answer': default_answer
            })

    return qa_pairs

# Load the dataset
df = pd.read_csv('./processed_merged_output.csv')

# Apply the function to each row and expand list of lists into a flat list
qa_data = df.apply(generate_qa_pairs, axis=1).explode().tolist()

# Convert list of dictionaries to DataFrame
qa_df = pd.DataFrame(qa_data)

# Optionally, save the DataFrame to a new CSV file
qa_df.to_csv('qa_dataset-new.csv', index=False)


In [17]:
import pandas as pd
import ast


def parse_drug_ratings(drug_rating_str):
    # Convert the string representation of list of tuples into an actual list of tuples
    try:
        drug_rating_list = ast.literal_eval(drug_rating_str)
        # Sort the list by ratings in descending order
        sorted_drug_rating_list = sorted(drug_rating_list, key=lambda x: x[1], reverse=True)
        # Format the list into a readable string
        formatted_drug_list = ', '.join([f"{drug[0]} (rated {drug[1]}/10)" for drug in sorted_drug_rating_list])
        return formatted_drug_list
    except ValueError:
        # Handle the case where the string is not a valid list of tuples
        return "No medication information available."

def generate_qa_pairs(row):
    """Function to generate QA pairs for a given dataframe row, with handling for missing data."""
    qa_pairs = []
    default_answer = "I'm sorry, I don't have that information."

    # Description QA pair
    description = row['Description'] if pd.notna(row['Description']) else default_answer
    qa_pairs.append({
        'question': f'What is {row["Disease"]}?',
        'answer': description
    })

    # Symptom QA pair
    if pd.notna(row['Symptom']):
        symptoms_list = ast.literal_eval(row['Symptom'])
        formatted_symptoms = ', '.join(symptoms_list)
        qa_pairs.append({
            'question': f'What are the symptoms of {row["Disease"]}?','answer': f'The symptoms of {row["Disease"]} may include {formatted_symptoms}.'
        })
    else:
        qa_pairs.append({
            'question': f'What are the symptoms of {row["Disease"]}?','answer': default_answer
        })

    # Precaution QA pair
    if pd.notna(row['Precaution']):
        precautions_list = ast.literal_eval(row['Precaution'])
        formatted_precautions = ', '.join(precautions_list)
        qa_pairs.append({
            'question': f'How can I prevent {row["Disease"]}?','answer': f'To prevent {row["Disease"]}, you should {formatted_precautions}.'
        })
    else:
        qa_pairs.append({
            'question': f'How can I prevent {row["Disease"]}?','answer': default_answer
        })

    # Medication QA pair
    if pd.notna(row['drug_rating']):
        drug_answer = parse_drug_ratings(row['drug_rating'])
        if drug_answer:
            qa_pairs.append({
                'question': f'What medications should I take for {row["Disease"]}?','answer': drug_answer
            })
        else:
            qa_pairs.append({
                'question': f'What medications should I take for {row["Disease"]}?','answer': default_answer
            })
    else:
        qa_pairs.append({
            'question': f'What medications should I take for {row["Disease"]}?','answer': default_answer
        })

    return qa_pairs

# Read the dataset
df = pd.read_csv('./processed_merged_output.csv')

# Apply the function to each row
qa_data = df.apply(generate_qa_pairs, axis=1).explode().tolist()

# Convert list of dictionaries to DataFrame
qa_df = pd.DataFrame(qa_data)

# Save the DataFrame to a new CSV file
qa_df.to_csv('qa_dataset-1.csv', index=False)