# Defining Types of Articles

In [1]:
systematic_meta_analysis = [
    "Meta-Analysis",
    "Network Meta-Analysis",
    "Systematic Review"
    ]
   

In [2]:
clinical_trial = [
    "Adaptive Clinical Trial",
    "Clinical Study",
    "Clinical Trial",
    "Clinical Trial Protocol",
    "Clinical Trial, Phase I",
    "Clinical Trial, Phase II",
    "Clinical Trial, Phase III",
    "Clinical Trial, Phase IV",
    "Clinical Trial, Veterinary",
    "Controlled Clinical Trial",
    "Equivalence Trial",
    "Multicenter Study",
    "Pragmatic Clinical Trial",
    "Randomized Controlled Trial",
    "Randomized Controlled Trial, Veterinary"
]


In [3]:
observational_studies = [
    "Case Reports",
    "Comparative Study",
    "Evaluation Study",
    "Observational Study",
    "Observational Study, Veterinary",
    "Twin Study",
    "Validation Study",
    "Review",
    "Scientific Integrity Review",
    "Scoping Review"
]


In [4]:
# Initialize an empty list
all_study_types = []

# Extend the list with other lists
all_study_types.extend(systematic_meta_analysis)
all_study_types.extend(clinical_trial)
all_study_types.extend(observational_studies)

allowed_study_types = all_study_types

# Print the combined list
print("Total Study types:",len(all_study_types))




Total Study types: 28


# Loading Data

In [5]:
import pandas as pd

# Read CSV file
df = pd.read_csv('Output/pubmed_hits_raw.csv')  
df.rename(columns={'Search Term': 'Synonyms'}, inplace=True)

# View first 5 rows
df.head()

Unnamed: 0,Root Name,Synonyms,PMID,Title,Article Type,Publication Year,PubMed URL
0,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,40219102,Unlocking the Therapeutic Potential of Patchou...,Journal Article; Review,2025,https://pubmed.ncbi.nlm.nih.gov/40219102/
1,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,39511253,Comparative toxicity of three variant oils and...,Journal Article; Comparative Study,2024,https://pubmed.ncbi.nlm.nih.gov/39511253/
2,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,38708994,Unveiling the phyto-restorative potential of e...,Journal Article; Review,2024,https://pubmed.ncbi.nlm.nih.gov/38708994/
3,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,38623714,Unlocking liver health: Can tackling myosteato...,Journal Article; Review,2024,https://pubmed.ncbi.nlm.nih.gov/38623714/
4,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,38504994,The critical role of toll-like receptor 4 in b...,"Journal Article; Review; Research Support, Non...",2024,https://pubmed.ncbi.nlm.nih.gov/38504994/


In [6]:
df.shape

(3942, 7)

In [6]:
df.groupby(['Root Name'])['Root Name'].count().sort_values().head(25)

Root Name
Ylang Ylang        46
Patchouli          63
Bois de Landes     75
Sandalwood         81
Jasmine           110
Bergamot          145
Orange-Neroli     576
Orange/Neroli     585
Saffron           707
Lavender          742
Linalool          812
Name: Root Name, dtype: int64

# Adding Synonyms

In [10]:
# import numpy as np

# synonyms_df = pd.read_excel("Data/76 Ingredient List.xlsx")
# synonyms_df['Common Name(s)'] = synonyms_df['Common Name(s)'].astype(str).str.replace(',', ';')
# synonyms_df['Other Search Words'] = synonyms_df['Other Search Words'].astype(str).str.replace(',', ';')
# synonyms_df = synonyms_df.replace(['nan', np.nan], '') 
# synonyms_df['Synonyms'] = (
#     synonyms_df['Common Name(s)'].fillna('')
#     .str.strip() + '; ' +
#     synonyms_df['Scientific Name(s)'].fillna('')
#     .str.strip() + '; ' +
#     synonyms_df['Other Search Words'].fillna('')
#     .str.strip()
# )
# synonyms_df.head()

In [11]:
# df_merged = df.merge(
#     synonyms_df[['Ingredient', 'Synonyms']],  # only the columns we need
#     left_on='Root Name',               # key in df
#     right_on='Ingredient',             # key in synonyms_df
#     how='left'                         # keep all rows from df
# )

# # Drop duplicate 'Ingredient' column
# df = df_merged.drop(columns=['Ingredient'])
# df.head()

# Defining Neuraci Type

In [7]:
# Define mapping of NeuraciTypes
priority_mapping = [
    (systematic_meta_analysis,'Systematic Review & Meta Analysis'), # 1st Priority
    (clinical_trial, 'Clinical Trials'),                            # 2nd Priority
    (observational_studies, 'Observational Studies & Reviews'),     # 3rd Priority

]

# Function to assign NeuraciType based on priority
def classify_type(type_list):
    for reference_list, category in priority_mapping:
        if any(item in reference_list for item in type_list):
            return category
    return 'Unclassified'  # optional, in case nothing matches

# Create new column with a list of article types
df['Type_list'] = df['Article Type'].apply(lambda x: set([item.strip() for item in x.split(';')]))


# Apply function to create new column
df['PubMedType'] = df['Type_list'].apply(
    lambda x: ', '.join(x.intersection(set(allowed_study_types))) if isinstance(x, set) else ''
)
df["NeuraciType"] = df['Type_list'].apply(lambda x: pd.Series(classify_type(x)))

df.head()

Unnamed: 0,Root Name,Synonyms,PMID,Title,Article Type,Publication Year,PubMed URL,Type_list,PubMedType,NeuraciType
0,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,40219102,Unlocking the Therapeutic Potential of Patchou...,Journal Article; Review,2025,https://pubmed.ncbi.nlm.nih.gov/40219102/,"{Journal Article, Review}",Review,Observational Studies & Reviews
1,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,39511253,Comparative toxicity of three variant oils and...,Journal Article; Comparative Study,2024,https://pubmed.ncbi.nlm.nih.gov/39511253/,"{Journal Article, Comparative Study}",Comparative Study,Observational Studies & Reviews
2,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,38708994,Unveiling the phyto-restorative potential of e...,Journal Article; Review,2024,https://pubmed.ncbi.nlm.nih.gov/38708994/,"{Journal Article, Review}",Review,Observational Studies & Reviews
3,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,38623714,Unlocking liver health: Can tackling myosteato...,Journal Article; Review,2024,https://pubmed.ncbi.nlm.nih.gov/38623714/,"{Journal Article, Review}",Review,Observational Studies & Reviews
4,Patchouli,Patchouli; patchouli oil; Pogostemon oil; patc...,38504994,The critical role of toll-like receptor 4 in b...,"Journal Article; Review; Research Support, Non...",2024,https://pubmed.ncbi.nlm.nih.gov/38504994/,"{Journal Article, Research Support, Non-U.S. G...",Review,Observational Studies & Reviews


In [8]:
ingredients = ["Patchouli","Sandalwood","Ylang Ylang"]

In [15]:
# no_abstract_pmids = [
#     16942582,
#     18349580,
#     18781101,
#     19901509,
#     27333653,
#     27788889,
#     9536426
# ]

# none_pmids = [
#     23652808,
#     25163405,
#     25704097,
#     26165898,
#     28117719,
#     28460831,
#     28895508,
#     29543548,
#     30541521,
#     31618733,
#     32037737,
#     32407947,
#     32439404,
#     35917731
# ]



In [10]:
from meta_data_generation import process_pmids

#ingredient_df = df[df['Root Name'].isin([39069654])]
ingredient_df = df[df['Root Name'].isin(ingredients)]
#ingredient_df = df[(df['Root Name'].isin(ingredients)) & (df['NeuraciType'] == 'Clinical Trials')]
#ingredient_df = df[(df['Root Name'].isin(ingredients)) & (df['PMID'].isin([10205628, 138507, 15244513, 15453700,39069654]))]
process_pmids(ingredient_df['Root Name'],ingredient_df['Synonyms'],ingredient_df['PMID'], ingredient_df['PubMedType'],output_file="output/test_10|31.json") 

Processing PMIDs:  56%|█████▋    | 107/190 [00:00<00:00, 158.10pmid/s]

No abstract: 8789238
No abstract: 3443530
No abstract: 28150651


Processing PMIDs:  65%|██████▍   | 123/190 [00:03<00:02, 25.58pmid/s] 

No abstract: 6028683


Processing PMIDs:  83%|████████▎ | 157/190 [00:04<00:01, 30.16pmid/s]

No abstract: 11011949


Processing PMIDs:  86%|████████▌ | 163/190 [00:05<00:01, 20.96pmid/s]

No abstract: 8565465


Processing PMIDs: 100%|██████████| 190/190 [00:06<00:00, 28.16pmid/s]

No abstract: 4416336





[{'root_name': 'Patchouli',
  'synonyms': 'Patchouli; patchouli oil; Pogostemon oil; patchouli leaf; Pogostemon cablin, Pogostemon heyneanus, Patchouli terpenes, Pogostemon patchouli',
  'PMID': 40219102,
  'pubmed_type': 'Review',
  'metadata': {'duration_days': 'not mentioned',
   'sample_size': 'not mentioned',
   'sample_gender': ['not mentioned'],
   'species': ['plants'],
   'experimental_model': ['not mentioned'],
   'population': 'not mentioned',
   'study_type': ['not mentioned'],
   'focus': ['primary'],
   'benefits': ['anti-oxidant',
    'anti-inflammatory',
    'antimicrobial',
    'antidepressant',
    'anticancer'],
   'synergies_interactions_positive': ['not mentioned'],
   'synergies_interactions_negative': ['not mentioned'],
   'safety_side_effects': ['not mentioned'],
   'interventions': [],
   'usage': ['not mentioned'],
   'conditions': ['colds',
    'fevers',
    'headaches',
    'inflammation',
    'digestive disorders',
    'insect bites',
    'snake bites'],
  

# Enriching Data

In [19]:
import json

ingredient_df_lookup = ingredient_df.set_index(["PMID",'Root Name'])[["Publication Year","NeuraciType","Synonyms","PubMed URL"]].to_dict(orient="index")


# Read JSON file
with open("output/test_10|31.json", "r", encoding="utf-8") as f:
    json_list = json.load(f)

# Add attributes 
for record in json_list:
    pmid = record["PMID"]
    root_name = record['root_name']
    record['synonyms'] = [s.strip() for s in record['synonyms'].split(';')]
    key = (pmid,root_name)
    if key in ingredient_df_lookup:
        lookup_data = ingredient_df_lookup[key]
        record.setdefault("metadata", {})  # ensure "metadata" exists
        record["metadata"]["published_year"] = lookup_data["Publication Year"]
        record["metadata"]["neuraci_type"] = lookup_data["NeuraciType"]
        #record["metadata"]["synonyms"] = lookup_data["Synonyms"]
        record["metadata"]["url"] = lookup_data["PubMed URL"]
# Save JSON file
with open("output/test_enriched_10|31.json", "w", encoding="utf-8") as f:
    json.dump(json_list, f, indent=2, ensure_ascii=False)

In [21]:
print(len(json_list))

183


# Converting into Structured Data

In [22]:
import json
import pandas as pd

# # --- Step 1: Load JSON file ---
# with open("data.json", "r", encoding="utf-8") as f:
#     records = json.load(f)   # assuming it's a list of JSON objects

rows = []
records = json_list
# --- Step 2: Flatten each record ---
for rec in records:
    pmid = rec.get("PMID")
    root_name = rec.get("root_name")
    meta = rec.get("metadata", {})
    pubmed_type = rec.get("pubmed_type")

    row = {
        "root_name":root_name,
        "PMID": pmid,
        "synonyms":";".join(rec.get('synonyms',[])),
        "experimental_model":"".join(meta.get("experimental_model")),
        "published_year": meta.get("published_year"),
        "neuraci_type": meta.get("neuraci_type"),
        "pubmed_type":pubmed_type,
        "study_type": ";".join(meta.get('study_type')),
        "duration_days": meta.get("duration_days"),
        "sample_size": meta.get("sample_size"),
        "sample_gender": ";".join(meta.get("sample_gender", [])),
        "species": ";".join(meta.get("species", [])),
        "population": meta.get("population"),
        "purpose": meta.get("purpose"),
        "focus": "".join(meta.get("focus")),
        "benefits": ";".join(meta.get("benefits", [])),
        "synergies_interactions_positive": ";".join(meta.get("synergies_interactions_positive", [])),
        "synergies_interactions_negative": ";".join(meta.get("synergies_interactions_negative", [])),
        "safety_side_effects": ";".join(meta.get("safety_side_effects", [])),
        "conditions": ";".join(meta.get("conditions", [])),
        "biomarkers": ";".join(meta.get("biomarkers", [])),
        "symptoms": ";".join(meta.get("symptoms", [])),
        "keywords": ";".join(meta.get("keywords", [])),
        "diseases": ";".join(meta.get("diseases", [])),
        "mechanism": ";".join(meta.get("mechanism", [])),
        "usage": ";".join(meta.get("usage", [])),
        "location":meta.get("location"),
        "conclusion":meta.get("conclusion"),
        "url": meta.get("url", []),
    }

    # --- Expand interventions ---
    for i, iv in enumerate(meta.get("interventions", []), start=1):
        row[f"intervention{i}_ingredient"] = iv.get("ingredient")
        row[f"intervention{i}_daily_dosage"] = iv.get("daily_dosage")
        row[f"intervention{i}_units"] = iv.get("units")
        row[f"intervention{i}_original_text"] = iv.get("original_text")

    # --- Expand outcomes ---
    for j, oc in enumerate(meta.get("outcomes", []), start=1):
        row[f"outcome{j}_name"] = oc.get("name")
        row[f"outcome{j}_domain"] = oc.get("domain")
        row[f"outcome{j}_type"] = oc.get("type")
        row[f"outcome{j}_result"] = oc.get("result")

    rows.append(row)

# --- Step 3: Convert to DataFrame ---
output_df = pd.DataFrame(rows)

# --- Step 4: Save to CSV ---
output_df.to_csv("output/test_10|31.csv", index=False)
output_df.head()

Unnamed: 0,root_name,PMID,synonyms,experimental_model,published_year,neuraci_type,pubmed_type,study_type,duration_days,sample_size,...,outcome31_type,outcome31_result,outcome32_name,outcome32_domain,outcome32_type,outcome32_result,intervention3_ingredient,intervention3_daily_dosage,intervention3_units,intervention3_original_text
0,Patchouli,40219102,Patchouli;patchouli oil;Pogostemon oil;patchou...,not mentioned,2025,Observational Studies & Reviews,Review,not mentioned,not mentioned,not mentioned,...,,,,,,,,,,
1,Patchouli,39511253,Patchouli;patchouli oil;Pogostemon oil;patchou...,in vitro,2024,Observational Studies & Reviews,Comparative Study,not mentioned,not mentioned,not mentioned,...,,,,,,,,,,
2,Patchouli,38708994,Patchouli;patchouli oil;Pogostemon oil;patchou...,not mentioned,2024,Observational Studies & Reviews,Review,not mentioned,not mentioned,not mentioned,...,,,,,,,,,,
3,Patchouli,38623714,Patchouli;patchouli oil;Pogostemon oil;patchou...,in vivoin vitro,2024,Observational Studies & Reviews,Review,review,not mentioned,not mentioned,...,,,,,,,,,,
4,Patchouli,38504994,Patchouli;patchouli oil;Pogostemon oil;patchou...,in vivoin vitro,2024,Observational Studies & Reviews,Review,not mentioned,not mentioned,not mentioned,...,,,,,,,,,,


In [16]:
pd.set_option('display.max_columns', None)