# Defining Types of Articles

In [23]:
systematic_meta_analysis = [
    "Meta-Analysis",
    "Network Meta-Analysis",
    "Systematic Review"
    ]
   

In [24]:
clinical_trial = [
    "Adaptive Clinical Trial",
    "Clinical Study",
    "Clinical Trial",
    "Clinical Trial Protocol",
    "Clinical Trial, Phase I",
    "Clinical Trial, Phase II",
    "Clinical Trial, Phase III",
    "Clinical Trial, Phase IV",
    "Clinical Trial, Veterinary",
    "Controlled Clinical Trial",
    "Equivalence Trial",
    "Multicenter Study",
    "Pragmatic Clinical Trial",
    "Randomized Controlled Trial",
    "Randomized Controlled Trial, Veterinary"
]


In [25]:
observational_studies = [
    "Case Reports",
    "Comparative Study",
    "Evaluation Study",
    "Observational Study",
    "Observational Study, Veterinary",
    "Twin Study",
    "Validation Study",
    "Review",
    "Scientific Integrity Review",
    "Scoping Review"
]


In [26]:
# Initialize an empty list
all_study_types = []

# Extend the list with other lists
all_study_types.extend(systematic_meta_analysis)
all_study_types.extend(clinical_trial)
all_study_types.extend(observational_studies)

allowed_study_types = all_study_types

# Print the combined list
print("Total Study types:",len(all_study_types))




Total Study types: 28


# Loading Data

In [27]:
import pandas as pd

# Read Excel file
df = pd.read_excel('Data/entrez_ingredients_with_synonyms.xlsx')  # For .xls or .xlsx

# View first 5 rows
df.head()

Unnamed: 0,Root Name,PMID,Title,Article Type,Publication Year,PubMed URL,Search Term(s)
0,Acetyl L-Carnitine,10022226,Micronutrients prevent cancer and delay aging.,"Journal Article; Research Support, Non-U.S. Go...",1998,https://pubmed.ncbi.nlm.nih.gov/10022226/,Carnitine
1,Acetyl L-Carnitine,10030388,Significance of skeletal muscle properties on ...,Comparative Study; Journal Article; Research S...,1999,https://pubmed.ncbi.nlm.nih.gov/10030388/,Carnitine
2,Acetyl L-Carnitine,10036643,Cellular dysmetabolism: the dark side of HIV-1...,Journal Article; Review,1996,https://pubmed.ncbi.nlm.nih.gov/10036643/,Carnitine
3,Acetyl L-Carnitine,10052020,Substrate utilization and work efficiency duri...,Clinical Trial; Controlled Clinical Trial; Jou...,1999,https://pubmed.ncbi.nlm.nih.gov/10052020/,Carnitine
4,Acetyl L-Carnitine,10067662,L-carnitine improves glucose disposal in type ...,Clinical Trial; Controlled Clinical Trial; Jou...,1999,https://pubmed.ncbi.nlm.nih.gov/10067662/,Carnitine


In [28]:
df.shape

(277397, 7)

# Adding Synonyms

In [29]:
import numpy as np

synonyms_df = pd.read_excel("Data/76 Ingredient List.xlsx")
synonyms_df['Common Name(s)'] = synonyms_df['Common Name(s)'].astype(str).str.replace(',', ';')
synonyms_df['Other Search Words'] = synonyms_df['Other Search Words'].astype(str).str.replace(',', ';')
synonyms_df = synonyms_df.replace(['nan', np.nan], '') 
synonyms_df['Synonyms'] = (
    synonyms_df['Common Name(s)'].fillna('')
    .str.strip() + '; ' +
    synonyms_df['Scientific Name(s)'].fillna('')
    .str.strip() + '; ' +
    synonyms_df['Other Search Words'].fillna('')
    .str.strip()
)
synonyms_df.head()

Unnamed: 0,Ingredient,Common Name(s),Scientific Name(s),Other Search Words,Synonyms
0,Ashwagandha,Ashwagandha; Indian Ginseng; Winter Cherry,Withania somnifera,Withanolide; Withanolides,Ashwagandha; Indian Ginseng; Winter Cherry; Wi...
1,Chamomile,Chamomile; English Chamomile; German Chamomile...,Matricaria chamomilla; Chamaemelum nobile; Mat...,,Chamomile; English Chamomile; German Chamomile...
2,GABA,GABA,Gamma-Aminobutyric Acid,,GABA; Gamma-Aminobutyric Acid;
3,Glycine,Glycine; L-Glycine,Aminoacetic Acid,,Glycine; L-Glycine; Aminoacetic Acid;
4,Lemon Balm,Melissa; Balm Mint; Lemon Balm Extract,Melissa officinalis,,Melissa; Balm Mint; Lemon Balm Extract; Meliss...


In [30]:
df_merged = df.merge(
    synonyms_df[['Ingredient', 'Synonyms']],  # only the columns we need
    left_on='Root Name',               # key in df
    right_on='Ingredient',             # key in synonyms_df
    how='left'                         # keep all rows from df
)

# Drop duplicate 'Ingredient' column
df = df_merged.drop(columns=['Ingredient'])
df.head()

Unnamed: 0,Root Name,PMID,Title,Article Type,Publication Year,PubMed URL,Search Term(s),Synonyms
0,Acetyl L-Carnitine,10022226,Micronutrients prevent cancer and delay aging.,"Journal Article; Research Support, Non-U.S. Go...",1998,https://pubmed.ncbi.nlm.nih.gov/10022226/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...
1,Acetyl L-Carnitine,10030388,Significance of skeletal muscle properties on ...,Comparative Study; Journal Article; Research S...,1999,https://pubmed.ncbi.nlm.nih.gov/10030388/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...
2,Acetyl L-Carnitine,10036643,Cellular dysmetabolism: the dark side of HIV-1...,Journal Article; Review,1996,https://pubmed.ncbi.nlm.nih.gov/10036643/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...
3,Acetyl L-Carnitine,10052020,Substrate utilization and work efficiency duri...,Clinical Trial; Controlled Clinical Trial; Jou...,1999,https://pubmed.ncbi.nlm.nih.gov/10052020/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...
4,Acetyl L-Carnitine,10067662,L-carnitine improves glucose disposal in type ...,Clinical Trial; Controlled Clinical Trial; Jou...,1999,https://pubmed.ncbi.nlm.nih.gov/10067662/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...


# Defining Neuraci Type

In [31]:
# Define mapping of NeuraciTypes
priority_mapping = [
    (systematic_meta_analysis,'Systematic Review & Meta Analysis'), # 1st Priority
    (clinical_trial, 'Clinical Trials'),                            # 2nd Priority
    (observational_studies, 'Observational Studies & Reviews'),     # 3rd Priority

]

# Function to assign NeuraciType based on priority
def classify_type(type_list):
    for reference_list, category in priority_mapping:
        if any(item in reference_list for item in type_list):
            return category
    return 'Unclassified'  # optional, in case nothing matches

# Function to assign NeuraciType based on priority and return the matched item
# def classify_type(type_list):
#     for reference_list, category in priority_mapping:
#         for item in type_list:
#             if item in reference_list:
#                 return category, item  # Return both category and matched item
#     return 'Unclassified', None  # Default if nothing matches


# Create new column with a list of article types
df['Type_list'] = df['Article Type'].apply(lambda x: set([item.strip() for item in x.split(';')]))


# Apply function to create new column
#df['PubMedType'] = df['Type_list'].apply(lambda x: x.intersection(set(allowed_study_types)))
df['PubMedType'] = df['Type_list'].apply(
    lambda x: ', '.join(x.intersection(set(allowed_study_types))) if isinstance(x, set) else ''
)
df["NeuraciType"] = df['Type_list'].apply(lambda x: pd.Series(classify_type(x)))

df.head()

Unnamed: 0,Root Name,PMID,Title,Article Type,Publication Year,PubMed URL,Search Term(s),Synonyms,Type_list,PubMedType,NeuraciType
0,Acetyl L-Carnitine,10022226,Micronutrients prevent cancer and delay aging.,"Journal Article; Research Support, Non-U.S. Go...",1998,https://pubmed.ncbi.nlm.nih.gov/10022226/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...,"{Journal Article, Review, Research Support, No...",Review,Observational Studies & Reviews
1,Acetyl L-Carnitine,10030388,Significance of skeletal muscle properties on ...,Comparative Study; Journal Article; Research S...,1999,https://pubmed.ncbi.nlm.nih.gov/10030388/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...,"{Comparative Study, Research Support, Non-U.S....",Comparative Study,Observational Studies & Reviews
2,Acetyl L-Carnitine,10036643,Cellular dysmetabolism: the dark side of HIV-1...,Journal Article; Review,1996,https://pubmed.ncbi.nlm.nih.gov/10036643/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...,"{Journal Article, Review}",Review,Observational Studies & Reviews
3,Acetyl L-Carnitine,10052020,Substrate utilization and work efficiency duri...,Clinical Trial; Controlled Clinical Trial; Jou...,1999,https://pubmed.ncbi.nlm.nih.gov/10052020/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...,"{Clinical Trial, Controlled Clinical Trial, Re...","Clinical Trial, Controlled Clinical Trial",Clinical Trials
4,Acetyl L-Carnitine,10067662,L-carnitine improves glucose disposal in type ...,Clinical Trial; Controlled Clinical Trial; Jou...,1999,https://pubmed.ncbi.nlm.nih.gov/10067662/,Carnitine,Carnitine; ALCAR; Acetylcarnitine; Acetyl L Ca...,"{Clinical Trial, Controlled Clinical Trial, Jo...","Clinical Trial, Controlled Clinical Trial",Clinical Trials


In [None]:
df[df['Root Name'] == 'Marjoram']

In [32]:
ingredients = ["Cedarwood","Marjoram"]


In [None]:
no_abstract_pmids = [
    16942582,
    18349580,
    18781101,
    19901509,
    27333653,
    27788889,
    9536426
]

none_pmids = [
    23652808,
    25163405,
    25704097,
    26165898,
    28117719,
    28460831,
    28895508,
    29543548,
    30541521,
    31618733,
    32037737,
    32407947,
    32439404,
    35917731
]



In [33]:
from UPDATED_meta_data_generation import process_pmids

#ingredient_df = df[df['Root Name'].isin([10473175])]
ingredient_df = df[df['Root Name'].isin(ingredients)]
#ingredient_df = df[(df['Root Name'].isin(ingredients)) & (df['NeuraciType'] == 'Clinical Trials')]
#ingredient_df = df[(df['Root Name'].isin(ingredients)) & (df['PMID'].isin([37656114]))]
process_pmids(ingredient_df['Root Name'],ingredient_df['Search Term(s)'],ingredient_df['Synonyms'],ingredient_df['PMID'], ingredient_df['PubMedType'],output_file="output/Cedarwood_Marjoram_10|13.json") 

Processing PMIDs:   0%|          | 0/1 [00:00<?, ?pmid/s]

Synonyms Cedarwood; Cedrus; Cedar wood;Cedar-wood
Screening of Turkish anti-ulcerogenic folk remedies for anti-Helicobacter pylori activity.The anti-Helicobacter pylori effect of the extracts and fractions obtained from seven Turkish plants, which are used in folk medicine for the treatment of gastric ailments including peptic ulcers, were studied against one standard strain and eight clinical isolates of H. pylori by using the agar dilution method. Flowers of Cistus laurifolius and Spartium junceum, cones of Cedrus libani, herbs and flowers of Centaurea solstitialis ssp. solstitialis, fruits of Momordica charantia, herbaceous parts of Sambucus ebulus, and flowering herbs of Hypericum perforatum were evaluated in this study. Results showed that all except one extract from six of these plants showed activity against the microorganism with MICs between 1.95 and 250 microg/ml, with S. junceum being the only inactive species. Amongst the active plants the inhibitory properties of C. laurif

Processing PMIDs: 100%|██████████| 1/1 [00:12<00:00, 12.79s/pmid]


[{'root_name': 'Cedarwood',
  'search_term': 'Cedrus',
  'synonyms': 'Cedarwood; Cedrus; Cedar wood;Cedar-wood',
  'PMID': 10473175,
  'pubmed_type': 'Comparative Study',
  'metadata': {'duration_days': 'not mentioned',
   'sample_size': 'not mentioned',
   'sample_gender': ['not mentioned'],
   'species': ['microorganisms'],
   'experimental_model': ['in vitro'],
   'population': 'turkish plants used in folk medicine for gastric ailments including peptic ulcers',
   'study_type': ['not mentioned'],
   'focus': ['secondary'],
   'benefits': ['anti-helicobacter pylori activity'],
   'synergies_interactions_positive': ['not mentioned'],
   'synergies_interactions_negative': ['not mentioned'],
   'safety_side_effects': ['not mentioned'],
   'interventions': [{'ingredient': 'cedrus libani',
     'daily_dosage': None,
     'units': '',
     'original_text': 'cones of cedrus libani'}],
   'usage': ['not mentioned'],
   'conditions': ['peptic ulcers'],
   'biomarkers': [],
   'functions': [],

# Enriching Data

In [34]:
import json

ingredient_df_lookup = ingredient_df.set_index(["PMID",'Root Name'])[["Publication Year","NeuraciType","Synonyms","PubMed URL"]].to_dict(orient="index")


# Read JSON file
with open("output/cedarwood_Marjoram_10|13.json", "r", encoding="utf-8") as f:
    json_list = json.load(f)

# Add attributes 
for record in json_list:
    pmid = record["PMID"]
    root_name = record['root_name']
    key = (pmid,root_name)
    if key in ingredient_df_lookup:
        lookup_data = ingredient_df_lookup[key]
        record.setdefault("metadata", {})  # ensure "metadata" exists
        record["metadata"]["published_year"] = lookup_data["Publication Year"]
        record["metadata"]["neuraci_type"] = lookup_data["NeuraciType"]
        #record["metadata"]["synonyms"] = lookup_data["Synonyms"]
        record["metadata"]["url"] = lookup_data["PubMed URL"]

# Save JSON file
with open("output/cedarwood_Marjoram_enriched.json", "w", encoding="utf-8") as f:
    json.dump(json_list, f, indent=2, ensure_ascii=False)

# Converting into Structured Data

In [35]:
import json
import pandas as pd

# # --- Step 1: Load JSON file ---
# with open("data.json", "r", encoding="utf-8") as f:
#     records = json.load(f)   # assuming it's a list of JSON objects

rows = []
records = json_list
# --- Step 2: Flatten each record ---
for rec in records:
    pmid = rec.get("PMID")
    root_name = rec.get("root_name")
    search_term = rec.get("search_term")
    meta = rec.get("metadata", {})
    pubmed_type = rec.get("pubmed_type")
    synonyms = rec.get('synonyms')

    row = {
        "root_name":root_name,
        "Search Term(s)":search_term,
        "PMID": pmid,
        "synonyms":synonyms,
        "experimental_model":meta.get("experimental_model"),
        "published_year": meta.get("published_year"),
        "neuraci_type": meta.get("neuraci_type"),
        "pubmed_type":pubmed_type,
        "study_type": ";".join(meta.get('study_type')),
        "duration_days": meta.get("duration_days"),
        "sample_size": meta.get("sample_size"),
        "sample_gender": ";".join(meta.get("sample_gender", [])),
        "species": ";".join(meta.get("species", [])),
        "population": meta.get("population"),
        "purpose": meta.get("purpose"),
        "focus": "".join(meta.get("focus")),
        "benefits": ";".join(meta.get("benefits", [])),
        "synergies_interactions_positive": ";".join(meta.get("synergies_interactions_positive", [])),
        "synergies_interactions_negative": ";".join(meta.get("synergies_interactions_negative", [])),
        "safety_side_effects": ";".join(meta.get("safety_side_effects", [])),
        "conditions": ";".join(meta.get("conditions", [])),
        "biomarkers": ";".join(meta.get("biomarkers", [])),
        "symptoms": ";".join(meta.get("symptoms", [])),
        "keywords": ";".join(meta.get("keywords", [])),
        "diseases": ";".join(meta.get("diseases", [])),
        "mechanism": ";".join(meta.get("mechanism", [])),
        "usage": ";".join(meta.get("usage", [])),
        "location":meta.get("location"),
        "conclusion":meta.get("conclusion"),
        "url": meta.get("url", []),
    }

    # --- Expand interventions ---
    for i, iv in enumerate(meta.get("interventions", []), start=1):
        row[f"intervention{i}_ingredient"] = iv.get("ingredient")
        row[f"intervention{i}_daily_dosage"] = iv.get("daily_dosage")
        row[f"intervention{i}_units"] = iv.get("units")
        row[f"intervention{i}_original_text"] = iv.get("original_text")

    # --- Expand outcomes ---
    for j, oc in enumerate(meta.get("outcomes", []), start=1):
        row[f"outcome{j}_name"] = oc.get("name")
        row[f"outcome{j}_domain"] = oc.get("domain")
        row[f"outcome{j}_type"] = oc.get("type")
        row[f"outcome{j}_result"] = oc.get("result")

    rows.append(row)

# --- Step 3: Convert to DataFrame ---
output_df = pd.DataFrame(rows)

# --- Step 4: Save to CSV ---
output_df.to_csv("output/Cedarwood_Marjoram_10|13.csv", index=False)
output_df.head()

Unnamed: 0,root_name,Search Term(s),PMID,synonyms,experimental_model,published_year,neuraci_type,pubmed_type,study_type,duration_days,sample_size,sample_gender,species,population,purpose,focus,benefits,synergies_interactions_positive,synergies_interactions_negative,safety_side_effects,conditions,biomarkers,symptoms,keywords,diseases,mechanism,usage,location,conclusion,url,intervention1_ingredient,intervention1_daily_dosage,intervention1_units,intervention1_original_text,outcome1_name,outcome1_domain,outcome1_type,outcome1_result
0,Cedarwood,Cedrus,10473175,Cedarwood; Cedrus; Cedar wood;Cedar-wood,[in vitro],1999,Observational Studies & Reviews,Comparative Study,not mentioned,not mentioned,not mentioned,not mentioned,microorganisms,turkish plants used in folk medicine for gastr...,to study the anti-helicobacter pylori effect o...,secondary,anti-helicobacter pylori activity,not mentioned,not mentioned,not mentioned,peptic ulcers,,,anti-helicobacter pylori;turkish plants;folk m...,peptic ulcer,,not mentioned,,results showed that all except one extract fro...,https://pubmed.ncbi.nlm.nih.gov/10473175/,cedrus libani,,,cones of cedrus libani,anti-helicobacter pylori activity,condition,primary,improved


In [None]:
pd.set_option('display.max_columns', None)

In [None]:
output_df.head()