'''

    Vous devez aussi mettre en place (hors de la data pipeline, vous pouvez considérer que c’est une partie
    annexe) une feature permettant de répondre à la problématique suivante :
    • Extraire depuis le json produit par la data pipeline le nom du journal qui mentionne le plus de
    médicaments différents.

'''

## Ad-hoc Analysis Notebook

In [12]:
from pathlib import Path
import json
import pandas as pd

def map_journals_to_drugs(data):
    """Group drugs by the journals that mention them.

    Args:
        data: Mapping of drug name -> details, where ``details["mentions"]
            ["journals"]`` is a list of dicts carrying a ``"name"`` key.

    Returns:
        A dict mapping each non-empty, whitespace-stripped journal name to
        the set of drugs mentioned in that journal.
    """
    journal_to_drugs = {}
    for drug, details in data.items():
        # ``or`` fallbacks also cover keys that are present but null in the
        # JSON; ``dict.get`` defaults only apply to missing keys.
        mentions = details.get("mentions") or {}
        journals = mentions.get("journals") or []
        for journal in journals:
            name = (journal.get("name") or "").strip()
            if name:
                journal_to_drugs.setdefault(name, set()).add(drug)
    return journal_to_drugs


def journals_to_frame(journal_to_drugs):
    """Build one DataFrame row per journal with its distinct-drug count.

    Args:
        journal_to_drugs: Mapping of journal name -> set of drug names.

    Returns:
        A DataFrame with columns ``journal``, ``distinct_drugs`` and
        ``drugs``; it is empty (no columns) when the mapping is empty.
    """
    rows = [
        # Sort the drugs so the output does not depend on set iteration
        # order, which varies between runs for strings.
        {"journal": journal, "distinct_drugs": len(drugs), "drugs": sorted(drugs)}
        for journal, drugs in journal_to_drugs.items()
    ]
    return pd.DataFrame(rows)


def rank_journals(df):
    """Order journals by distinct-drug count, highest first.

    Ties are broken alphabetically on the journal name so that the first
    row is deterministic; a single-column sort with the default algorithm
    is not stable and could return any of the tied journals.

    Args:
        df: Non-empty DataFrame as produced by ``journals_to_frame``.

    Returns:
        A new DataFrame sorted by ``distinct_drugs`` (descending) and then
        ``journal`` (ascending).
    """
    return df.sort_values(["distinct_drugs", "journal"], ascending=[False, True])


# The guard is true when run as a notebook cell or as a script, and keeps the
# file I/O from running when the helpers above are imported.
if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path - adjust when running elsewhere.
    json_path = Path("/Users/zakaria/Development/solution/drug-mentions/src/data/output/drug_mentions.json")
    print("Using JSON file:", json_path)

    if not json_path.exists():
        raise FileNotFoundError(f"File not found: {json_path.resolve()}")

    # Load the pipeline output (drug name -> details).
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    journal_to_drugs = map_journals_to_drugs(data)
    df = journals_to_frame(journal_to_drugs)

    if df.empty:
        print("No results found.")
    else:
        sorted_df = rank_journals(df)

        # First row is the journal with the most distinct drugs.
        top_row = sorted_df.iloc[0]
        top_journal = top_row["journal"]
        drug_count = top_row["distinct_drugs"]

        print(
            f"Le journal qui mentionne le plus de médicaments différents est {top_journal} "
            f"avec {drug_count} médicaments différents : {top_row['drugs']}"
        )

        # Surface ties instead of silently dropping them: every journal
        # sharing the top count, minus the one already reported.
        tied_journals = sorted_df.loc[
            sorted_df["distinct_drugs"] == drug_count, "journal"
        ].tolist()[1:]
        if tied_journals:
            print(f"Autres journaux à égalité avec {drug_count} médicaments différents : {tied_journals}")



Using JSON file: /Users/zakaria/Development/solution/drug-mentions/src/data/output/drug_mentions.json
Le journal qui mentionne le plus de médicaments différents est Psychopharmacology avec 2 médicaments différents : ['TETRACYCLINE', 'ETHANOL']
