In [None]:
# Create a dictionary mapping PMIDs to cell line data
def create_pmid_to_celllines_mapping(gt_data):
    """
    Group cell lines by the PMIDs of their publications.

    Args:
        gt_data: Mapping of cell line id -> cell line record. Only records
            that are dicts holding a list under 'publications' are used, and
            within that list only dict entries that have a 'pmid' key.

    Returns:
        dict mapping each PMID (excluding "Missing" and None) to a list of
        {'cell_line_id': <key>, 'data': <record>} entries. Each cell line
        appears at most once per PMID, in gt_data iteration order. PMIDs are
        keyed in order of first appearance, so the result is reproducible.

    Side effects:
        Prints summary statistics about the mapping to stdout.
    """
    from collections import Counter

    pmid_to_celllines = {}

    # Single pass over the data: every cell line registers itself under each
    # of its PMIDs, instead of rescanning all cell lines once per PMID.
    for cell_line_key, cell_line_data in gt_data.items():
        if not isinstance(cell_line_data, dict):
            continue
        publications = cell_line_data.get('publications')
        if not isinstance(publications, list):
            continue

        # PMIDs already recorded for this cell line, so a repeated
        # publication does not list the same cell line twice.
        pmids_for_this_cell_line = set()
        for pub in publications:
            if not isinstance(pub, dict) or 'pmid' not in pub:
                continue
            pmid = pub['pmid']
            if pmid is None or pmid == "Missing":
                continue
            if pmid in pmids_for_this_cell_line:
                continue
            pmids_for_this_cell_line.add(pmid)
            pmid_to_celllines.setdefault(pmid, []).append({
                'cell_line_id': cell_line_key,
                'data': cell_line_data
            })

    print(f"Found {len(pmid_to_celllines)} unique PMIDs (excluding 'Missing')")

    # Summary statistics
    print("\nPMID to Cell Lines Mapping Summary:")
    print(f"Total unique PMIDs: {len(pmid_to_celllines)}")

    # Distribution of how many cell lines share each PMID
    count_distribution = Counter(len(cell_lines) for cell_lines in pmid_to_celllines.values())

    print("Distribution of cell lines per PMID:")
    for count, frequency in sorted(count_distribution.items()):
        print(f"  {count} cell line(s): {frequency} PMID(s)")

    # Show up to five PMIDs that are shared by several cell lines
    shared_pmids = [
        (pmid, cell_lines)
        for pmid, cell_lines in pmid_to_celllines.items()
        if len(cell_lines) > 1
    ]
    if shared_pmids:
        print("\nExample PMIDs with multiple cell lines:")
        for pmid, cell_lines in shared_pmids[:5]:
            cell_line_ids = [cl['cell_line_id'] for cl in cell_lines]
            print(f"  PMID {pmid}: {cell_line_ids}")

    return pmid_to_celllines


# Create the mapping
pmid_to_celllines = create_pmid_to_celllines_mapping(gt_data)

NameError: name 'gt_data' is not defined

In [None]:
# Create a dictionary mapping PMIDs to cell line data
def create_pmid_to_celllines_mapping(gt_data):
    """
    Group cell lines by the PMIDs of their publications.

    Args:
        gt_data: Mapping of cell line id -> cell line record. Only records
            that are dicts holding a list under 'publications' are used, and
            within that list only dict entries that have a 'pmid' key.

    Returns:
        dict mapping each PMID (excluding "Missing" and None) to a list of
        {'cell_line_id': <key>, 'data': <record>} entries. Each cell line
        appears at most once per PMID, in gt_data iteration order. PMIDs are
        keyed in order of first appearance, so the result is reproducible.

    Side effects:
        Prints summary statistics about the mapping to stdout.
    """
    from collections import Counter

    pmid_to_celllines = {}

    # Single pass over the data: every cell line registers itself under each
    # of its PMIDs, instead of rescanning all cell lines once per PMID.
    for cell_line_key, cell_line_data in gt_data.items():
        if not isinstance(cell_line_data, dict):
            continue
        publications = cell_line_data.get('publications')
        if not isinstance(publications, list):
            continue

        # PMIDs already recorded for this cell line, so a repeated
        # publication does not list the same cell line twice.
        pmids_for_this_cell_line = set()
        for pub in publications:
            if not isinstance(pub, dict) or 'pmid' not in pub:
                continue
            pmid = pub['pmid']
            if pmid is None or pmid == "Missing":
                continue
            if pmid in pmids_for_this_cell_line:
                continue
            pmids_for_this_cell_line.add(pmid)
            pmid_to_celllines.setdefault(pmid, []).append({
                'cell_line_id': cell_line_key,
                'data': cell_line_data
            })

    print(f"Found {len(pmid_to_celllines)} unique PMIDs (excluding 'Missing')")

    # Summary statistics
    print("\nPMID to Cell Lines Mapping Summary:")
    print(f"Total unique PMIDs: {len(pmid_to_celllines)}")

    # Distribution of how many cell lines share each PMID
    count_distribution = Counter(len(cell_lines) for cell_lines in pmid_to_celllines.values())

    print("Distribution of cell lines per PMID:")
    for count, frequency in sorted(count_distribution.items()):
        print(f"  {count} cell line(s): {frequency} PMID(s)")

    # Show up to five PMIDs that are shared by several cell lines
    shared_pmids = [
        (pmid, cell_lines)
        for pmid, cell_lines in pmid_to_celllines.items()
        if len(cell_lines) > 1
    ]
    if shared_pmids:
        print("\nExample PMIDs with multiple cell lines:")
        for pmid, cell_lines in shared_pmids[:5]:
            cell_line_ids = [cl['cell_line_id'] for cl in cell_lines]
            print(f"  PMID {pmid}: {cell_line_ids}")

    return pmid_to_celllines

# Create the mapping
pmid_to_celllines = create_pmid_to_celllines_mapping(gt_data)

In [None]:
# Remove duplicate publications based on PMID
def remove_duplicate_publications(gt_data, placeholder_pmids=("Missing", None)):
    """
    Remove duplicate publication objects within each cell line's publications
    list, based on the pmid field.

    Publications whose pmid is a placeholder (by default "Missing" or None)
    are never treated as duplicates of each other: a placeholder does not
    identify a paper, so collapsing them would silently drop distinct
    publications and their journal information.

    Args:
        gt_data: Mapping of cell line id -> cell line record. Records are
            modified in place; only dict records with a list under
            'publications' are touched.
        placeholder_pmids: PMID values that mean "no PMID known". Pass an
            empty tuple to dedupe on every pmid value, placeholders included.

    Returns:
        The same gt_data object, after in-place deduplication.

    Side effects:
        Prints each duplicate found and a summary to stdout.
    """
    duplicates_found = 0
    total_publications_before = 0
    total_publications_after = 0

    for cell_line_key, cell_line_data in gt_data.items():
        # Skip records that have no usable publications list
        if not isinstance(cell_line_data, dict):
            continue
        publications = cell_line_data.get('publications')
        if not isinstance(publications, list):
            continue

        original_count = len(publications)
        total_publications_before += original_count

        # Real PMIDs already kept for this cell line
        seen_pmids = set()
        unique_publications = []

        for pub in publications:
            if not isinstance(pub, dict) or 'pmid' not in pub:
                # Keep publications without a PMID field
                unique_publications.append(pub)
                continue

            pmid = pub['pmid']
            if pmid in placeholder_pmids:
                # Placeholder PMIDs carry no identity; keep every such entry
                unique_publications.append(pub)
            elif pmid not in seen_pmids:
                seen_pmids.add(pmid)
                unique_publications.append(pub)
            else:
                print(f"Duplicate PMID found in {cell_line_key}: {pmid}")
                duplicates_found += 1

        # Update the publications list
        cell_line_data['publications'] = unique_publications
        total_publications_after += len(unique_publications)

        # Report if duplicates were removed for this cell line
        if len(unique_publications) < original_count:
            print(f"  Removed {original_count - len(unique_publications)} duplicate(s) from {cell_line_key}")

    print("\nSummary:")
    print(f"Total duplicates found and removed: {duplicates_found}")
    print(f"Total publications before: {total_publications_before}")
    print(f"Total publications after: {total_publications_after}")
    print(f"Publications reduced by: {total_publications_before - total_publications_after}")

    return gt_data

# Apply the deduplication
ground_truth_data = remove_duplicate_publications(ground_truth_data)

In [None]:
import json
import os
from pathlib import Path
from pprint import pprint

# Load all JSON files from the ground_truth directory into a dictionary
ground_truth_dir = Path("ground_truth")
gt_data = {}

# Each record is keyed by the part of the file name before the first
# underscore (the whole stem when there is none) - presumably the cell line id.
# Files are visited in sorted order so the load order, and which file wins on
# a key collision, is the same on every run and every filesystem.
for json_file in sorted(ground_truth_dir.glob("*.json")):
    file_stem = json_file.stem.split("_")[0]
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            record = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes
        print(f"Error loading {json_file}: {e}")
        continue

    if file_stem in gt_data:
        # Two files share the same prefix; make the overwrite visible
        print(f"Warning: {json_file} replaces an earlier record with key {file_stem!r}")
    gt_data[file_stem] = record

print(f"\nTotal files loaded: {len(gt_data)}")
pprint(list(gt_data.keys())[:10])

In [None]:
pprint(gt_data["AIBNi001-A"])

In [None]:
# Remove duplicate publications based on PMID
def remove_duplicate_publications(gt_data, placeholder_pmids=("Missing", None)):
    """
    Remove duplicate publication objects within each cell line's publications
    list, based on the pmid field.

    Publications whose pmid is a placeholder (by default "Missing" or None)
    are never treated as duplicates of each other: a placeholder does not
    identify a paper, so collapsing them would silently drop distinct
    publications and their journal information.

    Args:
        gt_data: Mapping of cell line id -> cell line record. Records are
            modified in place; only dict records with a list under
            'publications' are touched.
        placeholder_pmids: PMID values that mean "no PMID known". Pass an
            empty tuple to dedupe on every pmid value, placeholders included.

    Returns:
        The same gt_data object, after in-place deduplication.

    Side effects:
        Prints each duplicate found and a summary to stdout.
    """
    duplicates_found = 0
    total_publications_before = 0
    total_publications_after = 0

    for cell_line_key, cell_line_data in gt_data.items():
        # Skip records that have no usable publications list
        if not isinstance(cell_line_data, dict):
            continue
        publications = cell_line_data.get('publications')
        if not isinstance(publications, list):
            continue

        original_count = len(publications)
        total_publications_before += original_count

        # Real PMIDs already kept for this cell line
        seen_pmids = set()
        unique_publications = []

        for pub in publications:
            if not isinstance(pub, dict) or 'pmid' not in pub:
                # Keep publications without a PMID field
                unique_publications.append(pub)
                continue

            pmid = pub['pmid']
            if pmid in placeholder_pmids:
                # Placeholder PMIDs carry no identity; keep every such entry
                unique_publications.append(pub)
            elif pmid not in seen_pmids:
                seen_pmids.add(pmid)
                unique_publications.append(pub)
            else:
                print(f"Duplicate PMID found in {cell_line_key}: {pmid}")
                duplicates_found += 1

        # Update the publications list
        cell_line_data['publications'] = unique_publications
        total_publications_after += len(unique_publications)

        # Report if duplicates were removed for this cell line
        if len(unique_publications) < original_count:
            print(f"  Removed {original_count - len(unique_publications)} duplicate(s) from {cell_line_key}")

    print("\nSummary:")
    print(f"Total duplicates found and removed: {duplicates_found}")
    print(f"Total publications before: {total_publications_before}")
    print(f"Total publications after: {total_publications_after}")
    print(f"Publications reduced by: {total_publications_before - total_publications_after}")

    return gt_data

# Apply the deduplication
gt_data = remove_duplicate_publications(gt_data)

In [None]:
# Print every cell line whose one and only publication has a "Missing" PMID.
# Records without a publications list, or with malformed entries, are skipped
# instead of raising KeyError/TypeError.
for cell_line_key, cell_line_data in gt_data.items():
    publications = cell_line_data.get("publications") if isinstance(cell_line_data, dict) else None
    if not isinstance(publications, list) or len(publications) != 1:
        continue
    only_pub = publications[0]
    if isinstance(only_pub, dict) and only_pub.get("pmid") == "Missing":
        print(cell_line_key)

In [None]:
 # Create a dictionary mapping PMIDs to cell line data
def create_pmid_to_celllines_mapping(gt_data):
    """
    Group cell lines by the PMIDs of their publications.

    Args:
        gt_data: Mapping of cell line id -> cell line record. Only records
            that are dicts holding a list under 'publications' are used, and
            within that list only dict entries that have a 'pmid' key.

    Returns:
        dict mapping each PMID (excluding "Missing" and None) to a list of
        {'cell_line_id': <key>, 'data': <record>} entries. Each cell line
        appears at most once per PMID, in gt_data iteration order. PMIDs are
        keyed in order of first appearance, so the result is reproducible.

    Side effects:
        Prints summary statistics about the mapping to stdout.
    """
    from collections import Counter

    pmid_to_celllines = {}

    # Single pass over the data: every cell line registers itself under each
    # of its PMIDs, instead of rescanning all cell lines once per PMID.
    for cell_line_key, cell_line_data in gt_data.items():
        if not isinstance(cell_line_data, dict):
            continue
        publications = cell_line_data.get('publications')
        if not isinstance(publications, list):
            continue

        # PMIDs already recorded for this cell line, so a repeated
        # publication does not list the same cell line twice.
        pmids_for_this_cell_line = set()
        for pub in publications:
            if not isinstance(pub, dict) or 'pmid' not in pub:
                continue
            pmid = pub['pmid']
            if pmid is None or pmid == "Missing":
                continue
            if pmid in pmids_for_this_cell_line:
                continue
            pmids_for_this_cell_line.add(pmid)
            pmid_to_celllines.setdefault(pmid, []).append({
                'cell_line_id': cell_line_key,
                'data': cell_line_data
            })

    print(f"Found {len(pmid_to_celllines)} unique PMIDs (excluding 'Missing')")

    # Summary statistics
    print("\nPMID to Cell Lines Mapping Summary:")
    print(f"Total unique PMIDs: {len(pmid_to_celllines)}")

    # Distribution of how many cell lines share each PMID
    count_distribution = Counter(len(cell_lines) for cell_lines in pmid_to_celllines.values())

    print("Distribution of cell lines per PMID:")
    for count, frequency in sorted(count_distribution.items()):
        print(f"  {count} cell line(s): {frequency} PMID(s)")

    # Show up to five PMIDs that are shared by several cell lines
    shared_pmids = [
        (pmid, cell_lines)
        for pmid, cell_lines in pmid_to_celllines.items()
        if len(cell_lines) > 1
    ]
    if shared_pmids:
        print("\nExample PMIDs with multiple cell lines:")
        for pmid, cell_lines in shared_pmids[:5]:
            cell_line_ids = [cl['cell_line_id'] for cl in cell_lines]
            print(f"  PMID {pmid}: {cell_line_ids}")

    return pmid_to_celllines

# Create the mapping
pmid_to_celllines = create_pmid_to_celllines_mapping(gt_data)

In [None]:
# Journal list per cell line
#
# Maps every cell line id to the journals named by its publications, one entry
# per publication that carries usable journal information (a non-empty string
# other than "Missing"). Cell lines without any journal information are kept
# with an empty list, so they drop out of the later "exactly one" and
# "more than one" selections.
# I need to know, and want to report on, which journal each cell line came from.
cell_line_journals = dict()

for cell_line_key, cell_line_data in gt_data.items():
    # Tolerate records with no publications list instead of raising KeyError
    publications = cell_line_data.get("publications") if isinstance(cell_line_data, dict) else None
    if not isinstance(publications, list):
        publications = []

    cell_line_journals[cell_line_key] = [
        pub["journal"]
        for pub in publications
        if isinstance(pub, dict)
        and isinstance(pub.get("journal"), str)
        and pub["journal"] not in ("", "Missing")
    ]


# TODO: include the hypothesis that structured reporting improves the performance of the curation process.

In [None]:
# Cell lines that have more than one journal entry in cell_line_journals

more_than_one_journal = []
for cell_line, journals in cell_line_journals.items():
    if len(journals) > 1:
        more_than_one_journal.append(cell_line)

print(len(more_than_one_journal))



In [None]:
from collections import Counter
from pprint import pprint

# Cell lines that have exactly one journal entry in cell_line_journals
exactly_one_journal = [cell_line for cell_line, journals in cell_line_journals.items() if len(journals) == 1]
print(len(exactly_one_journal))

# One pass over those cell lines collects, ignoring case:
#   unique_journals  - the set of lower-cased journal names
#   journal_name_map - lower-cased name -> first original spelling seen (for display)
#   journal_counts   - lower-cased name -> number of cell lines from that journal
unique_journals = set()
journal_name_map = dict()
journal_counts = Counter()

for cell_line in exactly_one_journal:
    # Each selected cell line has exactly one journal entry by construction
    journal = cell_line_journals[cell_line][0]
    journal_lower = journal.lower()
    unique_journals.add(journal_lower)
    journal_name_map.setdefault(journal_lower, journal)
    journal_counts[journal_lower] += 1

print(len(unique_journals))

# Display the unique journals in their original case (first encountered),
# sorted so the listing is the same on every run
pprint([journal_name_map[j] for j in sorted(unique_journals)])


In [None]:

# Report how many cell lines were counted for each (lower-cased) journal
for journal in journal_counts:
    print(f"{journal}: {journal_counts[journal]}")


113 cell lines from SCR
28 cell lines from non-SCR


Why did you only take cell lines which were associated with exactly one journal?
Because I could not tell which journal article the data in the registry was associated with. I needed to be sure that the data came from a specific publication in a specific journal, so that I could run the automated curation on that publication and be sure that I had the right ground truth to compare my results to.

In [None]:
pprint(exactly_one_journal)

In [None]:
# Collect the PMID of the publication that supplied each cell line's journal.
# That publication is not necessarily the first one in the list, so it is
# looked up by its journal field rather than taken from index 0.
ground_truth_pmids = []
for cell_line in exactly_one_journal:
    journal_pub = next(
        (
            pub
            for pub in gt_data[cell_line]["publications"]
            if isinstance(pub, dict)
            and isinstance(pub.get("journal"), str)
            and pub["journal"] not in ("", "Missing")
        ),
        None,
    )
    if journal_pub is None:
        continue

    pmid = journal_pub.get("pmid")
    # Skip publications that have no real PMID
    if pmid is not None and pmid != "Missing":
        ground_truth_pmids.append(pmid)

pprint(ground_truth_pmids)
print(len(ground_truth_pmids))


# These are the pmids I need to run the automated curation on.



In [None]:

# Also get the journal name for the pmid

# Pair each PMID with the journal of the same publication. The publication is
# selected by its journal field (it need not be the first in the list), so the
# PMID and journal always come from the same entry and the journal is never
# "Missing".
ground_truth_pmid_journals = []
for cell_line in exactly_one_journal:
    journal_pub = next(
        (
            pub
            for pub in gt_data[cell_line]["publications"]
            if isinstance(pub, dict)
            and isinstance(pub.get("journal"), str)
            and pub["journal"] not in ("", "Missing")
        ),
        None,
    )
    if journal_pub is None:
        continue

    pmid = journal_pub.get("pmid")
    # Skip publications that have no real PMID
    if pmid is not None and pmid != "Missing":
        ground_truth_pmid_journals.append({"pmid": pmid, "journal": journal_pub["journal"]})

pprint(ground_truth_pmid_journals)
print(len(ground_truth_pmid_journals))

Alright. I got the files I need to curate. Now I need to curate them.

# Test 1. Curate a single file.


# Test 2. Curate every file.




Things to watch out for:
- When batch curating, I need a report of the curation session.
- Professional logging, written to a log file.
- Let's do it.


What needs to be written in the report about this:
- Report on the curation instructions and how they were used.

I also need to write the scoring function next: tomorrow, after everything else is done.
I also need to write the Nature news essay, but I don't particularly care about the marks.