In [None]:
# Create a dictionary mapping PMIDs to cell line data
def create_pmid_to_celllines_mapping(gt_data):
    """
    Create a dictionary where:
    - Keys: PMIDs (excluding "Missing")
    - Values: List of cell line data objects that have publications with that PMID
    """
    
    # Step 1: Get all unique PMIDs (excluding "Missing")
    all_pmids = set()
    
    for cell_line_key, cell_line_data in gt_data.items():
        if 'publications' in cell_line_data and isinstance(cell_line_data['publications'], list):
            for pub in cell_line_data['publications']:
                if isinstance(pub, dict) and 'pmid' in pub:
                    pmid = pub['pmid']
                    if pmid != "Missing" and pmid is not None:
                        all_pmids.add(pmid)
    
    print(f"Found {len(all_pmids)} unique PMIDs (excluding 'Missing')")
    
    # Step 2: Create mapping from PMID to cell lines
    pmid_to_celllines = {}
    
    for pmid in all_pmids:
        pmid_to_celllines[pmid] = []
        
        # Find all cell lines that have this PMID
        for cell_line_key, cell_line_data in gt_data.items():
            if 'publications' in cell_line_data and isinstance(cell_line_data['publications'], list):
                # Check if this cell line has a publication with this PMID
                has_pmid = False
                for pub in cell_line_data['publications']:
                    if isinstance(pub, dict) and 'pmid' in pub and pub['pmid'] == pmid:
                        has_pmid = True
                        break
                
                if has_pmid:
                    # Add the entire cell line data object
                    pmid_to_celllines[pmid].append({
                        'cell_line_id': cell_line_key,
                        'data': cell_line_data
                    })
    
    # Step 3: Print summary statistics
    print(f"\nPMID to Cell Lines Mapping Summary:")
    print(f"Total unique PMIDs: {len(pmid_to_celllines)}")
    
    # Show distribution of how many cell lines per PMID
    cellline_counts = [len(cell_lines) for cell_lines in pmid_to_celllines.values()]
    from collections import Counter
    count_distribution = Counter(cellline_counts)
    
    print(f"Distribution of cell lines per PMID:")
    for count, frequency in sorted(count_distribution.items()):
        print(f"  {count} cell line(s): {frequency} PMID(s)")
    
    # Show examples of PMIDs with multiple cell lines
    multi_cellline_pmids = {pmid: cell_lines for pmid, cell_lines in pmid_to_celllines.items() if len(cell_lines) > 1}
    if multi_cellline_pmids:
        print(f"\nExample PMIDs with multiple cell lines:")
        for pmid, cell_lines in list(multi_cellline_pmids.items())[:5]:
            cell_line_ids = [cl['cell_line_id'] for cl in cell_lines]
            print(f"  PMID {pmid}: {cell_line_ids}")
    
    return pmid_to_celllines

# Create the mapping
pmid_to_celllines = create_pmid_to_celllines_mapping(gt_data)

In [None]:
# Create a dictionary mapping PMIDs to cell line data
def create_pmid_to_celllines_mapping(gt_data):
    """
    Create a dictionary where:
    - Keys: PMIDs (excluding "Missing")
    - Values: List of cell line data objects that have publications with that PMID
    """
    
    # Step 1: Get all unique PMIDs (excluding "Missing")
    all_pmids = set()
    
    for cell_line_key, cell_line_data in gt_data.items():
        if 'publications' in cell_line_data and isinstance(cell_line_data['publications'], list):
            for pub in cell_line_data['publications']:
                if isinstance(pub, dict) and 'pmid' in pub:
                    pmid = pub['pmid']
                    if pmid != "Missing" and pmid is not None:
                        all_pmids.add(pmid)
    
    print(f"Found {len(all_pmids)} unique PMIDs (excluding 'Missing')")
    
    # Step 2: Create mapping from PMID to cell lines
    pmid_to_celllines = {}
    
    for pmid in all_pmids:
        pmid_to_celllines[pmid] = []
        
        # Find all cell lines that have this PMID
        for cell_line_key, cell_line_data in gt_data.items():
            if 'publications' in cell_line_data and isinstance(cell_line_data['publications'], list):
                # Check if this cell line has a publication with this PMID
                has_pmid = False
                for pub in cell_line_data['publications']:
                    if isinstance(pub, dict) and 'pmid' in pub and pub['pmid'] == pmid:
                        has_pmid = True
                        break
                
                if has_pmid:
                    # Add the entire cell line data object
                    pmid_to_celllines[pmid].append({
                        'cell_line_id': cell_line_key,
                        'data': cell_line_data
                    })
    
    # Step 3: Print summary statistics
    print(f"\nPMID to Cell Lines Mapping Summary:")
    print(f"Total unique PMIDs: {len(pmid_to_celllines)}")
    
    # Show distribution of how many cell lines per PMID
    cellline_counts = [len(cell_lines) for cell_lines in pmid_to_celllines.values()]
    from collections import Counter
    count_distribution = Counter(cellline_counts)
    
    print(f"Distribution of cell lines per PMID:")
    for count, frequency in sorted(count_distribution.items()):
        print(f"  {count} cell line(s): {frequency} PMID(s)")
    
    # Show examples of PMIDs with multiple cell lines
    multi_cellline_pmids = {pmid: cell_lines for pmid, cell_lines in pmid_to_celllines.items() if len(cell_lines) > 1}
    if multi_cellline_pmids:
        print(f"\nExample PMIDs with multiple cell lines:")
        for pmid, cell_lines in list(multi_cellline_pmids.items())[:5]:
            cell_line_ids = [cl['cell_line_id'] for cl in cell_lines]
            print(f"  PMID {pmid}: {cell_line_ids}")
    
    return pmid_to_celllines

# Create the mapping
pmid_to_celllines = create_pmid_to_celllines_mapping(gt_data)

In [None]:
# Remove duplicate publications based on PMID
def remove_duplicate_publications(gt_data):
    """
    Remove duplicate publication objects within each cell line's publications list
    based on the pmid field.
    """
    duplicates_found = 0
    total_publications_before = 0
    total_publications_after = 0
    
    for cell_line_key, cell_line_data in gt_data.items():
        # Check if this cell line has publications
        if 'publications' in cell_line_data and isinstance(cell_line_data['publications'], list):
            publications = cell_line_data['publications']
            original_count = len(publications)
            total_publications_before += original_count
            
            # Track seen PMIDs to identify duplicates
            seen_pmids = set()
            unique_publications = []
            
            for pub in publications:
                if isinstance(pub, dict) and 'pmid' in pub:
                    pmid = pub['pmid']
                    if pmid not in seen_pmids:
                        seen_pmids.add(pmid)
                        unique_publications.append(pub)
                    else:
                        print(f"Duplicate PMID found in {cell_line_key}: {pmid}")
                        duplicates_found += 1
                else:
                    # Keep publications without PMID field
                    unique_publications.append(pub)
            
            # Update the publications list
            cell_line_data['publications'] = unique_publications
            total_publications_after += len(unique_publications)
            
            # Report if duplicates were removed for this cell line
            if len(unique_publications) < original_count:
                print(f"  Removed {original_count - len(unique_publications)} duplicate(s) from {cell_line_key}")
    
    print(f"\nSummary:")
    print(f"Total duplicates found and removed: {duplicates_found}")
    print(f"Total publications before: {total_publications_before}")
    print(f"Total publications after: {total_publications_after}")
    print(f"Publications reduced by: {total_publications_before - total_publications_after}")
    
    return gt_data

# Apply the deduplication
ground_truth_data = remove_duplicate_publications(ground_truth_data)

In [9]:
import json
import os
from pathlib import Path
from pprint import pprint

# Load all JSON files from ground_truth directory into a dictionary
ground_truth_dir = Path("ground_truth")
gt_data = {}    

# Read each JSON file and store with filename stem as key
for json_file in ground_truth_dir.glob("*.json"):
    file_stem = json_file.stem.split("_")[0]  # Gets filename without extension
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            gt_data[file_stem] = json.load(f)
    except Exception as e:
        print(f"Error loading {json_file}: {e}")

print(f"\nTotal files loaded: {len(gt_data)}")
pprint(list(gt_data.keys())[:10])


Total files loaded: 315
['VCCRIi013-A',
 'MNZTASi002-A',
 'MCRIi001-A-5',
 'LEIi008-A',
 'LEIi012-B',
 'UQi004-A',
 'UOWi007-A',
 'VCCRIi029-A',
 'MNZTASi020-B',
 'AIBNi005-A']


In [12]:
pprint(gt_data["AIBNi001-A"])

{'basic_data': [{'cell_line_alt_name': 'GENIE 1',
                 'cell_type': 'hiPSC',
                 'frozen': 'True'}],
 'contact': [{'e_mail': ' e.wolvetang@uq.edu.au',
              'first_name': 'Ernst',
              'group': 'Wolvetang-AIBN',
              'last_name': 'Wolvetang',
              'name_initials': 'J',
              'phone_number': 'Missing'}],
 'culture_medium': [{'co2_concentration': '0.05',
                     'o2_concentration': 'Missing',
                     'passage_method': 'EF'}],
 'differentiation_results': [{'cell_type': 'EN',
                              'description': '',
                              'marker_list': 'FOXA2; SOX17',
                              'method_used': 'RT-qPCR',
                              'show_potency': 'True'},
                             {'cell_type': 'ME',
                              'description': '',
                              'marker_list': 'HAND1; RUNX1',
                              'method_used': 'RT-

In [13]:
# Remove duplicate publications based on PMID
def remove_duplicate_publications(gt_data):
    """
    Remove duplicate publication objects within each cell line's publications list
    based on the pmid field.
    """
    duplicates_found = 0
    total_publications_before = 0
    total_publications_after = 0
    
    for cell_line_key, cell_line_data in gt_data.items():
        # Check if this cell line has publications
        if 'publications' in cell_line_data and isinstance(cell_line_data['publications'], list):
            publications = cell_line_data['publications']
            original_count = len(publications)
            total_publications_before += original_count
            
            # Track seen PMIDs to identify duplicates
            seen_pmids = set()
            unique_publications = []
            
            for pub in publications:
                if isinstance(pub, dict) and 'pmid' in pub:
                    pmid = pub['pmid']
                    if pmid not in seen_pmids:
                        seen_pmids.add(pmid)
                        unique_publications.append(pub)
                    else:
                        print(f"Duplicate PMID found in {cell_line_key}: {pmid}")
                        duplicates_found += 1
                else:
                    # Keep publications without PMID field
                    unique_publications.append(pub)
            
            # Update the publications list
            cell_line_data['publications'] = unique_publications
            total_publications_after += len(unique_publications)
            
            # Report if duplicates were removed for this cell line
            if len(unique_publications) < original_count:
                print(f"  Removed {original_count - len(unique_publications)} duplicate(s) from {cell_line_key}")
    
    print(f"\nSummary:")
    print(f"Total duplicates found and removed: {duplicates_found}")
    print(f"Total publications before: {total_publications_before}")
    print(f"Total publications after: {total_publications_after}")
    print(f"Publications reduced by: {total_publications_before - total_publications_after}")
    
    return gt_data

# Apply the deduplication
gt_data = remove_duplicate_publications(gt_data)

Duplicate PMID found in LEIi008-A: 30611018
  Removed 1 duplicate(s) from LEIi008-A
Duplicate PMID found in LEIi012-B: 32810830
  Removed 1 duplicate(s) from LEIi012-B
Duplicate PMID found in UOWi007-A: 32006803
  Removed 1 duplicate(s) from UOWi007-A
Duplicate PMID found in AIBNi005-A: 34649201
  Removed 1 duplicate(s) from AIBNi005-A
Duplicate PMID found in HPIi004-B: 35728440
  Removed 1 duplicate(s) from HPIi004-B
Duplicate PMID found in MCRIi001-A: Missing
Duplicate PMID found in MCRIi001-A: Missing
Duplicate PMID found in MCRIi001-A: Missing
Duplicate PMID found in MCRIi001-A: Missing
Duplicate PMID found in MCRIi001-A: Missing
  Removed 5 duplicate(s) from MCRIi001-A
Duplicate PMID found in GENEAe012-A: 18386991
  Removed 1 duplicate(s) from GENEAe012-A
Duplicate PMID found in MCRIi019-A: 33002832
Duplicate PMID found in MCRIi019-A: 33316599
Duplicate PMID found in MCRIi019-A: 34543885
  Removed 3 duplicate(s) from MCRIi019-A
Duplicate PMID found in MCRIi001-A-4: 32771907
  Remo

In [None]:
for cell_line_key, cell_line_data in gt_data.items():
    if len(cell_line_data["publications"]) == 1 and cell_line_data['publications'][0]['pmid'] == "Missing":
        print(cell_line_key)

VCCRIi013-A
MCRIi001-A-5
UQi004-A
VCCRIi029-A
MNZTASi020-B
MNZTASi025-A
MUCCSi003-A
ESIBIe003-A
MCRIi028-A-1
MCRIi019-A-4
VCCRIi028-A
FINi005-A
UOWi004-A
MCRIi028-A
MNZTASi024-A
VCCRIi031-A
MNZTASi032-A
MUCCSi004-A
VCCRIi014-A
MNZTASi023-A
VCCRIi012-A
MNZTASi020-A
MCRIi031-A-3
MNZTASi030-A
UQi008-A
UTSWi003-A-3
VCCRIi004-A
UQi009-A
MNZTASi014-A
VCCRIi027-A
UOMELBi001-A-4
MNZTASi029-A
MCRIi019-A-3
MUCCSi002-A
LEIi020-A
VCCRIi035-A
MCRIi033-A
VCCRIi036-A
VCCRIi025-A
WIMRi001-A
VCCRIi017-A
MNZTASi011-A
UOWi009-A
VCCRIi009-A
UOWi003-A-1
UTSWi001-A-2
WIMRi003-A
MUCCSi001-A
VCCRIi006-A
VCCRIi030-A
VCCRIi019-A
HPIi011-A
CIAUi002-A
FINi006-A
VCCRIi011-A
MNZTASi016-A
MNZTASi010-A
VCCRIi018-A
UTSWi002-A-2
UQi003-A
FINi004-A
UTSWi002-A-3
MNZTASi034-B
MCRIi031-A
UOMELBi001-A-2
MNZTASi015-C
UOMELBi001-A
UTSWi003-A-2
SCSe001-A
MCRIi010-A-2
MNZTASi028-A
MNZTASi017-A
WIMRi004-A
UOMELBi001-A-3
VCCRIi032-A
MNZTASi009-B
LEIi019-A
VCCRIi008-A
MNZTASi034-A
MNZTASi015-B
MCRIi001-D
MNZTASi018-A
MNZTASi015-A


In [18]:
 # Create a dictionary mapping PMIDs to cell line data
def create_pmid_to_celllines_mapping(gt_data):
    """
    Create a dictionary where:
    - Keys: PMIDs (excluding "Missing")
    - Values: List of cell line data objects that have publications with that PMID
    """

    # Step 1: Get all unique PMIDs (excluding "Missing")
    all_pmids = set()

    for cell_line_key, cell_line_data in gt_data.items():
        if 'publications' in cell_line_data and isinstance(cell_line_data['publications'], list):
            for pub in cell_line_data['publications']:
                if isinstance(pub, dict) and 'pmid' in pub:
                    pmid = pub['pmid']
                    if pmid != "Missing" and pmid is not None:
                        all_pmids.add(pmid)

    print(f"Found {len(all_pmids)} unique PMIDs (excluding 'Missing')")

    # Step 2: Create mapping from PMID to cell lines
    pmid_to_celllines = {}

    for pmid in all_pmids:
        pmid_to_celllines[pmid] = []

        # Find all cell lines that have this PMID
        for cell_line_key, cell_line_data in gt_data.items():
            if 'publications' in cell_line_data and isinstance(cell_line_data['publications'], list):
                # Check if this cell line has a publication with this PMID
                has_pmid = False
                for pub in cell_line_data['publications']:
                    if isinstance(pub, dict) and 'pmid' in pub and pub['pmid'] == pmid:
                        has_pmid = True
                        break

                if has_pmid:
                    # Add the entire cell line data object
                    pmid_to_celllines[pmid].append({
                        'cell_line_id': cell_line_key,
                        'data': cell_line_data
                    })

    # Step 3: Print summary statistics
    print(f"\nPMID to Cell Lines Mapping Summary:")
    print(f"Total unique PMIDs: {len(pmid_to_celllines)}")

    # Show distribution of how many cell lines per PMID
    cellline_counts = [len(cell_lines) for cell_lines in pmid_to_celllines.values()]
    from collections import Counter
    count_distribution = Counter(cellline_counts)

    print(f"Distribution of cell lines per PMID:")
    for count, frequency in sorted(count_distribution.items()):
        print(f"  {count} cell line(s): {frequency} PMID(s)")

    # Show examples of PMIDs with multiple cell lines
    multi_cellline_pmids = {pmid: cell_lines for pmid, cell_lines in pmid_to_celllines.items() if len(cell_lines) > 1}
    if multi_cellline_pmids:
        print(f"\nExample PMIDs with multiple cell lines:")
        for pmid, cell_lines in list(multi_cellline_pmids.items())[:5]:
            cell_line_ids = [cl['cell_line_id'] for cl in cell_lines]
            print(f"  PMID {pmid}: {cell_line_ids}")

    return pmid_to_celllines

# Create the mapping
pmid_to_celllines = create_pmid_to_celllines_mapping(gt_data)

Found 115 unique PMIDs (excluding 'Missing')

PMID to Cell Lines Mapping Summary:
Total unique PMIDs: 115
Distribution of cell lines per PMID:
  1 cell line(s): 45 PMID(s)
  2 cell line(s): 37 PMID(s)
  3 cell line(s): 11 PMID(s)
  4 cell line(s): 8 PMID(s)
  5 cell line(s): 5 PMID(s)
  6 cell line(s): 4 PMID(s)
  7 cell line(s): 2 PMID(s)
  8 cell line(s): 1 PMID(s)
  9 cell line(s): 1 PMID(s)
  11 cell line(s): 1 PMID(s)

Example PMIDs with multiple cell lines:
  PMID 36355287: ['MCRIi001-A', 'MCRIi001-A-4', 'MCRIi001-A-3']
  PMID 33497524: ['LEIi004-A', 'LEIi004-A-1']
  PMID 33002832: ['MCRIi019-A', 'MCRIi019-A-2']
  PMID 32446218: ['MCRIi001-A', 'MCRIi001-A-2', 'MCRIi001-B']
  PMID 36166872: ['AIBNi015-A', 'AIBNi017-A', 'AIBNi016-A', 'AIBNi018-A']


In [None]:
# Cell lines journals lists

# It gives a list of cell lines and journals they are published in, where we have the journal information
# So we are excluding cell lines which don't have journal information. 
# I need to know and want to report on which journal it came from.
cell_line_journals = dict()

for cell_line_key, cell_line_data in gt_data.items():
    journals = []
    for pub in cell_line_data["publications"]:
        if "journal" in pub and pub["journal"] != "Missing":
            journals.append(pub["journal"])
    cell_line_journals[cell_line_key] = journals


# Should include the hypothesis about structured reporting increases the performance of the curation process..

In [31]:
# Cell lines which appear in more than one journal

more_than_one_journal = [cell_line for cell_line, journals in cell_line_journals.items() if len(journals) > 1]

print(len(more_than_one_journal))



37


In [44]:
# Cell lines which appear in exactly one journal
exactly_one_journal = [cell_line for cell_line, journals in cell_line_journals.items() if len(journals) == 1]   
print(len(exactly_one_journal))
 

# Get the set of unique journal names for cell lines that appear in exactly one journal, ignoring case
unique_journals = set()
journal_name_map = dict()  # maps lowercased journal name to original for display

for cell_line in exactly_one_journal:
    journals = cell_line_journals[cell_line]
    if journals:  # should always be length 1, but check anyway
        journal_lower = journals[0].lower()
        unique_journals.add(journal_lower)
        # Store the first encountered original-case version for display
        if journal_lower not in journal_name_map:
            journal_name_map[journal_lower] = journals[0]

print(len(unique_journals))

# Display the unique journals in their original case (first encountered)
from pprint import pprint
pprint([journal_name_map[j] for j in unique_journals])

from collections import Counter

# Count how many cell lines are associated with each journal (for cell lines in exactly one journal), ignoring case
journal_counts = Counter()
for cell_line in exactly_one_journal:
    journals = cell_line_journals[cell_line]
    if journals:  # should always be length 1
        journal_lower = journals[0].lower()
        journal_counts[journal_lower] += 1


141
8
['Stem cell research & therapy',
 'Cells',
 'Stem cell research',
 'Cloning and stem cells',
 'Stem cell reports',
 'Stem cells and development',
 'In vitro cellular & developmental biology. Animal',
 'Molecular genetics & genomic medicine']


In [45]:

# Print the counts
for journal, count in journal_counts.items():
    print(f"{journal}: {count}")


stem cell research: 113
stem cells and development: 8
cloning and stem cells: 11
molecular genetics & genomic medicine: 1
stem cell reports: 1
stem cell research & therapy: 3
cells: 2
in vitro cellular & developmental biology. animal: 2


113 cell lines from SCR
28 cell lines from non-SCR


Why did you only take cell lines which were associated with exactly one journal?
Because I could not tell which journal article the data in the registry was associated with. So i needed to be sure that the data was from a specific publication, from a specific journal. So that I could run the automated curation on this publication and be sure that I had the right ground truth to compare my results to.

In [47]:
pprint(exactly_one_journal)

['MNZTASi002-A',
 'LEIi008-A',
 'LEIi012-B',
 'UOWi007-A',
 'AIBNi005-A',
 'AIBNi015-A',
 'HPIi004-B',
 'POWHe001-A',
 'MCRIi030-A-2',
 'MCRIi019-A-5',
 'GENEAe012-A',
 'LEIi018-A',
 'MCRIi027-B',
 'MCRIi017-A',
 'MCRIi009-A',
 'GENEAe008-A',
 'MNZTASi005-A',
 'MONUi001-C',
 'MCRIi025-B',
 'ESIBIe003-A-10',
 'UQi001-A-1',
 'MONUi003-A',
 'AIBNi010-A',
 'MONUi001-A',
 'AIBNi006-A',
 'MCRIi020-A',
 'LEIi005-A',
 'MCRIi012-A',
 'GENEAe010-A',
 'AIBNi007-A',
 'AIBNi012-A',
 'LEIi012-A',
 'HPIi002-B',
 'AIBNi011-A',
 'MCRIi030-A',
 'CIAUi002-C',
 'GENEAe005-A',
 'SCSe001-A-2',
 'MCRIi015-A',
 'GENEAe009-A',
 'VCCRIi003-A',
 'AIBNi009-A',
 'GENEAe002-A',
 'HPIi009-A',
 'WAe009-A-24',
 'MNZTASi022-A',
 'MCRIi011-A',
 'POWHe001-A-4',
 'AIBNi004-A',
 'MCRIi003-A',
 'AIBNi008-A',
 'MCRIi013-A',
 'LEIi011-C',
 'MCRIi008-A',
 'MCRIi023-A',
 'MNZTASi021-A',
 'CIAUi003-A',
 'MCRIi019-A-6',
 'AIBNi003-A',
 'LEIi017-A',
 'HPIi004-A',
 'GENEAe004-A',
 'AIBNi017-A',
 'GENEAe003-A',
 'UOCi002-A',
 'MONUi

In [None]:
ground_truth_pmids = [] 
for cell_line in exactly_one_journal:
    pmid = gt_data[cell_line]["publications"][0]["pmid"]
    if pmid != "Missing":
        ground_truth_pmids.append(pmid)

pprint(ground_truth_pmids)
print(len(ground_truth_pmids))


# These are the pmids I need to run the automated curation on.

['35679759',
 '30611018',
 '32810830',
 '32006803',
 '34649201',
 '36166872',
 '35728440',
 '18271699',
 '38458031',
 '36682125',
 '18386991',
 '34214897',
 '36805468',
 '31415975',
 '37150143',
 '18386991',
 '35679759',
 '37494850',
 '36805468',
 '34619644',
 '37315423',
 '37494850',
 '35074713',
 '37494850',
 '34649201',
 '33091851',
 '32931148',
 '31415975',
 '18386991',
 '35074713',
 '35074713',
 '32810830',
 '34388489',
 '35074713',
 '38458031',
 '37385135',
 '18386991',
 '29499499',
 '31415975',
 '18386991',
 '37939621',
 '35074713',
 '18386991',
 '38582058',
 '32442534',
 '38433209',
 '31415975',
 '18271699',
 '34649201',
 '37150143',
 '35074713',
 '31415975',
 '31494449',
 '37150143',
 '33091851',
 '38433209',
 '31039485',
 '36682125',
 '34649201',
 '34198153',
 '35728440',
 '18386991',
 '36166872',
 '18386991',
 '35917601',
 '37494850',
 '18271699',
 '37939621',
 '34157503',
 '34388489',
 '33740643',
 '35728439',
 '18386991',
 '31494449',
 '37494850',
 '30622032',
 '30605840',

Alright. I got the files I need to curate. Now I need to curate them.

# Test 1. Curate a single file.


# Test 2. Curate every file.




Things to watch out for.
- When batch curating. I need a report of the curation session.
- Professional logging. To a loglife.
- Let's do it.


What needs to be written in the report about this. 
- Report on the curation instructions and how they were used.

Also need to write the scoring function next. Tomorrow. After everything is done.
Also need to write the nature news essay. But I don't particularly care about the marks.