Takes the references transformed to JSON using AnyStyle and converts them to the GEMS SDRef bibliographic format. Checks the references Excel file and uses the key there to update the key in the JSON record.

In [3]:
import json
import pandas as pd

def load_reference_keys(file_path):
    """Read the reference-key lookup table from an Excel workbook.

    The sheet named "Sheet1" is read, reduced to the "Ref_full" and
    "Ref_GEMS" columns, and every row with a missing value in either of
    those two columns is dropped.

    Args:
        file_path: Location of the Excel file, passed to ``pd.read_excel``.

    Returns:
        A DataFrame holding only the "Ref_full" and "Ref_GEMS" columns.
        The original row labels are kept (the index is not reset).
    """
    wanted_columns = ["Ref_full", "Ref_GEMS"]
    sheet = pd.read_excel(file_path, sheet_name="Sheet1")
    key_table = sheet[wanted_columns]
    # Incomplete rows cannot be used to map a reference to a key.
    return key_table.dropna()

from fuzzywuzzy import process  # Alternatively, use: from rapidfuzz import process

def find_partial_match(title, reference_df, threshold=80):
    """Look up the Ref_GEMS key whose Ref_full text best matches *title*.

    Args:
        title: Text to search for.
        reference_df: DataFrame with "Ref_full" and "Ref_GEMS" columns.
        threshold: Value forwarded to ``process.extractOne`` as
            ``score_cutoff``.

    Returns:
        The "Ref_GEMS" value of the best-matching row, or an empty string
        (after printing "NOT FOUND") when ``extractOne`` yields no result.
    """
    best = process.extractOne(
        title,
        reference_df["Ref_full"],
        score_cutoff=threshold,
    )

    # Guard clause: nothing matched at or above the cutoff.
    if not best:
        print("NOT FOUND")
        return ""

    # The result is unpacked as (matched text, score, row label); only the
    # row label is needed to fetch the key from the same DataFrame.
    _, _, row_label = best
    return reference_df.loc[row_label, "Ref_GEMS"]


def convert_bibliographic_references(bib_data, reference_df):
    """Build the output records ("dod" fields plus "key" parts) for each entry.

    For every entry the title is matched against *reference_df* with
    ``find_partial_match``; the resulting Ref_GEMS string is split on ":"
    into up to three parts (author, year, type) that populate the "key"
    list. Missing parts become empty strings.

    Args:
        bib_data: Iterable of dict-like bibliographic entries. The keys read
            here are "author" (a list of dicts with "given"/"family"),
            "title", "genre", "container-title", "publisher", "volume" and
            "page".
        reference_df: Lookup table handed to ``find_partial_match``.

    Returns:
        A list with one dict per entry, each holding a "dod" list of field
        records and a "key" list of three key components.

    Side effects:
        Prints the author part of the key for every entry.
    """

    def sd_field(dm, dot, dyn, ident, label, val):
        # All field records share this key order and a constant "dN" of 1.
        return {"dM": dm, "dN": 1, "dot": dot, "dyn": dyn, "id": ident, "label": label, "val": val}

    records = []

    for item in bib_data:
        # "Given Family" for every listed author; empty list if none given.
        names = (
            [f"{person.get('given', '')} {person.get('family', '')}" for person in item["author"]]
            if "author" in item
            else []
        )

        # Resolve the GEMS key from the title.
        title = item.get("title", "")
        gems_key = find_partial_match(title, reference_df)

        # Pad with blanks so that exactly three components are always available.
        padded = (gems_key.split(":") if gems_key else []) + ["", "", ""]
        author_key, year_key, type_key = padded[:3]

        # Text values derived from the entry, computed in field order.
        author_text = " and ".join(names)
        source_text = item.get("genre", "") + " " + item.get("container-title", item.get("publisher", ""))
        volume_text = f"{year_key}, {item.get('volume', '')}"
        page_text = item.get("page", "")

        records.append(
            {
                "dod": [
                    sd_field(72, 0, False, 12, "SDauth", author_text),
                    sd_field(154, 0, True, 13, "SDtitl", title),
                    sd_field(72, 0, False, 14, "SDedit", source_text),
                    sd_field(72, 0, False, 15, "SDvoly", volume_text),
                    sd_field(72, 0, False, 16, "SDpage", page_text),
                    sd_field(4, 0, True, 17, "SDabst", "---"),
                    sd_field(1, 32, True, 18, "SDrefs", [" "]),
                    sd_field(72, 0, False, 19, "SDnote", "---"),
                    sd_field(0, 32, True, 20, "SD_kwd", None),
                ],
                "key": [
                    {"fl": 20, "fv": author_key, "fx": 0},  # author part of Ref_GEMS
                    {"fl": 5, "fv": year_key, "fx": 1},  # year part of Ref_GEMS
                    {"fl": 7, "fv": type_key, "fx": 2},  # type part of Ref_GEMS
                ],
            }
        )
        print(author_key)

    return records

def save_json(data, filename="formatted_references.json"):
    """Write *data* to *filename* as JSON indented by four spaces.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced.

    Args:
        data: JSON-serialisable object to store.
        filename: Destination path; defaults to "formatted_references.json".
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, fp=out_file, indent=4)

In [4]:
# Load the Excel table that maps full reference text to GEMS keys.
reference_file = "references.xlsx"
reference_keys = load_reference_keys(reference_file)

# Read the bibliographic JSON data. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open.
with open("references.json", "r", encoding="utf-8") as bib_file:
    bibliographic_data = json.load(bib_file)

# Convert every entry to the target record layout.
converted_data = convert_bibliographic_references(bibliographic_data, reference_keys)

# Print the output in JSON format
#print(json.dumps(converted_data, indent=4))
# Save formatted data to a JSON file (save_json's default output path).
save_json(converted_data)

print("Formatted data has been saved to 'formatted_references.json'.")

Cox_ea
Brown_ea
Hummel_ea
Hummel_ea
Hummel_ea
Lemire_ea
Lemire_ea
Rand_ea
Miron
Tanger_ea
Shock_ea
Shock_ea
Robie_ea
Shen_ea
Blanc_ea
Ma_ea
Ma_ea_b
Ma_ea
Kulik
Lothenbach_ea
Myers_ea
Nied_ea
Kulik_ea
Miron_ea
Miron_ea_b
Li_ea
Miron_b
Chase
Wolery
Miron
Gurvich_ea
Gurvich_ea
Olin_ea
Formatted data has been saved to 'formatted_references.json'.
