## Fuzzy Matching Publisher Data

In [1]:
import pandas as pd
import json
from rapidfuzz import process, fuzz

# Load Top Publishers csv file with Lat/Lon + address Info 
publishers_df = pd.read_csv("publishers_address.csv")

# Lowercase publisher names for consistent matching
publishers_df['publisher'] = publishers_df['publisher'].str.lower()

top_publishers = publishers_df['publisher'].unique().tolist()

# Prepare List of Yearly CSV Files, loop to create list of all years csv files
csv_files = [f"df_{year}.csv" for year in range(1912, 1923)]
years = list(range(1912, 1923))

output_data = []

# Manual OCR spelling Fixes - helpful??
ocr_corrections = {
    "macmillax": "macmillan",
    "macmillay": "macmillan",
    "longmavs": "longmans",
    "loxgmans": "longmans",
    "longyans": "longmans",
    "constanle": "constable",
    "constarle": "constable",
    "constartle": "constable",
    "jlongmans": "longmans",
}

# Loop Through Yearly CSVs 
for file, year in zip(csv_files, years):
    print(f"\n📄 Processing {file} for year {year}")
    
    df = pd.read_csv(file)

    # Normalize publisher column
    if 'publisher' not in df.columns:
        print(f"⚠️ 'publisher' column not found in {file}, skipping.")
        continue

    pub_counts = {pub: 0 for pub in top_publishers}

    # drops na values
    for raw_pub in df['publisher'].dropna():
        raw_pub = raw_pub.lower().strip()
        raw_pub = ocr_corrections.get(raw_pub, raw_pub)

        # fuzzy match
        match, score, _ = process.extractOne(raw_pub, top_publishers, scorer=fuzz.ratio)

        if score >= 80:  # can change this
            pub_counts[match] += 1
        else:
            print(f"❌ No strong match: '{raw_pub}' → '{match}' (score: {score})")

    for pub, count in pub_counts.items():
        if count > 0:
            row = publishers_df[
                (publishers_df['publisher'] == pub) & (publishers_df['year'] == year)
            ]
            if row.empty:
                print(f"⚠️ No location info for '{pub}' in year {year}")
                continue

            lat = row.iloc[0]['latitude']
            lon = row.iloc[0]['longitude']

            # Skip rows with missing or invalid coordinates
            if pd.isna(lat) or pd.isna(lon):
                print(f"⚠️ Skipping '{pub}' in {year} due to missing lat/lon")
                continue

            addr = row.iloc[0].get('ecb_address', "")

            output_data.append({
                "publisher": pub.title(),  # title case for display
                "year": year,
                "books_published": count,
                "latitude": lat,
                "longitude": lon,
                "ecb_address": addr
            })

            print(f"✅ Matched {count} books to '{pub.title()}' in {year}")

# Write to JSON File 
with open("publishers.json", "w") as f:
    json.dump(output_data, f, indent=2)

print(f"\n✅ Done! {len(output_data)} entries written to publishers.json")



📄 Processing df_1912.csv for year 1912
❌ No strong match: 'frowde' → 'milford' (score: 46.15384615384615)
❌ No strong match: 'luzac' → 'longmans' (score: 30.76923076923077)
❌ No strong match: 'bailliere' → 'macmillan' (score: 44.44444444444444)
❌ No strong match: 'yorkshire printing co.' → 'simpkin 1' (score: 38.70967741935484)
❌ No strong match: 'hodder & s.' → 'constable ' (score: 28.57142857142857)
❌ No strong match: 'luzac' → 'longmans' (score: 30.76923076923077)
❌ No strong match: 'nister' → 'constable ' (score: 50.0)
❌ No strong match: 'g. p. abraham' → 'longmans' (score: 19.047619047619047)
❌ No strong match: 'mills & b.' → 'macmillan' (score: 42.10526315789473)
❌ No strong match: 'mills & b.' → 'macmillan' (score: 42.10526315789473)
❌ No strong match: 'routledge' → 'constable ' (score: 42.10526315789473)
❌ No strong match: 'sweet & m.' → 'heinemann' (score: 31.57894736842105)
❌ No strong match: 'e. arnold' → 'heinemann' (score: 33.333333333333336)
❌ No strong match: 'churchill