# Phase 0A: Setup - Install Libraries

In [1]:
%pip install folium geopandas

Note: you may need to restart the kernel to use updated packages.


# Phase 0B: Setup - Import Libraries

In [2]:
import pandas as pd
import nltk
import re
import requests
import json
import io
import folium
import geopandas as gpd
import os
import time

# Phase 1A: Load Dataset

In [3]:
# Display 2 sample DataFrame rows
noscemus_metadata = pd.read_csv("https://raw.githubusercontent.com/CCS-ZCU/noscemus_ETF/refs/heads/master/data/metadata_table_long.csv")
noscemus_metadata.head(2)

Unnamed: 0,Author,Full title,In,Year,Place,Publisher/Printer,Era,Form/Genre,Discipline/Content,Original,...,Of interest to,Transkribus text available,Written by,Library and Signature,ids,id,date_min,date_max,filename,file_year
0,"Achrelius, Daniel",Scientiarum magnes recitatus publice anno 1690...,,1690,[Turku],Wall,17th century,Oration,"Mathematics, Astronomy/Astrology/Cosmography, ...",Scientiarum magnes(Google Books),...,"MK, JL",Yes,IT,,[705665],705665,1690.0,1690.0,"Achrelius,_Daniel_-_Scientiarum_magnes__Turku_...",1690.0
1,"Acidalius, Valens","Ad Iordanum Brunum Nolanum, Italum","Poematum Iani Lernutii, Iani Gulielmi, Valenti...",1603,"Liegnitz, Wrocław","Albert, David",17th century,Panegyric poem,Astronomy/Astrology/Cosmography,Ad Iordanum Brunum (1603)(CAMENA)Ad Iordanum B...,...,"MK, IT",Yes,MK,,[801745],801745,1603.0,1603.0,Janus_Lernutius_et_al__-_Poemata__Liegnitz_160...,1603.0


# Phase 1B: Inspect DataFrame Structure

In [4]:
# Display DataFrame Columns

print("\nColumns in noscemus_metadata:")
print(noscemus_metadata.columns.tolist())


Columns in noscemus_metadata:
['Author', 'Full title', 'In', 'Year', 'Place', 'Publisher/Printer', 'Era', 'Form/Genre', 'Discipline/Content', 'Original', 'Digital sourcebook', 'Description', 'References', 'Cited in', 'How to cite this entry', 'Internal notes', 'Of interest to', 'Transkribus text available', 'Written by', 'Library and Signature', 'ids', 'id', 'date_min', 'date_max', 'filename', 'file_year']


# Phase 1C: Examine 'Place' Column

In [5]:
# Inspect Potential Columns
# Replace 'candidate_column_name' with a column name from the list above
candidate_column_name = 'Place' # <-- CHANGE THIS VALUE 

if candidate_column_name in noscemus_metadata.columns:
    print(f"\nUnique values in '{candidate_column_name}':")
    # Display a sample of unique values and their counts
    print(noscemus_metadata[candidate_column_name].value_counts().head(30))
    print(f"\nNumber of unique values in '{candidate_column_name}': {noscemus_metadata[candidate_column_name].nunique()}")
    print(f"Number of missing values in '{candidate_column_name}': {noscemus_metadata[candidate_column_name].isnull().sum()}")
    # Show some raw examples of the data in this column
    print("\nSample raw entries (up to first 20 non-null):")
    print(noscemus_metadata[candidate_column_name].dropna().head(20).tolist())
else:
    print(f"Column '{candidate_column_name}' not found in DataFrame. Please choose from the list printed above.")


Unique values in 'Place':
Place
Paris                          69
Amsterdam                      49
Basel                          48
Venice                         48
London                         40
Leipzig                        36
Rome                           34
Zurich                         33
Leiden                         29
Frankfurt am Main              26
Göttingen                      25
Tübingen                       25
Nuremberg                      21
Bologna                        21
Strasbourg                     20
Lyon                           19
Wittenberg                     17
Innsbruck                      16
Cologne                        13
Padua                          13
Naples                         12
Florence                       12
Leiden, Stockholm, Erlangen    10
Halle                          10
Antwerp                        10
Oxford                          8
Copenhagen                      8
Vienna                          8
Bern           

# Phase 2A: Define Place Splitting Logic and Expand Rows

In [6]:
def split_places(place_string):
    if pd.isna(place_string) or not isinstance(place_string, str):
        return [] # Return empty list for NaN or non-string input
    # Split by comma, semicolon, or ampersand. Also handle cases like 'Place1 / Place2'.
    # Regex looks for one or more delimiters, surrounded by optional whitespace.
    places = re.split(r'\s*[,;&/]\s*', place_string)
    # Clean up each individual place name: strip whitespace, remove empty strings
    return [p.strip() for p in places if p and p.strip()] 

expanded_rows = []
if 'noscemus_metadata' in locals():
    print(f"Original number of rows in noscemus_metadata: {len(noscemus_metadata)}")
    for index, row in noscemus_metadata.iterrows():
        original_place_entry = row['Place']
        individual_places = split_places(original_place_entry)
        
        if not individual_places: # Handles NaN, empty strings, or strings that become empty after split
            # Keep the row as is, but ensure 'Place' is None or a consistent empty marker if it was NaN/empty
            new_row = row.copy()
            new_row['Place'] = None # Or np.nan, or an empty string, depending on desired handling for mapping
            expanded_rows.append(new_row)
        elif len(individual_places) == 1:
            # Single place, just copy the row with the cleaned single place name
            new_row = row.copy()
            new_row['Place'] = individual_places[0]
            expanded_rows.append(new_row)
        else:
            # Multiple places, create a new row for each
            for place_name in individual_places:
                new_row = row.copy()
                new_row['Place'] = place_name
                # Add original multi-place string for reference if needed
                new_row['Original_Multi_Place_Entry'] = original_place_entry 
                expanded_rows.append(new_row)
    
    expanded_noscemus_metadata = pd.DataFrame(expanded_rows)
    print(f"Number of rows after expansion: {len(expanded_noscemus_metadata)}")

    # Display a sample, especially focusing on some known multi-place entries to verify
    print("\nSample of expanded_noscemus_metadata (showing some original multi-place entries):")
    # Example: Find rows originating from 'Liegnitz, Wrocław' if it exists
    if 'Original_Multi_Place_Entry' in expanded_noscemus_metadata.columns:
        sample_multi = expanded_noscemus_metadata[expanded_noscemus_metadata['Original_Multi_Place_Entry'] == 'Liegnitz, Wrocław']
        if not sample_multi.empty:
            print(sample_multi[['id', 'Full title', 'Place', 'Original_Multi_Place_Entry']].head())
        else:
            print("Could not find 'Liegnitz, Wrocław' in Original_Multi_Place_Entry for sample.")
        # Show general head as well
        print("\nGeneral head of expanded data:")
        print(expanded_noscemus_metadata[['id', 'Full title', 'Place', 'Original_Multi_Place_Entry' if 'Original_Multi_Place_Entry' in expanded_noscemus_metadata.columns else 'Place']].head())
    else:
        print("\nGeneral head of expanded data (Original_Multi_Place_Entry column not created, likely no multi-place entries found):")
        print(expanded_noscemus_metadata[['id', 'Full title', 'Place']].head())
else:
    print("Error: noscemus_metadata DataFrame not found. Please load it first.")
    expanded_noscemus_metadata = pd.DataFrame() # Initialize empty to avoid errors later

Original number of rows in noscemus_metadata: 975
Number of rows after expansion: 1030

Sample of expanded_noscemus_metadata (showing some original multi-place entries):
       id                          Full title     Place  \
1  801745  Ad Iordanum Brunum Nolanum, Italum  Liegnitz   
1  801745  Ad Iordanum Brunum Nolanum, Italum   Wrocław   

  Original_Multi_Place_Entry  
1          Liegnitz, Wrocław  
1          Liegnitz, Wrocław  

General head of expanded data:
       id                                         Full title       Place  \
0  705665  Scientiarum magnes recitatus publice anno 1690...     [Turku]   
1  801745                 Ad Iordanum Brunum Nolanum, Italum    Liegnitz   
1  801745                 Ad Iordanum Brunum Nolanum, Italum     Wrocław   
2  713323  De natura novi orbis libri duo et De promulgat...   Salamanca   
3  693148  Vitae Germanorum medicorum, qui saeculo superi...  Heidelberg   

  Original_Multi_Place_Entry  
0                        NaN  
1       

# Phase 2B: Extract Unique Place Names for Geocoding

In [7]:
# Ensure 'expanded_noscemus_metadata' is available and populated from the previous cell.
if 'expanded_noscemus_metadata' in locals() and not expanded_noscemus_metadata.empty:
    actual_publication_place_column = 'Place' # This is the column with individual place names
    places_series = expanded_noscemus_metadata[actual_publication_place_column].astype(str).str.strip()
    unique_raw_places = places_series.dropna().unique() # Important to dropna here
    print(f"Found {len(unique_raw_places)} unique raw place mentions from '{actual_publication_place_column}' in the expanded data.")
    print("Sample of raw places (first 50 from expanded data):")
    print(unique_raw_places[:50])
else:
    print("Error: expanded_noscemus_metadata is not available or empty. Please ensure the 'Expand Multi-Location Rows' cell ran successfully.")
    # Initialize empty to prevent errors in subsequent cells, or handle appropriately
    places_series = pd.Series(dtype=str) 
    unique_raw_places = []

Found 166 unique raw place mentions from 'Place' in the expanded data.
Sample of raw places (first 50 from expanded data):
['[Turku]' 'Liegnitz' 'Wrocław' 'Salamanca' 'Heidelberg' 'London' 'Oxford'
 'Lund' 'Strasbourg' 'Basel' 'Bologna' 'Leipzig' 'Zurich' 'Venice' 'Rome'
 'Herborn' 'Frankfurt am Main' 'Turin' 'Florence' 'Alcalá de Henares'
 'Leiden' 'Innsbruck' 'Westminster Abbey' 'Paris' 'Cambridge' '[Landshut]'
 '[Ingolstadt]' 'Milan' 'Bergamo' 'Stuttgart' 'Perugia' 'Lyon' 's.l.'
 'Amsterdam' '[Wittenberg]' 'Copenhagen' 'Padua' '[Padua]' 'Rimini'
 'Büdingen' 'Königsberg' 'Uppsala' 'Stockholm' 'Turku' 'Desau' 'Würzburg'
 'Saint Petersburg' 'Antwerp' 'Graz' 'Aachen']


# Phase 3A: Geocode Unique Place Names

In [8]:
GEONAMES_USERNAME = "utaysi"  # Your Geonames username
raw_geocoded_cache_file = 'raw_geocoded_places_cache.csv'

def get_coordinates(place_name, username):
    if not place_name or pd.isna(place_name):
        return None, None, None, None
    # Ensure place_name is a string for requests.utils.quote
    place_name_str = str(place_name)
    try:
        # Initial attempt: prioritize populated places (featureClass=P)
        url = f"http://api.geonames.org/searchJSON?q={requests.utils.quote(place_name_str)}&maxRows=1&featureClass=P&username={username}"
        response = requests.get(url, timeout=15)
        response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
        data = response.json()
        if data.get('geonames') and len(data['geonames']) > 0:
            top_result = data['geonames'][0]
            return float(top_result['lat']), float(top_result['lng']), top_result.get('name'), top_result.get('countryName')
        else:
            # Fallback: search without featureClass if no populated place found or if initial result is empty
            # This helps with broader terms or historical names that might not be classed as 'P'
            url_fallback = f"http://api.geonames.org/searchJSON?q={requests.utils.quote(place_name_str)}&maxRows=1&username={username}"
            # print(f"Retrying without featureClass for: {place_name_str}") # Optional: for debugging
            response_fallback = requests.get(url_fallback, timeout=15)
            response_fallback.raise_for_status()
            data_fallback = response_fallback.json()
            if data_fallback.get('geonames') and len(data_fallback['geonames']) > 0:
                top_result_fallback = data_fallback['geonames'][0]
                # print(f"Fallback success for {place_name_str}: Found {top_result_fallback.get('name')}") # Optional
                return float(top_result_fallback['lat']), float(top_result_fallback['lng']), top_result_fallback.get('name'), top_result_fallback.get('countryName')
            # print(f"Place not found by Geonames (even after fallback): {place_name_str}") # Optional
            return None, None, None, None
    except requests.exceptions.Timeout:
        print(f"API request timed out for {place_name_str}")
        return None, None, None, None
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred for {place_name_str}: {http_err} - Response: {response.text[:200]}...")
        return None, None, None, None
    except requests.exceptions.RequestException as req_err:
        print(f"API request failed for {place_name_str}: {req_err}")
        return None, None, None, None
    except ValueError as json_err: # Handles JSON decoding errors
        print(f"JSON decoding failed for {place_name_str} (response: {response.text[:200]}...): {json_err}")
        return None, None, None, None

# Check for cached data first
if os.path.exists(raw_geocoded_cache_file):
    print(f"Loading raw geocoded data from cache: {raw_geocoded_cache_file}")
    raw_geocoded_df = pd.read_csv(raw_geocoded_cache_file)
    # Ensure all expected columns are present, fill with NA if not
    expected_cols = ['raw_place', 'geoname_name', 'latitude', 'longitude', 'country']
    for col in expected_cols:
        if col not in raw_geocoded_df.columns:
            raw_geocoded_df[col] = pd.NA
else:
    print(f"No cache file found ({raw_geocoded_cache_file}). Geocoding raw places...")
    raw_geocoded_data = []
    if 'places_series' in locals():
        unique_raw_places = places_series.dropna().unique() # Use dropna() before unique()
        print(f"Geocoding {len(unique_raw_places)} unique raw place names...")
        for i, place in enumerate(unique_raw_places):
            if str(place).strip() == "nan" or str(place).strip() == "": # Skip if place is 'nan' string or empty after strip
                # print(f"Skipping invalid place entry: '{place}'") # Optional
                lat, lon, geoname_name, country = None, None, None, None
            else:
                if (i+1) % 20 == 0:
                    print(f"Processed {i+1}/{len(unique_raw_places)} places...")
                lat, lon, geoname_name, country = get_coordinates(place, GEONAMES_USERNAME)
            
            raw_geocoded_data.append({'raw_place': place, 
                                      'geoname_name': geoname_name, 
                                      'latitude': lat, 
                                      'longitude': lon, 
                                      'country': country})
            time.sleep(0.1) # 100ms delay to be respectful to the API

        raw_geocoded_df = pd.DataFrame(raw_geocoded_data)
        raw_geocoded_df.to_csv(raw_geocoded_cache_file, index=False)
        print(f"Saved raw geocoded data to cache: {raw_geocoded_cache_file}")
    else:
        print("Error: 'places_series' not defined. Please ensure the previous cells (especially 'cline_extract_place_column') have been run.")
        raw_geocoded_df = pd.DataFrame(columns=['raw_place', 'geoname_name', 'latitude', 'longitude', 'country']) # Create empty df

if not raw_geocoded_df.empty:
    print(f"\nSuccessfully geocoded {raw_geocoded_df['latitude'].notna().sum()} places out of {len(raw_geocoded_df)} unique raw names processed.")
    print("\nSample of geocoded data (first 20 rows):")
    print(raw_geocoded_df.head(20))
    
    print("\nPlaces that were NOT found by Geonames (sample):")
    not_found_sample = raw_geocoded_df[raw_geocoded_df['latitude'].isna()]['raw_place'].unique()
    print(not_found_sample[:20]) # Show up to 20 unique not found raw places
    print(f"Total unique raw places not found: {len(not_found_sample)}")
else:
    print("\nraw_geocoded_df is empty. Check for errors in previous steps or API calls.")

Loading raw geocoded data from cache: raw_geocoded_places_cache.csv

Successfully geocoded 162 places out of 166 unique raw names processed.

Sample of geocoded data (first 20 rows):
            raw_place       geoname_name  latitude  longitude         country
0             [Turku]              Turku  60.45148   22.26869         Finland
1            Liegnitz            Legnica  51.21006   16.16190          Poland
2             Wrocław            Wroclaw  51.10000   17.03333          Poland
3           Salamanca          Salamanca  40.96882   -5.66388           Spain
4          Heidelberg         Heidelberg  49.40768    8.69079         Germany
5              London             London  51.50853   -0.12574  United Kingdom
6              Oxford             Oxford  39.50700  -84.74523   United States
7                Lund               Lund  55.70584   13.19321          Sweden
8          Strasbourg         Strasbourg  48.58392    7.74553          France
9               Basel              Ba

# Phase 3B: Add Primary Coordinates to Original Metadata

In [9]:
if 'noscemus_metadata' in locals() and 'raw_geocoded_df' in locals() and 'split_places' in globals():
    print("Adding primary coordinates to the original noscemus_metadata DataFrame...")

    # Create a dictionary from raw_geocoded_df for faster lookups
    # We only need raw_place, latitude, longitude, geoname_name, country
    geocoded_lookup = raw_geocoded_df.set_index('raw_place')[['latitude', 'longitude', 'geoname_name', 'country']].copy()

    # Initialize new columns in noscemus_metadata
    noscemus_metadata['primary_latitude'] = pd.NA
    noscemus_metadata['primary_longitude'] = pd.NA
    noscemus_metadata['primary_geoname_name'] = pd.NA
    noscemus_metadata['primary_country'] = pd.NA
    
    rows_updated_count = 0

    for index, row in noscemus_metadata.iterrows():
        original_place_entry = row['Place']
        # Use the existing split_places function (defined in Phase 2A)
        individual_places = split_places(original_place_entry) 

        primary_place_name = None
        if individual_places: # If the list of places is not empty
            primary_place_name = individual_places[0] # Take the first place

        if primary_place_name and primary_place_name in geocoded_lookup.index:
            # Ensure that we handle cases where a place might be in the index but has NA coordinates
            coords_series = geocoded_lookup.loc[primary_place_name]
            if pd.notna(coords_series['latitude']) and pd.notna(coords_series['longitude']):
                 noscemus_metadata.loc[index, 'primary_latitude'] = coords_series['latitude']
                 noscemus_metadata.loc[index, 'primary_longitude'] = coords_series['longitude']
                 noscemus_metadata.loc[index, 'primary_geoname_name'] = coords_series['geoname_name']
                 noscemus_metadata.loc[index, 'primary_country'] = coords_series['country']
                 rows_updated_count += 1
        # If primary_place_name is None, not in lookup, or has NA coordinates, the columns remain NA (as initialized)

    print(f"Finished adding primary coordinates. {rows_updated_count} rows in noscemus_metadata were updated with primary coordinates.")
    print("\nNew columns added: 'primary_latitude', 'primary_longitude', 'primary_geoname_name', 'primary_country'")
    
    print("\nSample of noscemus_metadata with new primary coordinate columns (first 5 rows with data):")
    # Show rows where primary_latitude is not NA, and include relevant columns
    sample_df = noscemus_metadata[noscemus_metadata['primary_latitude'].notna()]
    if not sample_df.empty:
        print(sample_df[['id', 'Full title', 'Place', 'primary_latitude', 'primary_longitude', 'primary_geoname_name', 'primary_country']].head())
    else:
        print("No rows found with primary coordinates to display in sample.")
        
    print("\nSample of noscemus_metadata where primary coordinates might be missing (first 5 rows):")
    sample_missing_df = noscemus_metadata[noscemus_metadata['primary_latitude'].isna()]
    if not sample_missing_df.empty:
        print(sample_missing_df[['id', 'Full title', 'Place']].head())
    else:
        print("All rows appear to have primary coordinates (or the DataFrame is empty).")

else:
    print("Error: 'noscemus_metadata' or 'raw_geocoded_df' not found, or 'split_places' function not defined. Please ensure previous cells ran successfully.")

Adding primary coordinates to the original noscemus_metadata DataFrame...
Finished adding primary coordinates. 967 rows in noscemus_metadata were updated with primary coordinates.

New columns added: 'primary_latitude', 'primary_longitude', 'primary_geoname_name', 'primary_country'

Sample of noscemus_metadata with new primary coordinate columns (first 5 rows with data):
       id                                         Full title  \
0  705665  Scientiarum magnes recitatus publice anno 1690...   
1  801745                 Ad Iordanum Brunum Nolanum, Italum   
2  713323  De natura novi orbis libri duo et De promulgat...   
3  693148  Vitae Germanorum medicorum, qui saeculo superi...   
4  769230  Ad insignissimum virum dominum Thomam Burnettu...   

               Place primary_latitude primary_longitude primary_geoname_name  \
0            [Turku]         60.45148          22.26869                Turku   
1  Liegnitz, Wrocław         51.21006           16.1619              Legnica   
2

# Phase 4A: Merge Geocoded Data with Expanded Metadata

In [10]:
# Ensure both DataFrames are loaded and available
if 'expanded_noscemus_metadata' in locals() and 'raw_geocoded_df' in locals():
    if not expanded_noscemus_metadata.empty and not raw_geocoded_df.empty:
        # Merge the geocoded data (lat, lon) back into the expanded metadata
        # The 'Place' column in expanded_noscemus_metadata should match 'raw_place' in raw_geocoded_df
        metadata_with_coords = pd.merge(
            expanded_noscemus_metadata,
            raw_geocoded_df[['raw_place', 'latitude', 'longitude', 'geoname_name', 'country']],
            left_on='Place',
            right_on='raw_place',
            how='left'
        )

        # Drop the redundant 'raw_place' column if it's different from 'Place' or just to clean up
        if 'raw_place' in metadata_with_coords.columns and 'Place' in metadata_with_coords.columns:
             metadata_with_coords.drop(columns=['raw_place'], inplace=True, errors='ignore')


        print(f"Successfully merged geocoded data. New DataFrame 'metadata_with_coords' has {len(metadata_with_coords)} rows.")
        print(f"Number of entries with valid coordinates: {metadata_with_coords['latitude'].notna().sum()}")
        
        print("\nSample of merged data (first 5 rows with coordinates):")
        print(metadata_with_coords[metadata_with_coords['latitude'].notna()][['id', 'Full title', 'Place', 'file_year', 'latitude', 'longitude', 'geoname_name']].head())
        
        print("\nSample of entries that might be missing coordinates (if any):")
        print(metadata_with_coords[metadata_with_coords['latitude'].isna()][['id', 'Full title', 'Place']].head())
    else:
        print("Error: One or both DataFrames (expanded_noscemus_metadata, raw_geocoded_df) are empty.")
        metadata_with_coords = pd.DataFrame() # Initialize empty to avoid errors
else:
    print("Error: 'expanded_noscemus_metadata' or 'raw_geocoded_df' not found. Please ensure previous cells ran successfully.")
    metadata_with_coords = pd.DataFrame() # Initialize empty to avoid errors

Successfully merged geocoded data. New DataFrame 'metadata_with_coords' has 1030 rows.
Number of entries with valid coordinates: 1026

Sample of merged data (first 5 rows with coordinates):
       id                                         Full title       Place  \
0  705665  Scientiarum magnes recitatus publice anno 1690...     [Turku]   
1  801745                 Ad Iordanum Brunum Nolanum, Italum    Liegnitz   
2  801745                 Ad Iordanum Brunum Nolanum, Italum     Wrocław   
3  713323  De natura novi orbis libri duo et De promulgat...   Salamanca   
4  693148  Vitae Germanorum medicorum, qui saeculo superi...  Heidelberg   

   file_year  latitude  longitude geoname_name  
0     1690.0  60.45148   22.26869        Turku  
1     1603.0  51.21006   16.16190      Legnica  
2     1603.0  51.10000   17.03333      Wroclaw  
3     1589.0  40.96882   -5.66388    Salamanca  
4     1620.0  49.40768    8.69079   Heidelberg  

Sample of entries that might be missing coordinates (if an

# Phase 4B: Create Interactive Map with Folium (Part 1: Pinpoints)

In [11]:
if 'metadata_with_coords' in locals() and not metadata_with_coords.empty:
    # Filter out rows where latitude or longitude is missing
    map_data = metadata_with_coords.dropna(subset=['latitude', 'longitude'])
    
    if not map_data.empty:
        print(f"Plotting {len(map_data)} works on the map.")
        
        # Create a Folium map centered on Europe
        # Adjust location and zoom_start as needed
        europe_center = [50, 15] # Latitude, Longitude for a general Europe center
        interactive_map_part1 = folium.Map(location=europe_center, zoom_start=4)

        # Add markers for each work
        for idx, row in map_data.iterrows():
            # Prepare popup text
            # Ensure all components of popup are strings and handle potential NaNs
            title = str(row['Full title']) if pd.notna(row['Full title']) else "N/A"
            place = str(row['Place']) if pd.notna(row['Place']) else "N/A"
            year = str(int(row['file_year'])) if pd.notna(row['file_year']) else "N/A"
            
            popup_html = f"""
            <b>Title:</b> {title}<br>
            <b>Place:</b> {place}<br>
            <b>Year:</b> {year}<br>
            <b>Lat/Lon:</b> {row['latitude']:.2f}, {row['longitude']:.2f}
            """
            iframe = folium.IFrame(popup_html, width=300, height=100)
            popup = folium.Popup(iframe, max_width=300)
            
            folium.CircleMarker(
                location=[row['latitude'], row['longitude']],
                radius=5, # Small circle marker
                popup=popup,
                tooltip=f"{title[:50]}... ({place})", # Shorter tooltip
                color='blue',
                fill=True,
                fill_color='blue',
                fill_opacity=0.6
            ).add_to(interactive_map_part1)

        # Display the map
        # In a Jupyter Notebook, the map object itself will render when it's the last expression in a cell
        print("Map generation complete. The map should display below.")
        # interactive_map_part1 # This line will display the map
    else:
        print("No data with valid coordinates available to plot on the map.")
else:
    print("Error: 'metadata_with_coords' DataFrame not found or is empty. Please run the previous cell (Phase 4A).")

# To display the map, ensure this cell's last line is 'interactive_map_part1' (uncommented if necessary)
# For now, let's assign it to a variable and then explicitly call it to ensure it's the last thing.
if 'interactive_map_part1' in locals():
    from IPython.display import display # Make sure display is imported
    display(interactive_map_part1)
elif 'map_data' in locals() and map_data.empty:
    print("Map not generated as there was no data to plot.")
else:
    print("Map object 'interactive_map_part1' was not created, likely due to an error in the preceding logic.")

Plotting 1026 works on the map.
Map generation complete. The map should display below.


# Phase 4C: Prepare Data for Time Slider

In [12]:
if 'metadata_with_coords' in locals() and not metadata_with_coords.empty:
    print("Preparing data for time-slider map...")
    # Make a copy to avoid modifying the original DataFrame used by other maps
    map_data_for_time = metadata_with_coords.copy()

    # Clean 'file_year': convert to numeric, coercing errors to NaT/NaN
    map_data_for_time['file_year_numeric'] = pd.to_numeric(map_data_for_time['file_year'], errors='coerce')

    # Drop rows where 'file_year_numeric' is NaN (i.e., couldn't be converted or was originally NaN)
    # Also ensure latitude and longitude are present
    original_rows = len(map_data_for_time)
    map_data_cleaned_time = map_data_for_time.dropna(subset=['file_year_numeric', 'latitude', 'longitude'])
    dropped_rows = original_rows - len(map_data_cleaned_time)
    print(f"Dropped {dropped_rows} rows due to missing/invalid year or missing coordinates.")

    if not map_data_cleaned_time.empty:
        # Convert 'file_year_numeric' to integer
        map_data_cleaned_time.loc[:, 'file_year_numeric'] = map_data_cleaned_time['file_year_numeric'].astype(int)
        
        min_year = map_data_cleaned_time['file_year_numeric'].min()
        max_year = map_data_cleaned_time['file_year_numeric'].max()
        print(f"Data prepared for time slider: {len(map_data_cleaned_time)} entries.")
        print(f"Year range in data: {min_year} - {max_year}")
        
        # Display a sample of the cleaned data
        print("\nSample of cleaned data for time slider (first 5 rows):")
        print(map_data_cleaned_time[['id', 'Full title', 'Place', 'file_year_numeric', 'latitude', 'longitude']].head())
    else:
        print("No valid data remaining after cleaning for the time slider.")
        # Ensure map_data_cleaned_time exists as an empty DataFrame if all rows were dropped
        map_data_cleaned_time = pd.DataFrame(columns=map_data_for_time.columns.tolist() + ['file_year_numeric'])

else:
    print("Error: 'metadata_with_coords' DataFrame not found or is empty. Please run Phase 4A first.")
    # Initialize map_data_cleaned_time as an empty DataFrame to prevent errors in the next cell
    map_data_cleaned_time = pd.DataFrame()

Preparing data for time-slider map...
Dropped 7 rows due to missing/invalid year or missing coordinates.
Data prepared for time slider: 1023 entries.
Year range in data: 1472.0 - 1932.0

Sample of cleaned data for time slider (first 5 rows):
       id                                         Full title       Place  \
0  705665  Scientiarum magnes recitatus publice anno 1690...     [Turku]   
1  801745                 Ad Iordanum Brunum Nolanum, Italum    Liegnitz   
2  801745                 Ad Iordanum Brunum Nolanum, Italum     Wrocław   
3  713323  De natura novi orbis libri duo et De promulgat...   Salamanca   
4  693148  Vitae Germanorum medicorum, qui saeculo superi...  Heidelberg   

   file_year_numeric  latitude  longitude  
0             1690.0  60.45148   22.26869  
1             1603.0  51.21006   16.16190  
2             1603.0  51.10000   17.03333  
3             1589.0  40.96882   -5.66388  
4             1620.0  49.40768    8.69079  


# Phase 4D: Interactive Map with ipywidgets Year Range Input (Enter to Update)

In [13]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import folium
import pandas as pd

if 'map_data_cleaned_time' in locals() and not map_data_cleaned_time.empty:
    
    min_data_year = int(map_data_cleaned_time['file_year_numeric'].min())
    max_data_year = int(map_data_cleaned_time['file_year_numeric'].max())

    # These prints go to standard cell output, not a widget. This is fine.
    print(f"Full data year range available: {min_data_year} - {max_data_year}")
    print("Enter start and end years, then press Enter in either box to update the map.")

    map_output = widgets.Output()
    message_output = widgets.Output()

    start_year_input = widgets.IntText(
        value=min_data_year,
        description='Start Year:',
        disabled=False,
        style={'description_width': 'initial'}
    )
    end_year_input = widgets.IntText(
        value=min_data_year + 9, # Default to a 10-year span
        description='End Year:',
        disabled=False,
        style={'description_width': 'initial'}
    )

    def handle_submit(widget_instance): # widget_instance is not used but required by on_submit
        s_year = start_year_input.value
        e_year = end_year_input.value

        # --- Validation Phase ---
        with message_output:
            clear_output() # Clear previous validation messages
            valid_input = True
            if not (isinstance(s_year, int) and isinstance(e_year, int)):
                print("Error: Start and End years must be integers.")
                valid_input = False
            elif s_year > e_year:
                print("Error: Start Year cannot be greater than End Year.")
                valid_input = False
            
            if valid_input and (s_year < min_data_year or e_year > max_data_year):
                print(f"Warning: Specified range ({s_year}-{e_year}) is partially outside available data range ({min_data_year}-{max_data_year}). Results may be limited or empty.")
            
            if not valid_input:
                return # Stop processing, map_output remains untouched.

        # --- Map Update Phase (only if input is valid) ---
        with map_output:
            clear_output(wait=True) # Clear previous map and any messages in map_output
            
            # Informational messages about the current map generation, inside map_output
            print(f"Filtering for years: {s_year} - {e_year}")
            
            window_data = map_data_cleaned_time[
                (map_data_cleaned_time['file_year_numeric'] >= s_year) &
                (map_data_cleaned_time['file_year_numeric'] <= e_year)
            ]
            
            print(f"Found {len(window_data)} works in this year range.")

            europe_center = [50, 15]
            interactive_map_range = folium.Map(location=europe_center, zoom_start=4)

            if not window_data.empty:
                for idx, row in window_data.iterrows():
                    title = str(row['Full title']) if pd.notna(row['Full title']) else "N/A"
                    place = str(row['Place']) if pd.notna(row['Place']) else "N/A"
                    year = str(int(row['file_year_numeric']))

                    popup_html = f"""
                    <b>Title:</b> {title}<br>
                    <b>Place:</b> {place}<br>
                    <b>Year:</b> {year}<br>
                    <b>Lat/Lon:</b> {row['latitude']:.2f}, {row['longitude']:.2f}
                    """
                    popup = folium.Popup(popup_html, max_width=300)
                    
                    folium.CircleMarker(
                        location=[row['latitude'], row['longitude']],
                        radius=5,
                        popup=popup,
                        tooltip=f"{title[:50]}... ({place}, {year})",
                        color='darkblue', 
                        fill=True,
                        fill_color='darkblue',
                        fill_opacity=0.7
                    ).add_to(interactive_map_range)
                
                display(interactive_map_range)
            else:
                print("No works found for the selected year range.")
                display(interactive_map_range) # Show empty map
    
    start_year_input.observe(handle_submit, names='value')
    end_year_input.observe(handle_submit, names='value')

    input_widgets = widgets.HBox([start_year_input, end_year_input])
    
    display(input_widgets)
    display(message_output) 
    display(map_output)
    
    # Initial map load.
    # For the very first load, if the default values are somehow invalid (they shouldn't be),
    # an error will show in message_output and map_output will be empty.
    # This is acceptable. If defaults are valid, map shows.
    handle_submit(None) 

elif 'map_data_cleaned_time' in locals() and map_data_cleaned_time.empty:
    print("Map controls not generated as there was no cleaned data to plot (map_data_cleaned_time is empty).")
else:
    print("Error: 'map_data_cleaned_time' DataFrame not found. Please run Phase 4C first.")

Full data year range available: 1472 - 1932
Enter start and end years, then press Enter in either box to update the map.


HBox(children=(IntText(value=1472, description='Start Year:', style=DescriptionStyle(description_width='initia…

Output()

Output()

# Phase 5: Save Enriched Metadata

In [14]:
if 'noscemus_metadata' in locals():
    output_filename = 'noscemus_metadata_with_primary_coords.csv'
    try:
        noscemus_metadata.to_csv(output_filename, index=False)
        print(f"Successfully saved the enriched metadata to: {output_filename}")
        print(f"The DataFrame has {len(noscemus_metadata)} rows and {len(noscemus_metadata.columns)} columns.")
        print("Columns include: 'primary_latitude', 'primary_longitude', 'primary_geoname_name', 'primary_country'.")
    except Exception as e:
        print(f"An error occurred while saving the DataFrame: {e}")
else:
    print("Error: 'noscemus_metadata' DataFrame not found. Cannot save.")

Successfully saved the enriched metadata to: noscemus_metadata_with_primary_coords.csv
The DataFrame has 975 rows and 30 columns.
Columns include: 'primary_latitude', 'primary_longitude', 'primary_geoname_name', 'primary_country'.
