In [None]:
# Phase 0A: Setup - Install Libraries
# Installs folium and geopandas, the two third-party packages imported in the
# next cell. The %pip magic (rather than !pip) targets the running kernel's
# environment, so the packages land where the imports will look for them.
%pip install folium geopandas

In [None]:
# Phase 0B: Setup - Import Libraries
import pandas as pd
import nltk
import re
import requests
import json
import io
import folium
import geopandas as gpd
import os
import time

In [None]:
# Phase 1A: Load Dataset
# Download the Noscemus metadata table and preview its first two rows.
NOSCEMUS_METADATA_URL = "https://raw.githubusercontent.com/CCS-ZCU/noscemus_ETF/refs/heads/master/data/metadata_table_long.csv"

noscemus_metadata = pd.read_csv(NOSCEMUS_METADATA_URL)
noscemus_metadata.head(2)

In [None]:
# Phase 1B: Inspect DataFrame Structure
# Print every column name so a candidate place column can be chosen below.
column_names = noscemus_metadata.columns.tolist()

print("\nColumns in noscemus_metadata:")
print(column_names)

In [None]:
# Phase 1C: Examine 'Place' Column
# Profile a single column: most frequent values, number of distinct values,
# number of missing values, and a handful of raw entries.
# Replace 'candidate_column_name' with a column name from the list above
candidate_column_name = 'Place' # <-- CHANGE THIS VALUE 

if candidate_column_name not in noscemus_metadata.columns:
    print(f"Column '{candidate_column_name}' not found in DataFrame. Please choose from the list printed above.")
else:
    candidate_values = noscemus_metadata[candidate_column_name]

    # Gather all statistics first, then report them in one place.
    most_frequent = candidate_values.value_counts().head(30)
    distinct_total = candidate_values.nunique()
    missing_total = candidate_values.isnull().sum()
    raw_examples = candidate_values.dropna().head(20).tolist()

    print(f"\nUnique values in '{candidate_column_name}':")
    print(most_frequent)
    print(f"\nNumber of unique values in '{candidate_column_name}': {distinct_total}")
    print(f"Number of missing values in '{candidate_column_name}': {missing_total}")
    print("\nSample raw entries (up to first 20 non-null):")
    print(raw_examples)

In [None]:
# Phase 2A: Define Place Splitting Logic and Expand Rows

def split_places(place_string):
    """Split a raw 'Place' entry into individual place names.

    The entry is split on commas, semicolons, ampersands and slashes
    (e.g. 'Liegnitz, Wrocław' or 'Place1 / Place2'). Each piece is stripped
    of surrounding whitespace and empty pieces are discarded. Other
    characters, such as the square brackets in '[Turku]', are kept as-is.

    Parameters
    ----------
    place_string : object
        Raw cell value. Anything that is not a ``str`` (NaN, None, numbers,
        lists, arrays, ...) is treated as "no place".

    Returns
    -------
    list of str
        The individual place names, in their original order; an empty list
        when there is nothing to split.
    """
    # Check the type first. Calling pd.isna() on a list/array returns an
    # array, and using that in a boolean context raises ValueError; NaN and
    # None are not strings, so this single test also covers missing values.
    if not isinstance(place_string, str):
        return []
    # A run of adjacent delimiters produces empty pieces, which the filter
    # below removes, so matching one delimiter at a time is sufficient.
    pieces = re.split(r'\s*[,;&/]\s*', place_string)
    return [piece.strip() for piece in pieces if piece.strip()]

# Build one output row per individual place. Rows whose 'Place' cannot be
# split into any name are kept once, with 'Place' set to None; rows that named
# several places additionally record the original combined string.
expanded_rows = []
if 'noscemus_metadata' not in locals():
    print("Error: noscemus_metadata DataFrame not found. Please load it first.")
    expanded_noscemus_metadata = pd.DataFrame() # Initialize empty to avoid errors later
else:
    print(f"Original number of rows in noscemus_metadata: {len(noscemus_metadata)}")
    for row_label, source_row in noscemus_metadata.iterrows():
        original_place_entry = source_row['Place']
        individual_places = split_places(original_place_entry)
        came_from_multi_entry = len(individual_places) > 1

        # `or [None]` keeps exactly one row for missing/empty place entries.
        for place_name in (individual_places or [None]):
            new_row = source_row.copy()
            new_row['Place'] = place_name
            if came_from_multi_entry:
                # Keep the combined string so expanded rows can be traced back.
                new_row['Original_Multi_Place_Entry'] = original_place_entry
            expanded_rows.append(new_row)

    expanded_noscemus_metadata = pd.DataFrame(expanded_rows)
    print(f"Number of rows after expansion: {len(expanded_noscemus_metadata)}")

    # Show a sample, focusing on a known multi-place entry to verify the split.
    print("\nSample of expanded_noscemus_metadata (showing some original multi-place entries):")
    preview_columns = ['id', 'Full title', 'Place']
    if 'Original_Multi_Place_Entry' not in expanded_noscemus_metadata.columns:
        print("\nGeneral head of expanded data (Original_Multi_Place_Entry column not created, likely no multi-place entries found):")
        print(expanded_noscemus_metadata[preview_columns].head())
    else:
        preview_with_origin = preview_columns + ['Original_Multi_Place_Entry']
        is_liegnitz_wroclaw = expanded_noscemus_metadata['Original_Multi_Place_Entry'] == 'Liegnitz, Wrocław'
        sample_multi = expanded_noscemus_metadata[is_liegnitz_wroclaw]
        if sample_multi.empty:
            print("Could not find 'Liegnitz, Wrocław' in Original_Multi_Place_Entry for sample.")
        else:
            print(sample_multi[preview_with_origin].head())
        # Show general head as well
        print("\nGeneral head of expanded data:")
        print(expanded_noscemus_metadata[preview_with_origin].head())


In [None]:
# Phase 2B: Extract Unique Place Names for Geocoding
# Ensure 'expanded_noscemus_metadata' is available and populated from the previous cell.
def normalize_place_series(raw_places):
    """Strip whitespace from place names while keeping missing entries missing.

    Calling ``.astype(str)`` on the whole column would turn None/NaN into the
    literal strings 'None'/'nan', which a later ``dropna()`` can no longer
    remove. Only the non-missing values are converted and stripped here.

    Parameters
    ----------
    raw_places : pandas.Series
        Column of place names, possibly containing None/NaN.

    Returns
    -------
    pandas.Series
        Same index as the input; missing entries are left untouched, all
        other entries are converted to ``str`` and stripped.
    """
    is_missing = raw_places.isna()
    # Series.where keeps the original value where the condition is True
    # (the missing entries) and takes the cleaned string everywhere else.
    return raw_places.where(is_missing, raw_places.astype(str).str.strip())


if 'expanded_noscemus_metadata' in locals() and not expanded_noscemus_metadata.empty:
    actual_publication_place_column = 'Place' # This is the column with individual place names
    places_series = normalize_place_series(expanded_noscemus_metadata[actual_publication_place_column])
    # dropna() is effective here because missing places were not stringified.
    unique_raw_places = places_series.dropna().unique()
    print(f"Found {len(unique_raw_places)} unique raw place mentions from '{actual_publication_place_column}' in the expanded data.")
    print("Sample of raw places (first 50 from expanded data):")
    print(unique_raw_places[:50])
else:
    print("Error: expanded_noscemus_metadata is not available or empty. Please ensure the 'Expand Multi-Location Rows' cell ran successfully.")
    # Initialize empty to prevent errors in subsequent cells, or handle appropriately
    places_series = pd.Series(dtype=str) 
    unique_raw_places = []

In [None]:
# Phase 3A: Geocode Unique Place Names

# Geonames account name used for API requests. It is read from the
# GEONAMES_USERNAME environment variable so the account does not have to be
# committed with the notebook; the previous hard-coded value remains the
# fallback so existing setups keep working.
GEONAMES_USERNAME = os.environ.get("GEONAMES_USERNAME") or "utaysi"
# CSV file (relative to the working directory) that caches geocoding results.
raw_geocoded_cache_file = 'raw_geocoded_places_cache.csv'

_GEONAMES_SEARCH_URL = "http://api.geonames.org/searchJSON"
_NO_COORDINATES = (None, None, None, None)


def _search_geonames(place_name_str, username, feature_class=None):
    """Run a single Geonames search and return ``(top_hit, failed)``.

    ``top_hit`` is the first result dict, or None when nothing matched.
    ``failed`` is True when the request itself went wrong (timeout, HTTP
    error, undecodable body, or an error reported by Geonames); the problem
    has then already been printed and the caller should not retry.
    """
    # Passing the query as `params` lets requests escape every value,
    # including the username.
    params = {"q": place_name_str, "maxRows": 1, "username": username}
    if feature_class is not None:
        params["featureClass"] = feature_class

    response = None
    try:
        response = requests.get(_GEONAMES_SEARCH_URL, params=params, timeout=15)
        response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
        data = response.json()
    except requests.exceptions.Timeout:
        print(f"API request timed out for {place_name_str}")
        return None, True
    except requests.exceptions.HTTPError as http_err:
        # `response` is the response of THIS request, so the excerpt shown
        # always belongs to the call that actually failed.
        body_excerpt = response.text[:200] if response is not None else ""
        print(f"HTTP error occurred for {place_name_str}: {http_err} - Response: {body_excerpt}...")
        return None, True
    except requests.exceptions.RequestException as req_err:
        print(f"API request failed for {place_name_str}: {req_err}")
        return None, True
    except ValueError as json_err: # Handles JSON decoding errors
        body_excerpt = response.text[:200] if response is not None else ""
        print(f"JSON decoding failed for {place_name_str} (response: {body_excerpt}...): {json_err}")
        return None, True

    if not isinstance(data, dict):
        print(f"Unexpected Geonames response for {place_name_str}: {str(data)[:200]}...")
        return None, True

    # Geonames reports problems such as an exceeded quota or an invalid
    # account in a 'status' object of an otherwise successful response.
    # Surface it instead of silently treating the place as "not found".
    status = data.get('status')
    if status:
        message = status.get('message') if isinstance(status, dict) else status
        print(f"Geonames API error for {place_name_str}: {message}")
        return None, True

    hits = data.get('geonames') or []
    return (hits[0] if hits else None), False


def get_coordinates(place_name, username):
    """Look up a place name on Geonames.

    The search first asks for populated places only (featureClass=P). If
    that yields nothing, it is repeated without the feature-class filter,
    which helps with broader terms or historical names not classed as 'P'.

    Parameters
    ----------
    place_name : object
        Place name to search for; converted with ``str()``. Empty, blank or
        missing values are not sent to the API.
    username : str
        Geonames account name used for the request.

    Returns
    -------
    tuple
        ``(latitude, longitude, geonames_name, country_name)`` of the top
        result, or ``(None, None, None, None)`` when the place was not found
        or the lookup failed. Failures are printed, never raised.
    """
    if not place_name or pd.isna(place_name):
        return _NO_COORDINATES
    place_name_str = str(place_name).strip()
    if not place_name_str:
        return _NO_COORDINATES

    for feature_class in ("P", None):
        top_result, failed = _search_geonames(place_name_str, username, feature_class)
        if failed:
            # A request-level failure would most likely repeat on the
            # fallback query, so stop here.
            return _NO_COORDINATES
        if top_result is None:
            continue
        try:
            return (float(top_result['lat']), float(top_result['lng']),
                    top_result.get('name'), top_result.get('countryName'))
        except (KeyError, TypeError, ValueError) as parse_err:
            print(f"Unexpected Geonames result for {place_name_str}: {parse_err!r}")
            return _NO_COORDINATES

    return _NO_COORDINATES

# Geocode every unique raw place name once and cache the result as CSV.
# If the cache file already exists it is loaded and no API call is made.
# NOTE(review): failed lookups (timeouts, quota errors) are cached as missing
# coordinates too; delete the cache file to retry them.
expected_cols = ['raw_place', 'geoname_name', 'latitude', 'longitude', 'country']

# String forms a missing place can take after `.astype(str)`: None becomes
# 'None' and NaN becomes 'nan'. These must not be sent to the API.
# NOTE(review): a real place literally spelled 'None' would also be skipped.
missing_place_markers = {"", "nan", "None", "<NA>"}

# Check for cached data first
if os.path.exists(raw_geocoded_cache_file):
    print(f"Loading raw geocoded data from cache: {raw_geocoded_cache_file}")
    raw_geocoded_df = pd.read_csv(raw_geocoded_cache_file)
    # Ensure all expected columns are present, fill with NA if not
    for col in expected_cols:
        if col not in raw_geocoded_df.columns:
            raw_geocoded_df[col] = pd.NA
else:
    print(f"No cache file found ({raw_geocoded_cache_file}). Geocoding raw places...")
    raw_geocoded_data = []
    if 'places_series' in locals():
        unique_raw_places = places_series.dropna().unique() # Use dropna() before unique()
        print(f"Geocoding {len(unique_raw_places)} unique raw place names...")
        for i, place in enumerate(unique_raw_places):
            if str(place).strip() in missing_place_markers:
                # Placeholder for a missing place: keep the row, skip the lookup.
                lat, lon, geoname_name, country = None, None, None, None
            else:
                if (i+1) % 20 == 0:
                    print(f"Processed {i+1}/{len(unique_raw_places)} places...")
                lat, lon, geoname_name, country = get_coordinates(place, GEONAMES_USERNAME)
                # 100ms delay to be respectful to the API; only needed after
                # an actual request.
                time.sleep(0.1)

            raw_geocoded_data.append({'raw_place': place, 
                                      'geoname_name': geoname_name, 
                                      'latitude': lat, 
                                      'longitude': lon, 
                                      'country': country})

        raw_geocoded_df = pd.DataFrame(raw_geocoded_data)
        raw_geocoded_df.to_csv(raw_geocoded_cache_file, index=False)
        print(f"Saved raw geocoded data to cache: {raw_geocoded_cache_file}")
    else:
        print("Error: 'places_series' not defined. Please ensure the previous cells (especially 'cline_extract_place_column') have been run.")
        raw_geocoded_df = pd.DataFrame(columns=expected_cols) # Create empty df

# Summarise the outcome: how many names resolved, and a sample of those that did not.
if not raw_geocoded_df.empty:
    print(f"\nSuccessfully geocoded {raw_geocoded_df['latitude'].notna().sum()} places out of {len(raw_geocoded_df)} unique raw names processed.")
    print("\nSample of geocoded data (first 20 rows):")
    print(raw_geocoded_df.head(20))
    
    print("\nPlaces that were NOT found by Geonames (sample):")
    not_found_sample = raw_geocoded_df[raw_geocoded_df['latitude'].isna()]['raw_place'].unique()
    print(not_found_sample[:20]) # Show up to 20 unique not found raw places
    print(f"Total unique raw places not found: {len(not_found_sample)}")
else:
    print("\nraw_geocoded_df is empty. Check for errors in previous steps or API calls.")

In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6bafbaa7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: folium in ./.venv/lib/python3.12/site-packages (0.19.6)\n",
      "Requirement already satisfied: geopandas in ./.venv/lib/python3.12/site-packages (1.0.1)\n",
      "Requirement already satisfied: branca>=0.6.0 in ./.venv/lib/python3.12/site-packages (from folium) (0.8.1)\n",
      "Requirement already satisfied: jinja2>=2.9 in ./.venv/lib/python3.12/site-packages (from folium) (3.1.6)\n",
      "Requirement already satisfied: numpy in ./.venv/lib/python3.12/site-packages (from folium) (1.26.4)\n",
      "Requirement already satisfied: requests in ./.venv/lib/python3.12/site-packages (from folium) (2.32.3)\n",
      "Requirement already satisfied: xyzservices in ./.venv/lib/python3.12/site-packages (from folium) (2025.4.0)\n",
      "Requirement already satisfied: pyogrio>=0.7.2 in ./.venv/lib/python3.12/site-packages (from geopandas) (0.11.0)\n",
      "Requirement already satisfied: packaging in ./.venv/lib/python3.12/site-packages (from geopandas) (25.0)\n",
      "Requirement already satisfied: pandas>=1.4.0 in ./.venv/lib/python3.12/site-packages (from geopandas) (2.2.3)\n",
      "Requirement already satisfied: pyproj>=3.3.0 in ./.venv/lib/python3.12/site-packages (from geopandas) (3.7.1)\n",
      "Requirement already satisfied: shapely>=2.0.0 in ./.venv/lib/python3.12/site-packages (from geopandas) (2.1.1)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib/python3.12/site-packages (from jinja2>=2.9->folium) (3.0.2)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2.9.0.post0)\n",
      "Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2025.2)\n",
      "Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2025.2)\n",
      "Requirement already satisfied: certifi in ./.venv/lib/python3.12/site-packages (from pyogrio>=0.7.2->geopandas) (2025.4.26)\n",
      "Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=1.4.0->geopandas) (1.17.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in ./.venv/lib/python3.12/site-packages (from requests->folium) (3.4.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in ./.venv/lib/python3.12/site-packages (from requests->folium) (3.10)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in ./.venv/lib/python3.12/site-packages (from requests->folium) (2.4.0)\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# Phase 0A: Setup - Install Libraries\n",
    "%pip install folium geopandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "361b2bdaa3269606",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-01T14:17:07.109340Z",
     "start_time": "2025-05-01T14:17:07.105530Z"
    }
   },
   "outputs": [],
   "source": [
    "# Phase 0B: Setup - Import Libraries\n",
    "import pandas as pd\n",
    "import nltk\n",
    "import re\n",
    "import requests\n",
    "import json\n",
    "import io\n",
    "import folium\n",
    "import geopandas as gpd\n",
    "import os\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9741c876",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Author</th>\n",
       "      <th>Full title</th>\n",
       "      <th>In</th>\n",
       "      <th>Year</th>\n",
       "      <th>Place</th>\n",
       "      <th>Publisher/Printer</th>\n",
       "      <th>Era</th>\n",
       "      <th>Form/Genre</th>\n",
       "      <th>Discipline/Content</th>\n",
       "      <th>Original</th>\n",
       "      <th>...</th>\n",
       "      <th>Of interest to</th>\n",
       "      <th>Transkribus text available</th>\n",
       "      <th>Written by</th>\n",
       "      <th>Library and Signature</th>\n",
       "      <th>ids</th>\n",
       "      <th>id</th>\n",
       "      <th>date_min</th>\n",
       "      <th>date_max</th>\n",
       "      <th>filename</th>\n",
       "      <th>file_year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Achrelius, Daniel</td>\n",
       "      <td>Scientiarum magnes recitatus publice anno 1690...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1690</td>\n",
       "      <td>[Turku]</td>\n",
       "      <td>Wall</td>\n",
       "      <td>17th century</td>\n",
       "      <td>Oration</td>\n",
       "      <td>Mathematics, Astronomy/Astrology/Cosmography, ...</td>\n",
       "      <td>Scientiarum magnes(Google Books)</td>\n",
       "      <td>...</td>\n",
       "      <td>MK, JL</td>\n",
       "      <td>Yes</td>\n",
       "      <td>IT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[705665]</td>\n",
       "      <td>705665</td>\n",
       "      <td>1690.0</td>\n",
       "      <td>1690.0</td>\n",
       "      <td>Achrelius,_Daniel_-_Scientiarum_magnes__Turku_...</td>\n",
       "      <td>1690.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Acidalius, Valens</td>\n",
       "      <td>Ad Iordanum Brunum Nolanum, Italum</td>\n",
       "      <td>Poematum Iani Lernutii, Iani Gulielmi, Valenti...</td>\n",
       "      <td>1603</td>\n",
       "      <td>Liegnitz, Wrocław</td>\n",
       "      <td>Albert, David</td>\n",
       "      <td>17th century</td>\n",
       "      <td>Panegyric poem</td>\n",
       "      <td>Astronomy/Astrology/Cosmography</td>\n",
       "      <td>Ad Iordanum Brunum (1603)(CAMENA)Ad Iordanum B...</td>\n",
       "      <td>...</td>\n",
       "      <td>MK, IT</td>\n",
       "      <td>Yes</td>\n",
       "      <td>MK</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[801745]</td>\n",
       "      <td>801745</td>\n",
       "      <td>1603.0</td>\n",
       "      <td>1603.0</td>\n",
       "      <td>Janus_Lernutius_et_al__-_Poemata__Liegnitz_160...</td>\n",
       "      <td>1603.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              Author                                         Full title  \\\n",
       "0  Achrelius, Daniel  Scientiarum magnes recitatus publice anno 1690...   \n",
       "1  Acidalius, Valens                 Ad Iordanum Brunum Nolanum, Italum   \n",
       "\n",
       "                                                  In  Year              Place  \\\n",
       "0                                                NaN  1690            [Turku]   \n",
       "1  Poematum Iani Lernutii, Iani Gulielmi, Valenti...  1603  Liegnitz, Wrocław   \n",
       "\n",
       "  Publisher/Printer           Era      Form/Genre  \\\n",
       "0              Wall  17th century         Oration   \n",
       "1     Albert, David  17th century  Panegyric poem   \n",
       "\n",
       "                                  Discipline/Content  \\\n",
       "0  Mathematics, Astronomy/Astrology/Cosmography, ...   \n",
       "1                    Astronomy/Astrology/Cosmography   \n",
       "\n",
       "                                            Original  ... Of interest to  \\\n",
       "0                   Scientiarum magnes(Google Books)  ...         MK, JL   \n",
       "1  Ad Iordanum Brunum (1603)(CAMENA)Ad Iordanum B...  ...         MK, IT   \n",
       "\n",
       "  Transkribus text available Written by Library and Signature       ids  \\\n",
       "0                        Yes         IT                   NaN  [705665]   \n",
       "1                        Yes         MK                   NaN  [801745]   \n",
       "\n",
       "       id date_min date_max  \\\n",
       "0  705665   1690.0   1690.0   \n",
       "1  801745   1603.0   1603.0   \n",
       "\n",
       "                                            filename file_year  \n",
       "0  Achrelius,_Daniel_-_Scientiarum_magnes__Turku_...    1690.0  \n",
       "1  Janus_Lernutius_et_al__-_Poemata__Liegnitz_160...    1603.0  \n",
       "\n",
       "[2 rows x 26 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Phase 1A: Data Exploration\n",
    "# Display 2 sample DataFrame rows\n",
    "noscemus_metadata = pd.read_csv(\"https://raw.githubusercontent.com/CCS-ZCU/noscemus_ETF/refs/heads/master/data/metadata_table_long.csv\")\n",
    "noscemus_metadata.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ca89376f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Columns in noscemus_metadata:\n",
      "['Author', 'Full title', 'In', 'Year', 'Place', 'Publisher/Printer', 'Era', 'Form/Genre', 'Discipline/Content', 'Original', 'Digital sourcebook', 'Description', 'References', 'Cited in', 'How to cite this entry', 'Internal notes', 'Of interest to', 'Transkribus text available', 'Written by', 'Library and Signature', 'ids', 'id', 'date_min', 'date_max', 'filename', 'file_year']\n"
     ]
    }
   ],
   "source": [
    "# Phase 1B: Data Exploration\n",
    "# Display DataFrame Columns\n",
    "\n",
    "print(\"\\nColumns in noscemus_metadata:\")\n",
    "print(noscemus_metadata.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "13a6aa9a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Unique values in 'Place':\n",
      "Place\n",
      "Paris                          69\n",
      "Amsterdam                      49\n",
      "Basel                          48\n",
      "Venice                         48\n",
      "London                         40\n",
      "Leipzig                        36\n",
      "Rome                           34\n",
      "Zurich                         33\n",
      "Leiden                         29\n",
      "Frankfurt am Main              26\n",
      "Göttingen                      25\n",
      "Tübingen                       25\n",
      "Nuremberg                      21\n",
      "Bologna                        21\n",
      "Strasbourg                     20\n",
      "Lyon                           19\n",
      "Wittenberg                     17\n",
      "Innsbruck                      16\n",
      "Cologne                        13\n",
      "Padua                          13\n",
      "Naples                         12\n",
      "Florence                       12\n",
      "Leiden, Stockholm, Erlangen    10\n",
      "Halle                          10\n",
      "Antwerp                        10\n",
      "Oxford                          8\n",
      "Copenhagen                      8\n",
      "Vienna                          8\n",
      "Bern                            7\n",
      "Augsburg                        7\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Number of unique values in 'Place': 173\n",
      "Number of missing values in 'Place': 4\n",
      "\n",
      "Sample raw entries (up to first 20 non-null):\n",
      "['[Turku]', 'Liegnitz, Wrocław', 'Salamanca', 'Heidelberg', 'London', 'Oxford', 'Lund', 'Strasbourg', 'Basel', 'Basel', 'Basel', 'Basel', 'Basel', 'Bologna', 'Leipzig', 'Zurich', 'Venice', 'Rome', 'Herborn', 'Frankfurt am Main']\n"
     ]
    }
   ],
   "source": [
    "# Phase 1C: Data Exploration\n",
    "# Inspect Potential Columns\n",
    "# Replace 'candidate_column_name' with a column name from the list above\n",
    "candidate_column_name = 'Place' # <-- CHANGE THIS VALUE \n",
    "\n",
    "if candidate_column_name in noscemus_metadata.columns:\n",
    "    print(f\"\\nUnique values in '{candidate_column_name}':\")\n",
    "    # Display a sample of unique values and their counts\n",
    "    print(noscemus_metadata[candidate_column_name].value_counts().head(30))\n",
    "    print(f\"\\nNumber of unique values in '{candidate_column_name}': {noscemus_metadata[candidate_column_name].nunique()}\")\n",
    "    print(f\"Number of missing values in '{candidate_column_name}': {noscemus_metadata[candidate_column_name].isnull().sum()}\")\n",
    "    # Show some raw examples of the data in this column\n",
    "    print(\"\\nSample raw entries (up to first 20 non-null):\")\n",
    "    print(noscemus_metadata[candidate_column_name].dropna().head(20).tolist())\n",
    "else:\n",
    "    print(f\"Column '{candidate_column_name}' not found in DataFrame. Please choose from the list printed above.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "cline_expand_multi_location_rows",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original number of rows in noscemus_metadata: 975\n",
      "Number of rows after expansion: 1030\n",
      "\n",
      "Sample of expanded_noscemus_metadata (showing some original multi-place entries):\n",
      "       id                          Full title     Place  \\\n",
      "1  801745  Ad Iordanum Brunum Nolanum, Italum  Liegnitz   \n",
      "1  801745  Ad Iordanum Brunum Nolanum, Italum   Wrocław   \n",
      "\n",
      "  Original_Multi_Place_Entry  \n",
      "1          Liegnitz, Wrocław  \n",
      "1          Liegnitz, Wrocław  \n",
      "\n",
      "General head of expanded data:\n",
      "       id                                         Full title       Place  \\\n",
      "0  705665  Scientiarum magnes recitatus publice anno 1690...     [Turku]   \n",
      "1  801745                 Ad Iordanum Brunum Nolanum, Italum    Liegnitz   \n",
      "1  801745                 Ad Iordanum Brunum Nolanum, Italum     Wrocław   \n",
      "2  713323  De natura novi orbis libri duo et De promulgat...   Salamanca   \n",
      "3  693148  Vitae Germanorum medicorum, qui saeculo superi...  Heidelberg   \n",
      "\n",
      "  Original_Multi_Place_Entry  \n",
      "0                        NaN  \n",
      "1          Liegnitz, Wrocław  \n",
      "1          Liegnitz, Wrocław  \n",
      "2                        NaN  \n",
      "3                        NaN  \n"
     ]
    }
   ],
   "source": [
    "# Phase 1 (New Plan): Expand Multi-Location Rows\n",
    "\n",
    "def split_places(place_string):\n",
    "    if pd.isna(place_string) or not isinstance(place_string, str):\n",
    "        return [] # Return empty list for NaN or non-string input\n",
    "    # Split by comma, semicolon, or ampersand. Also handle cases like 'Place1 / Place2'.\n",
    "    # Regex looks for one or more delimiters, surrounded by optional whitespace.\n",
    "    places = re.split(r'\\s*[,;&/]\\s*', place_string)\n",
    "    # Clean up each individual place name: strip whitespace, remove empty strings\n",
    "    return [p.strip() for p in places if p and p.strip()] \n",
    "\n",
    "expanded_rows = []\n",
    "if 'noscemus_metadata' in locals():\n",
    "    print(f\"Original number of rows in noscemus_metadata: {len(noscemus_metadata)}\")\n",
    "    for index, row in noscemus_metadata.iterrows():\n",
    "        original_place_entry = row['Place']\n",
    "        individual_places = split_places(original_place_entry)\n",
    "        \n",
    "        if not individual_places: # Handles NaN, empty strings, or strings that become empty after split\n",
    "            # Keep the row as is, but ensure 'Place' is None or a consistent empty marker if it was NaN/empty\n",
    "            new_row = row.copy()\n",
    "            new_row['Place'] = None # Or np.nan, or an empty string, depending on desired handling for mapping\n",
    "            expanded_rows.append(new_row)\n",
    "        elif len(individual_places) == 1:\n",
    "            # Single place, just copy the row with the cleaned single place name\n",
    "            new_row = row.copy()\n",
    "            new_row['Place'] = individual_places[0]\n",
    "            expanded_rows.append(new_row)\n",
    "        else:\n",
    "            # Multiple places, create a new row for each\n",
    "            for place_name in individual_places:\n",
    "                new_row = row.copy()\n",
    "                new_row['Place'] = place_name\n",
    "                # Add original multi-place string for reference if needed\n",
    "                new_row['Original_Multi_Place_Entry'] = original_place_entry \n",
    "                expanded_rows.append(new_row)\n",
    "    \n",
    "    expanded_noscemus_metadata = pd.DataFrame(expanded_rows)\n",
    "    print(f\"Number of rows after expansion: {len(expanded_noscemus_metadata)}\")\n",
    "\n",
    "    # Display a sample, especially focusing on some known multi-place entries to verify\n",
    "    print(\"\\nSample of expanded_noscemus_metadata (showing some original multi-place entries):\")\n",
    "    # Example: Find rows originating from 'Liegnitz, Wrocław' if it exists\n",
    "    if 'Original_Multi_Place_Entry' in expanded_noscemus_metadata.columns:\n",
    "        sample_multi = expanded_noscemus_metadata[expanded_noscemus_metadata['Original_Multi_Place_Entry'] == 'Liegnitz, Wrocław']\n",
    "        if not sample_multi.empty:\n",
    "            print(sample_multi[['id', 'Full title', 'Place', 'Original_Multi_Place_Entry']].head())\n",
    "        else:\n",
    "            print(\"Could not find 'Liegnitz, Wrocław' in Original_Multi_Place_Entry for sample.\")\n",
    "        # Show general head as well\n",
    "        print(\"\\nGeneral head of expanded data:\")\n",
    "        print(expanded_noscemus_metadata[['id', 'Full title', 'Place', 'Original_Multi_Place_Entry' if 'Original_Multi_Place_Entry' in expanded_noscemus_metadata.columns else 'Place']].head())\n",
    "    else:\n",
    "        print(\"\\nGeneral head of expanded data (Original_Multi_Place_Entry column not created, likely no multi-place entries found):\")\n",
    "        print(expanded_noscemus_metadata[['id', 'Full title', 'Place']].head())\n",
    "else:\n",
    "    print(\"Error: noscemus_metadata DataFrame not found. Please load it first.\")\n",
    "    expanded_noscemus_metadata = pd.DataFrame() # Initialize empty to avoid errors later\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cline_extract_place_column",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 166 unique raw place mentions from 'Place' in the expanded data.\n",
      "Sample of raw places (first 50 from expanded data):\n",
      "['[Turku]' 'Liegnitz' 'Wrocław' 'Salamanca' 'Heidelberg' 'London' 'Oxford'\n",
      " 'Lund' 'Strasbourg' 'Basel' 'Bologna' 'Leipzig' 'Zurich' 'Venice' 'Rome'\n",
      " 'Herborn' 'Frankfurt am Main' 'Turin' 'Florence' 'Alcalá de Henares'\n",
      " 'Leiden' 'Innsbruck' 'Westminster Abbey' 'Paris' 'Cambridge' '[Landshut]'\n",
      " '[Ingolstadt]' 'Milan' 'Bergamo' 'Stuttgart' 'Perugia' 'Lyon' 's.l.'\n",
      " 'Amsterdam' '[Wittenberg]' 'Copenhagen' 'Padua' '[Padua]' 'Rimini'\n",
      " 'Büdingen' 'Königsberg' 'Uppsala' 'Stockholm' 'Turku' 'Desau' 'Würzburg'\n",
      " 'Saint Petersburg' 'Antwerp' 'Graz' 'Aachen']\n"
     ]
    }
   ],
   "source": [
    "# Phase 1: Data Extraction - Extract 'Place' column\n",
    "# Requires 'expanded_noscemus_metadata' to be populated by the 'Expand Multi-Location Rows' cell.\n",
    "if 'expanded_noscemus_metadata' in locals() and not expanded_noscemus_metadata.empty:\n",
    "    actual_publication_place_column = 'Place' # This is the column with individual place names\n",
    "    # Drop missing values BEFORE casting to str: astype(str) turns NaN into the literal\n",
    "    # string 'nan', which a later dropna() can no longer remove.\n",
    "    places_series = expanded_noscemus_metadata[actual_publication_place_column].dropna().astype(str).str.strip()\n",
    "    places_series = places_series[places_series != ''] # Discard entries that were only whitespace\n",
    "    unique_raw_places = places_series.unique()\n",
    "    print(f\"Found {len(unique_raw_places)} unique raw place mentions from '{actual_publication_place_column}' in the expanded data.\")\n",
    "    print(\"Sample of raw places (first 50 from expanded data):\")\n",
    "    print(unique_raw_places[:50])\n",
    "else:\n",
    "    print(\"Error: expanded_noscemus_metadata is not available or empty. Please ensure the 'Expand Multi-Location Rows' cell ran successfully.\")\n",
    "    # Fall back to empty containers of the same types as above so subsequent cells still run\n",
    "    places_series = pd.Series(dtype=str)\n",
    "    unique_raw_places = places_series.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cline_geocode_raw_places",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No cache file found (raw_geocoded_places_cache.csv). Geocoding raw places...\n",
      "Geocoding 166 unique raw place names...\n",
      "Processed 20/166 places...\n",
      "Processed 40/166 places...\n",
      "Processed 60/166 places...\n",
      "Processed 80/166 places...\n",
      "Processed 100/166 places...\n",
      "Processed 120/166 places...\n",
      "Processed 140/166 places...\n",
      "Processed 160/166 places...\n",
      "Saved raw geocoded data to cache: raw_geocoded_places_cache.csv\n",
      "\n",
      "Successfully geocoded 162 places out of 166 unique raw names processed.\n",
      "\n",
      "Sample of geocoded data (first 20 rows):\n",
      "            raw_place       geoname_name  latitude  longitude         country\n",
      "0             [Turku]              Turku  60.45148   22.26869         Finland\n",
      "1            Liegnitz            Legnica  51.21006   16.16190          Poland\n",
      "2             Wrocław            Wroclaw  51.10000   17.03333          Poland\n",
      "3           Salamanca          Salamanca  40.96882   -5.66388           Spain\n",
      "4          Heidelberg         Heidelberg  49.40768    8.69079         Germany\n",
      "5              London             London  51.50853   -0.12574  United Kingdom\n",
      "6              Oxford             Oxford  39.50700  -84.74523   United States\n",
      "7                Lund               Lund  55.70584   13.19321          Sweden\n",
      "8          Strasbourg         Strasbourg  48.58392    7.74553          France\n",
      "9               Basel              Basel  47.55839    7.57327     Switzerland\n",
      "10            Bologna            Bologna  44.49381   11.33875           Italy\n",
      "11            Leipzig            Leipzig  51.33962   12.37129         Germany\n",
      "12             Zurich             Zurich  47.36667    8.55000     Switzerland\n",
      "13             Venice             Venice  45.43713   12.33265           Italy\n",
      "14               Rome               Rome  41.89193   12.51133           Italy\n",
      "15            Herborn            Herborn  49.74167    6.42778      Luxembourg\n",
      "16  Frankfurt am Main  Frankfurt am Main  50.11552    8.68417         Germany\n",
      "17              Turin              Turin  45.07049    7.68682           Italy\n",
      "18           Florence           Florence  43.77925   11.24626           Italy\n",
      "19  Alcalá de Henares  Alcalá de Henares  40.48205   -3.35996           Spain\n",
      "\n",
      "Places that were NOT found by Geonames (sample):\n",
      "['not indicated' 'Philadelphia (fictive)' 'Venice [Modena]'\n",
      " 'Neostadii in Palatinate (Neustadt an der Weinstraße)']\n",
      "Total unique raw places not found: 4\n"
     ]
    }
   ],
   "source": [
    "# Phase 2: Geocode Raw Publication Places\n",
    "\n",
    "# GeoNames account used for the API requests. Set the GEONAMES_USERNAME environment variable\n",
    "# to use your own account; the previously hard-coded username remains the fallback.\n",
    "GEONAMES_USERNAME = os.environ.get(\"GEONAMES_USERNAME\", \"utaysi\")\n",
    "# CSV file (relative to the working directory) where geocoding results are cached between runs\n",
    "raw_geocoded_cache_file = 'raw_geocoded_places_cache.csv'\n",
    "\n",
    "def get_coordinates(place_name, username, extra_params=None):\n",
    "    \"\"\"Look up a place name with the GeoNames search API.\n",
    "\n",
    "    Populated places (featureClass=P) are searched first; if that returns no hit, the\n",
    "    search is repeated without the feature-class restriction.\n",
    "\n",
    "    Args:\n",
    "        place_name: Name to search for. None, NaN and blank strings are not queried.\n",
    "        username: GeoNames account name sent with every request.\n",
    "        extra_params: Optional dict of additional query parameters merged into every\n",
    "            request, e.g. a geographic filter to disambiguate homonymous towns\n",
    "            (see the GeoNames search documentation for the supported names).\n",
    "\n",
    "    Returns:\n",
    "        Tuple (latitude, longitude, geoname_name, country). All four values are None\n",
    "        when the input is empty, nothing was found, or the lookup failed (failures\n",
    "        are printed rather than raised).\n",
    "    \"\"\"\n",
    "    not_found = (None, None, None, None)\n",
    "    # Test pd.isna() first: truth-testing pd.NA raises a TypeError\n",
    "    if pd.isna(place_name) or not str(place_name).strip():\n",
    "        return not_found\n",
    "    place_name_str = str(place_name).strip()\n",
    "\n",
    "    base_params = dict(extra_params or {})\n",
    "    base_params.update({'q': place_name_str, 'maxRows': 1, 'username': username})\n",
    "    # Initial attempt prioritizes populated places (featureClass=P); the fallback drops the\n",
    "    # restriction to help with broader terms or historical names that are not classed as 'P'\n",
    "    attempts = [{**base_params, 'featureClass': 'P'}, base_params]\n",
    "\n",
    "    try:\n",
    "        for params in attempts:\n",
    "            # Let requests build and URL-encode the query string\n",
    "            response = requests.get('http://api.geonames.org/searchJSON', params=params, timeout=15)\n",
    "            response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)\n",
    "            try:\n",
    "                data = response.json()\n",
    "            except ValueError as json_err: # Handles JSON decoding errors\n",
    "                print(f\"JSON decoding failed for {place_name_str} (response: {response.text[:200]}...): {json_err}\")\n",
    "                return not_found\n",
    "            # GeoNames can report failures (e.g. exhausted quota, unknown user) inside the JSON\n",
    "            # body; surface them instead of treating the place as simply not found\n",
    "            status = data.get('status')\n",
    "            if status:\n",
    "                print(f\"Geonames API error for {place_name_str}: {status.get('message')} (code {status.get('value')})\")\n",
    "                return not_found\n",
    "            results = data.get('geonames') or []\n",
    "            if results:\n",
    "                top_result = results[0]\n",
    "                try:\n",
    "                    return float(top_result['lat']), float(top_result['lng']), top_result.get('name'), top_result.get('countryName')\n",
    "                except (KeyError, TypeError, ValueError) as parse_err:\n",
    "                    print(f\"Unexpected result format for {place_name_str}: {parse_err}\")\n",
    "                    return not_found\n",
    "        # Neither attempt produced a hit\n",
    "        return not_found\n",
    "    except requests.exceptions.Timeout:\n",
    "        print(f\"API request timed out for {place_name_str}\")\n",
    "        return not_found\n",
    "    except requests.exceptions.HTTPError as http_err:\n",
    "        # 'response' is the request that just failed, since raise_for_status() raised on it\n",
    "        print(f\"HTTP error occurred for {place_name_str}: {http_err} - Response: {response.text[:200]}...\")\n",
    "        return not_found\n",
    "    except requests.exceptions.RequestException as req_err:\n",
    "        print(f\"API request failed for {place_name_str}: {req_err}\")\n",
    "        return not_found\n",
    "\n",
    "# Start from the cached results (if any) so that only names missing from the cache hit the API.\n",
    "# Names already cached as not found are not retried; delete their rows (or the file) to retry.\n",
    "expected_cols = ['raw_place', 'geoname_name', 'latitude', 'longitude', 'country']\n",
    "if os.path.exists(raw_geocoded_cache_file):\n",
    "    print(f\"Loading raw geocoded data from cache: {raw_geocoded_cache_file}\")\n",
    "    raw_geocoded_df = pd.read_csv(raw_geocoded_cache_file)\n",
    "    # Ensure all expected columns are present, fill with NA if not\n",
    "    for col in expected_cols:\n",
    "        if col not in raw_geocoded_df.columns:\n",
    "            raw_geocoded_df[col] = pd.NA\n",
    "else:\n",
    "    print(f\"No cache file found ({raw_geocoded_cache_file}). Geocoding raw places...\")\n",
    "    raw_geocoded_df = pd.DataFrame(columns=expected_cols) # Create empty df\n",
    "\n",
    "if 'places_series' in locals():\n",
    "    unique_raw_places = places_series.dropna().unique() # Use dropna() before unique()\n",
    "    cached_places = set(raw_geocoded_df['raw_place'].dropna().astype(str))\n",
    "    # Skip blank entries and the literal string 'nan' (a missing value that was cast to str),\n",
    "    # plus every name the cache already covers\n",
    "    places_to_geocode = [\n",
    "        place for place in unique_raw_places\n",
    "        if str(place).strip() not in ('', 'nan') and str(place) not in cached_places\n",
    "    ]\n",
    "    if places_to_geocode:\n",
    "        print(f\"Geocoding {len(places_to_geocode)} unique raw place names...\")\n",
    "        new_rows = []\n",
    "        for i, place in enumerate(places_to_geocode, start=1):\n",
    "            lat, lon, geoname_name, country = get_coordinates(place, GEONAMES_USERNAME)\n",
    "            new_rows.append({'raw_place': place,\n",
    "                             'geoname_name': geoname_name,\n",
    "                             'latitude': lat,\n",
    "                             'longitude': lon,\n",
    "                             'country': country})\n",
    "            if i % 20 == 0:\n",
    "                print(f\"Processed {i}/{len(places_to_geocode)} places...\")\n",
    "            time.sleep(0.1) # 100ms delay to be respectful to the API\n",
    "\n",
    "        # Rebuild from plain records (cached rows first) and refresh the cache file; nothing is\n",
    "        # written when there was nothing to geocode, so an empty, unreadable CSV is never created\n",
    "        raw_geocoded_df = pd.DataFrame(raw_geocoded_df.to_dict('records') + new_rows)\n",
    "        raw_geocoded_df.to_csv(raw_geocoded_cache_file, index=False)\n",
    "        print(f\"Saved raw geocoded data to cache: {raw_geocoded_cache_file}\")\n",
    "else:\n",
    "    print(\"Error: 'places_series' not defined. Please ensure the previous cells (especially 'cline_extract_place_column') have been run.\")\n",
    "\n",
    "# Summarize how many names were resolved and list those that were not\n",
    "if not raw_geocoded_df.empty:\n",
    "    print(f\"\\nSuccessfully geocoded {raw_geocoded_df['latitude'].notna().sum()} places out of {len(raw_geocoded_df)} unique raw names processed.\")\n",
    "    print(\"\\nSample of geocoded data (first 20 rows):\")\n",
    "    print(raw_geocoded_df.head(20))\n",
    "\n",
    "    print(\"\\nPlaces that were NOT found by Geonames (sample):\")\n",
    "    not_found_sample = raw_geocoded_df.loc[raw_geocoded_df['latitude'].isna(), 'raw_place'].unique()\n",
    "    print(not_found_sample[:20]) # Show up to 20 unique not found raw places\n",
    "    print(f\"Total unique raw places not found: {len(not_found_sample)}\")\n",
    "else:\n",
    "    print(\"\\nraw_geocoded_df is empty. Check for errors in previous steps or API calls.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6002ab86",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
