In [None]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
from datetime import datetime
import concurrent.futures
import os
from tqdm import tqdm
import json
from functools import lru_cache
import openpyxl
from requests.packages.urllib3.util.retry import Retry

In [None]:
def create_requests_session(retries=3):
    """
    Build a requests session that automatically retries failed GET requests.

    Args:
        retries (int): Total number of retries allowed per request.

    Returns:
        requests.Session: Session with a retrying adapter mounted for both
        the ``http://`` and ``https://`` schemes.
    """
    # One adapter carries the retry policy and is shared by both schemes.
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            backoff_factor=1,
            # Retry on rate limiting (429) and on server-side errors only.
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    return http

# Cache results to avoid redundant API calls for similar coordinates
@lru_cache(maxsize=1000)
def _get_location_info_rounded(lat_rounded, lon_rounded, username, session):
    """Memoized GeoNames lookup keyed on already-rounded coordinates."""
    return get_location_info(lat_rounded, lon_rounded, username, session)


def get_location_info_cached(lat, lon, username, session):
    """
    Cached version of location lookup - rounds coordinates to reduce redundant calls

    The coordinates are rounded *before* they reach the memoized helper, so
    two points that round to the same value share a single cache entry (and a
    single API request). Rounding inside the cached function would key the
    cache on the raw coordinates and nearby points would never hit it.

    Args:
        lat: Latitude; anything accepted by ``float()``.
        lon: Longitude; anything accepted by ``float()``.
        username (str): GeoNames API username.
        session: Session object forwarded to ``get_location_info``. It is part
            of the cache key, so it must be hashable.

    Returns:
        Whatever ``get_location_info`` returns for the rounded coordinates.
        NOTE: every result is cached, including error strings.
    """
    # Round to 2 decimal places (about 1km precision) to increase cache hits
    lat_rounded = round(float(lat), 2)
    lon_rounded = round(float(lon), 2)

    return _get_location_info_rounded(lat_rounded, lon_rounded, username, session)


# Keep the lru_cache introspection helpers reachable through the public name,
# since callers use get_location_info_cached.cache_info().
get_location_info_cached.cache_info = _get_location_info_rounded.cache_info
get_location_info_cached.cache_clear = _get_location_info_rounded.cache_clear

def get_location_info(lat, lon, username, session):
    """
    Fetch location information from GeoNames API
    Special handling for England and Scotland to use adminName2 instead of adminName1

    Args:
        lat: Latitude, sent to GeoNames as the ``lat`` parameter.
        lon: Longitude, sent to GeoNames as the ``lng`` parameter.
        username (str): GeoNames API username.
        session: Object exposing a requests-style
            ``get(url, params=..., timeout=...)`` method.

    Returns:
        str: One of
            * the admin region name of the nearest feature,
            * ``'Unknown'`` when that name is missing or empty,
            * ``'Not Found'`` when GeoNames returns no feature,
            * ``'Error: <message>'`` when the request fails or GeoNames
              reports an error.
        This function never raises; failures are reported in the return value.
    """
    base_url = "http://api.geonames.org/findNearbyJSON"
    params = {
        'lat': lat,
        'lng': lon,
        'username': username,
        'style': 'FULL',
        'maxRows': 1
    }
    uk_nations = ('England', 'Scotland')

    try:
        response = session.get(base_url, params=params, timeout=10)
        response.raise_for_status()

        data = response.json()
        if 'geonames' in data and data['geonames']:
            location = data['geonames'][0]

            # GeoNames reports the UK nations as adminName1 under countryName
            # "United Kingdom", so checking countryName alone never matches.
            # The countryName test is kept for backward compatibility.
            admin1 = location.get('adminName1', '')
            is_uk_nation = (
                location.get('countryName', '') in uk_nations
                or (location.get('countryCode') == 'GB' and admin1 in uk_nations)
            )
            if is_uk_nation:
                # For England and Scotland, use adminName2 (county/council area)
                return location.get('adminName2') or 'Unknown'
            # For all other countries, use adminName1 (state/province)
            return admin1 or 'Unknown'

        # GeoNames signals API-level failures (bad username, hourly limit
        # exceeded, ...) with HTTP 200 and a "status" object; surface them
        # instead of reporting a misleading 'Not Found'.
        if 'status' in data:
            message = data['status'].get('message', 'unknown GeoNames error')
            return f'Error: {message}'

        return 'Not Found'

    except Exception as e:
        # Deliberate best-effort: one failed lookup must not abort a long run.
        return f'Error: {str(e)}'

def process_locations(input_file, username, batch_size=1000):
    """
    Process locations with missing State/Province/Territory data

    Reads the spreadsheet, looks up an admin region for every row whose
    'State/Province/Territory' is empty and whose 'latitude'/'longitude' are
    present, and stores the result in a 'state test' column.

    Files written next to the input (all share one timestamp):
        * ``<stem>_backup_<ts>.csv``    - copy of the data as loaded
        * ``<stem>_progress_<ts>.csv``  - snapshot after each batch and on
          interruption
        * ``<stem>_completed_<ts>.xlsx`` - final result

    Args:
        input_file (str): Path to input Excel file (read with ``pd.read_excel``)
        username (str): GeoNames API username
        batch_size (int): Number of records to process per hour

    Returns:
        pandas.DataFrame: The loaded data with the 'state test' column filled
        for the processed rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # Fail fast instead of hitting a ZeroDivisionError in the middle of a run.
        raise ValueError("batch_size must be at least 1")

    # Load the data
    print(f"Loading data from {input_file}")
    df = pd.read_excel(input_file)

    # All output files share one stem and timestamp. splitext only strips a
    # real extension, so dots in directory names are left alone.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    file_stem = os.path.splitext(input_file)[0]
    backup_file = f"{file_stem}_backup_{timestamp}.csv"
    progress_file = f"{file_stem}_progress_{timestamp}.csv"
    output_file = f"{file_stem}_completed_{timestamp}.xlsx"

    # Create backup of original file
    df.to_csv(backup_file, index=False)
    print(f"Created backup at {backup_file}")

    # Add new column for the API results
    if 'state test' not in df.columns:
        df['state test'] = None

    # Filter rows with empty state/province
    mask = df['State/Province/Territory'].isna() | (df['State/Province/Territory'] == '')
    rows_to_process = df[mask].copy()

    # Skip rows with missing coordinates
    rows_to_process = rows_to_process.dropna(subset=['latitude', 'longitude'])

    total_rows = len(rows_to_process)
    if total_rows == 0:
        print("No rows to process")
        return df

    print(f"Found {total_rows} rows with missing State/Province/Territory")

    # Create a persistent session for all requests
    session = create_requests_session()

    # Process in batches with rate limiting
    processed = 0
    batch_start_time = time.time()

    try:
        # Create progress bar
        with tqdm(total=total_rows, desc="Processing locations", unit="loc") as pbar:
            for idx, row in rows_to_process.iterrows():
                if processed > 0 and processed % batch_size == 0:
                    # Save progress BEFORE waiting, so a crash or interrupt
                    # during the long sleep cannot lose the finished batch.
                    df.to_csv(progress_file, index=False)
                    pbar.write(f"Saved progress at {processed}/{total_rows} rows")

                    # Wait if we've hit the batch limit
                    elapsed = time.time() - batch_start_time
                    if elapsed < 3600:  # If less than an hour has passed
                        sleep_time = 3600 - elapsed
                        pbar.write(f"Completed batch of {batch_size}. Waiting {sleep_time/60:.1f} minutes before next batch...")
                        time.sleep(sleep_time)
                    batch_start_time = time.time()

                # Get admin region from GeoNames
                admin_name = get_location_info_cached(row['latitude'], row['longitude'], username, session)

                # Update the dataframe with the result
                df.at[idx, 'state test'] = admin_name

                processed += 1
                pbar.update(1)

                # Small delay between API calls
                time.sleep(0.2)
    except BaseException:
        # Includes KeyboardInterrupt: keep whatever was gathered so far, then
        # let the original exception propagate unchanged.
        df.to_csv(progress_file, index=False)
        print(f"\nInterrupted after {processed}/{total_rows} rows. Partial results saved to {progress_file}")
        raise
    finally:
        # Release pooled connections whether the run succeeded or not.
        session.close()

    # Save final results
    df.to_excel(output_file, index=False)
    print(f"\nProcessing completed. Results saved to {output_file}")

    # Cache statistics
    print(f"Cache info: {get_location_info_cached.cache_info()}")

    return df

if __name__ == "__main__":
    # Run settings
    INPUT_FILE = '/Users/yubinbaaniya/Downloads/gauge_review_with_duplicate_and_main.xlsx'
    GEONAMES_USERNAME = "ybaaniya"  # GeoNames account used for the API requests
    BATCH_SIZE = 900  # Stay below the API allowance of 1000 requests per hour

    try:
        # Run the full lookup over the spreadsheet
        result_df = process_locations(
            input_file=INPUT_FILE,
            username=GEONAMES_USERNAME,
            batch_size=BATCH_SIZE,
        )
        print("Processing completed successfully")

    except Exception as e:
        # Top-level boundary: report the failure message only
        print(f"An error occurred: {str(e)}")