For a large, messy dataframe such as the one in Bank Transactions.csv, one can also use OpenStreetMap (via Nominatim) or the Google 
Geocoding API to clean the CustLocation column. Both approaches are shown below.

## Google Maps Geocoding
(Most accurate, but requires an API key and billing)

In [None]:
import time
import requests
import pandas as pd
import re
import json
from pathlib import Path

# Google Maps API key; replace the placeholder with the key generated for you.
api_key = "xyz_api_key"
# Word document with one location per paragraph (read by load_docx_lines).
# NOTE(review): the earlier comment also mentioned a CSV with a 'location'
# column, but this cell only reads .docx input - confirm before passing a CSV.
input_docx = "/mnt/data/Todcheck.docx"
# JSON file in which geocoding responses are kept between runs.
cache_file = "geocode_cache.json"
# Directory intended for the result CSVs; created if it does not exist yet.
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
# Pause (seconds) after each API request; adjust accordingly if the quota is hit.
rate_sleep = 0.05

def load_docx_lines(path):
    """Return the non-blank paragraphs of a Word document as a list of strings.

    Each paragraph is stripped of surrounding whitespace; blank paragraphs are
    skipped, and a leading list number such as "1. " is removed from the rest.
    """
    from docx import Document

    numbering = r'^\d+\.\s*'
    stripped = (para.text.strip() for para in Document(path).paragraphs)
    # Filter on the stripped text *before* removing the numbering, so a
    # paragraph consisting only of "1." still yields an (empty) entry.
    return [re.sub(numbering, '', text) for text in stripped if text]

def normalize(s):
    """Return *s* as a trimmed string with whitespace runs collapsed to one space.

    ``None`` maps to the empty string; any other value is converted with
    ``str()`` first.
    """
    return "" if s is None else re.sub(r'\s+', ' ', str(s).strip())

def load_cache(fn):
    """Load the JSON cache stored at *fn*; return an empty dict if it is absent."""
    cache_path = Path(fn)
    if not cache_path.exists():
        return {}
    raw_text = cache_path.read_text(encoding="utf8")
    return json.loads(raw_text)

def save_cache(fn, data):
    """Write *data* to *fn* as indented UTF-8 JSON, keeping non-ASCII text as is."""
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    target = Path(fn)
    target.write_text(serialized, encoding="utf8")

def google_geocode(address, api_key):
    """Look up *address* with the Google Geocoding endpoint and return the JSON.

    The request carries ``region="IN"`` as a region hint and times out after
    30 seconds. An HTTP error status raises via ``raise_for_status()``.
    """
    response = requests.get(
        "https://maps.googleapis.com/maps/api/geocode/json",
        params={"address": address, "key": api_key, "region": "IN"},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()

def get_country_from_result(geo_json):
    """Extract ``(country name, country code)`` from a geocoding response.

    Only the first entry of ``results`` is inspected. The first address
    component whose ``types`` include ``"country"`` supplies ``long_name`` and
    ``short_name``. ``(None, None)`` is returned when the response is empty,
    has no results, or has no country component.
    """
    hits = geo_json.get("results", []) if geo_json else []
    if not hits:
        return None, None

    # The first result is treated as the best match.
    components = hits[0].get("address_components", [])
    country_part = next(
        (part for part in components if "country" in part.get("types", [])),
        None,
    )
    if country_part is None:
        return None, None
    return country_part.get("long_name"), country_part.get("short_name")

def classify_locations(locations):
    """Geocode each location with Google and tabulate the detected country.

    Responses are cached in ``cache_file`` so that a rerun does not query the
    same normalized name twice. Only definitive answers (status ``OK`` or
    ``ZERO_RESULTS``) are cached: request failures and API error payloads
    (quota exceeded, request denied, ...) are left out of the cache so the
    next run retries them instead of treating them as permanent results.

    Args:
        locations: iterable of raw location strings.

    Returns:
        tuple: ``(cache, df)`` where ``cache`` is the updated cache dict and
        ``df`` is a DataFrame with the columns ``raw``, ``normalized``,
        ``country``, ``country_code`` and ``geocode_json`` (one row per input,
        in input order). The columns exist even when ``locations`` is empty.
    """
    columns = ["raw", "normalized", "country", "country_code", "geocode_json"]
    cacheable_statuses = ("OK", "ZERO_RESULTS")

    cache = load_cache(cache_file)
    out = []
    for loc in locations:
        loc_norm = normalize(loc)
        # A cached ``None`` (written by earlier versions after a failed
        # request) counts as a miss so the lookup is attempted again.
        geo_json = cache.get(loc_norm)
        if not loc_norm:
            # Nothing to geocode; do not spend a request on an empty address.
            geo_json = None
        elif geo_json is None:
            try:
                geo_json = google_geocode(loc_norm, api_key)
            except Exception as e:
                # Best effort: one failed lookup must not abort the batch.
                print("Request error for:", loc_norm, e)
                geo_json = None
            if isinstance(geo_json, dict) and geo_json.get("status") in cacheable_statuses:
                cache[loc_norm] = geo_json
                # Saved after every new answer so an interrupted run can resume.
                save_cache(cache_file, cache)
            time.sleep(rate_sleep)
        country, country_code = get_country_from_result(geo_json)
        out.append({
            "raw": loc,
            "normalized": loc_norm,
            "country": country,
            "country_code": country_code,
            "geocode_json": geo_json,
        })
    return cache, pd.DataFrame(out, columns=columns)

if __name__ == "__main__":
    locs = load_docx_lines(input_docx)
    cache, df = classify_locations(locs)

    def mentions_india(geo_json):
        """Return True if any result's formatted address contains 'India'."""
        if not isinstance(geo_json, dict):
            return False
        return any(
            'India' in r.get('formatted_address', '')
            for r in geo_json.get('results') or []
        )

    # Primary signal: the ISO country code reported by the geocoder.
    code_is_in = df['country_code'].fillna('').str.upper() == 'IN'
    # Fallback for rows without an 'IN' code: the formatted address names India.
    address_says_india = df['geocode_json'].apply(mentions_india).astype(bool)
    df['is_india'] = code_is_in | address_says_india

    domestic_df = df[df['is_india']].copy()
    foreign_df = df[~df['is_india']].copy()
    # Geocoded but no country found; these rows are also part of foreign_df.
    unknown_df = df[df['country_code'].isnull() & df['geocode_json'].notnull()].copy()

    # Write into this cell's own output directory (results_dir), not the
    # Nominatim cell's RESULTS_DIR.
    domestic_df.to_csv(results_dir/"domestic_df.csv", index=False)
    foreign_df.to_csv(results_dir/"foreign_df.csv", index=False)
    unknown_df.to_csv(results_dir/"unknown_df.csv", index=False)

    print("Results in", results_dir.absolute())
    print("Domestic:", len(domestic_df), "Foreign:", len(foreign_df), "Unknown/NA:", len(unknown_df))

Notes:

1. Put key into api_key. Geocoding API may need to be enabled in Google Cloud with billing.
2. Use cache — geocode_cache.json — to avoid re-querying same names if re-running the script.
3. If the quota or a billing limit is hit, increase rate_sleep (a longer pause lowers the request rate) or process the locations in small batches.

## Nominatim (OpenStreetMap) via geopy
(No API key, polite usage)

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Upper-case names keep these settings apart from the lower-case ones assigned
# in the Google Maps cell, so neither cell overwrites the other's variables.
RESULTS_DIR = Path("results_nominatim")
RESULTS_DIR.mkdir(exist_ok=True)
CACHE_FILE = "geocode_cache_nominatim.json"
INPUT_DOCX = "/mnt/data/Todcheck.docx"

def load_docx_lines(path):
    """Read a Word document and return its non-empty paragraphs as strings.

    Paragraph text is stripped; empty paragraphs are dropped, and a leading
    list number of the form "12. " is removed from the remaining ones.
    """
    from docx import Document

    cleaned = []
    for paragraph in Document(path).paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue
        cleaned.append(re.sub(r'^\d+\.\s*', '', text))
    return cleaned

def normalize(s):
    """Collapse whitespace runs in *s* to single spaces and trim the ends.

    ``None`` becomes ``""``; other values are passed through ``str()`` first.
    """
    if s is None:
        return ""
    trimmed = str(s).strip()
    return re.sub(r'\s+', ' ', trimmed)

def load_cache(fn):
    """Return the parsed JSON cache at *fn*, or ``{}`` when the file is missing."""
    source = Path(fn)
    return json.loads(source.read_text(encoding="utf8")) if source.exists() else {}

def save_cache(fn, data):
    """Store *data* at *fn* as UTF-8 JSON (2-space indent, non-ASCII unescaped)."""
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    Path(fn).write_text(payload, encoding="utf8")

def classify_locations_nominatim(locs):
    """Geocode each location with Nominatim and tabulate the detected country.

    Lookups go through geopy's ``RateLimiter`` (at least 1 second between
    calls, up to 2 retries), so no extra sleep is needed here. Answers are
    cached in ``CACHE_FILE``; a "no match" answer is cached as ``None``, but
    a lookup that raised is NOT cached, so a later run retries it.

    Args:
        locs: iterable of raw location strings.

    Returns:
        tuple: ``(df, cache)`` where ``df`` has the columns ``raw``,
        ``normalized``, ``country``, ``country_code`` and ``raw_geocode`` (one
        row per input, in input order) and ``cache`` is the updated cache
        dict. The columns exist even when ``locs`` is empty.
    """
    columns = ["raw", "normalized", "country", "country_code", "raw_geocode"]
    geolocator = Nominatim(user_agent="loc-classifier-1.0")
    geocode = RateLimiter(
        geolocator.geocode,
        min_delay_seconds=1,
        max_retries=2,
        error_wait_seconds=5,
        # Let errors surface after the retries; by default RateLimiter turns
        # them into ``None``, which would be cached as a real "no match".
        swallow_exceptions=False,
    )
    cache = load_cache(CACHE_FILE)
    rows = []
    for loc in locs:
        loc_norm = normalize(loc)
        res = None
        if loc_norm:  # empty input: keep the row, but nothing to geocode
            if loc_norm in cache:
                res = cache[loc_norm]
            else:
                try:
                    place = geocode(loc_norm, addressdetails=True, exactly_one=True, timeout=20)
                except Exception as e:
                    # Best effort: report and move on without caching.
                    print("Error geocoding", loc_norm, e)
                else:
                    res = {"raw": place.raw} if place else None
                    cache[loc_norm] = res
                    # Saved after every new answer so an interrupted run can resume.
                    save_cache(CACHE_FILE, cache)
        country = None
        country_code = None
        if res and 'raw' in res:
            addr = res['raw'].get('address', {})
            country = addr.get('country')
            code = addr.get('country_code')
            country_code = code.upper() if code else None
        rows.append({"raw": loc, "normalized": loc_norm, "country": country, "country_code": country_code, "raw_geocode": res})
    return pd.DataFrame(rows, columns=columns), cache

if __name__ == "__main__":
    locs = load_docx_lines(INPUT_DOCX)
    df2, cache = classify_locations_nominatim(locs)

    # Primary signal: the (upper-cased) ISO country code from Nominatim.
    code_is_in = df2['country_code'] == 'IN'
    # Fallback for rows without an 'IN' code: the country name mentions India.
    # Both masks are built from df2 itself so they align with its index.
    name_says_india = df2['country'].fillna('').str.contains('India', case=False)
    df2['is_india'] = (code_is_in | name_says_india).astype(bool)

    domestic = df2[df2['is_india']].copy()
    foreign = df2[~df2['is_india']].copy()
    # No country found; these rows are also part of `foreign`.
    unknown = df2[df2['country'].isnull()].copy()

    domestic.to_csv(RESULTS_DIR/"domestic_df2.csv", index=False)
    foreign.to_csv(RESULTS_DIR/"foreign_df2.csv", index=False)
    unknown.to_csv(RESULTS_DIR/"unknown.csv", index=False)
    print("Domestic:", len(domestic), "Foreign:", len(foreign), "Unknown:", len(unknown))

Notes:

1. At most 1 request per second for bulk. Use RateLimiter min_delay_seconds=1.
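2. For more than 1,000 unique locations the run is slow because of the rate limit: at one request per second, 1,000 lookups take at least ~17 minutes and 3,600 lookups take at least an hour. Use caching to resume.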
3. Accuracy may vary for ambiguous short names (e.g., "Springfield" — many countries have that) -> These cases require manual checks.