In [1]:

import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

%pip install coordinate_parser
from coordinate_parser import parse_coordinate
import requests
from bs4 import BeautifulSoup
import time
from rapidfuzz import process,fuzz


# ---- Reading file ----
csv_path = 'city_data.csv'
data = pd.read_csv(csv_path, delimiter='|')

# ---- Replacing the columns with first row ----
new_header = data.iloc[0]
data = data[1:]
data.columns = new_header

# ---- Split the City column into City + Country ----
cities = []
states = []
for i in range(len(data)):
    if ',' in data.iloc[i]["City"]:
        city_and_state = data.iloc[i]["City"].split(",")
    elif '.' in data.iloc[i]["City"]:
        city_and_state = data.iloc[i]["City"].split(".")
    else:
        city_and_state = data.iloc[i]["City"].split(";")
         
    cities.append(city_and_state[0].strip())
    if len(city_and_state) > 1:
        states.append(city_and_state[1].strip())
    else:
        states.append("")

data.drop(columns=['City'], inplace=True)
data.insert(0, 'City', cities)
data.insert(1, 'Country', states)

# ---- Remove column with too many missing values ----
data.drop(columns=['Average Price Groceries'], inplace=True)

# ---- Convert numeric columns from object -> float ----
for col in data.columns:
    try:
        data[col] = pd.to_numeric(data[col])
    except:
        pass  # keep text columns (City, Country) as is

# ---- Apply KNN Imputation on numeric columns ----
numeric_cols = data.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
scaled = scaler.fit_transform(data[numeric_cols])

imputer = KNNImputer(n_neighbors=7)
imputed_scaled = imputer.fit_transform(scaled)

# Inverse scale back
imputed = scaler.inverse_transform(imputed_scaled)

# Put back into DataFrame
data[numeric_cols] = imputed

#removing duplicate rows
data.drop_duplicates(inplace=True)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

data.loc[data["City"]=="Greece","Country"] = "Greece"
data.loc[data["City"]=="Greece","City"] = "Athens"








Note: you may need to restart the kernel to use updated packages.


In [6]:

dict_countries = {}
country1 = []
city1 = []
lat = []
long = []




session = requests.Session()

session.headers.update({
    "user-Agent": "my-app/0.0.1"
})


url = 'https://en.wikipedia.org/wiki/Main_Page'    #Parsing wikipedia's main page
response = session.get(url)
soup = BeautifulSoup(response.text, "lxml")
tag = soup.find("a",string= "Contents")               #Looking for the contents tab


time.sleep(1)

url2 = f"https://en.wikipedia.org{tag["href"]}"     #Parsing the contents tab
response2 = session.get(url2)
soup2 = BeautifulSoup(response2.text,"lxml")
tag2 = soup2.find("a",string = "Geography")       #Looking for the Geography page hyperlink



time.sleep(1)
url3 = f"https://en.wikipedia.org{tag2["href"]}"      #Parsing Geography 
response3 = session.get(url3)
soup3 = BeautifulSoup(response3.text,"lxml")             
tag3 = soup3.find(lambda tag: tag.find("b",string="Europe") in [child for child in tag.children] and tag.name =="dd")


choices = [tag.get_text(strip=True) for tag in tag3.find_all("a")]        
for country in data["Country"].unique():
    dict_countries[country] = process.extractOne(country,choices)[0] 
       #Fixing dataframe's country name missmatches with wikipedia (Czechia != Czech Republic)



for country in data["Country"].unique():                                                            #In this step we to go each country's page and try to look for it's cities list
    url4 = f"https://en.wikipedia.org{soup3.find("a",string=dict_countries[country])["href"]}"         
    response4 = session.get(url4)
    soup4 = BeautifulSoup(response4.text,"lxml")
    
    try:
        url5 = f"https://en.wikipedia.org{soup4.find("a",string=lambda s:s and "Cities" in s)["href"]}"           #Some countries have different ways to list their cities
    except TypeError:
        try:
            url5 = f"https://en.wikipedia.org{soup4.find("a",title=lambda s:s and "List of metropolitan areas" in s)["href"]}"
        except TypeError:
            url5 = f"https://en.wikipedia.org{soup4.find("a",title=lambda s:s and "List of cities" in s)["href"]}"


    response5 = session.get(url5)
    soup5 = BeautifulSoup(response5.text,"lxml")
    

    
    fuzzy = [tag.get_text(strip=True) for tag in soup5.find_all("a")]
    city_dict = {}
    for city in data[data["Country"]== country]["City"]:
        fuzzy_scoring = process.extractOne(city,fuzzy,scorer = fuzz.ratio)
        city_dict[city] = fuzzy_scoring[0]    #Fixing dataframe's city name missmatches with wikipedia (Gent != Ghent)

        if city_dict[city] != city or fuzzy_scoring[1]<70:
            print(f"{city_dict[city]} : {city} , {fuzzy_scoring[1]}")

        if fuzzy_scoring[1] > 70:
            url6 = f"https://en.wikipedia.org{soup5.find("a",string=city_dict[city])["href"]}"
            response6 = session.get(url6)
            soup6 = BeautifulSoup(response6.text,"lxml")

            
            try:
                lat.append(soup6.find("span" , class_ = "latitude").get_text())
            except:
                lat.append(np.nan)
            try:
                long.append(soup6.find("span" , class_ = "longitude").get_text())
            except:
                long.append(np.nan)
            country1.append(country)
            city1.append(city)
        else:
            lat.append(np.nan)
            long.append(np.nan)
            country1.append(country)
            city1.append(city)
            

coords = pd.DataFrame({"Country":country1,"City":city1,"Latitude":lat,"Longitude":long})
unreached_cities = coords[(coords["Latitude"].isna()) | (coords["Longitude"].isna())] 

#Using search bar for unreached cities or cities with poor fuzzy matching scores

if len(unreached_cities)>0:
    search_bar = soup6.find("form",id="searchform")["action"]
    for city in unreached_cities["City"]:
        url7 = f"https://en.wikipedia.org{search_bar}"
        params = {"search": f"{city}"}

        r = session.get(url7, params=params)
        soup7 = BeautifulSoup(r.text,"lxml")
        
        print(soup7.find("title"), city)

        lat_city = soup7.find("span", class_="latitude")
        long_city = soup7.find("span", class_="longitude")

        coords.loc[coords["City"] == city, "Latitude"] = lat_city.get_text() if lat_city else np.nan
        coords.loc[coords["City"] == city, "Longitude"] = long_city.get_text() if long_city else np.nan

Ghent : Gent , 88.88888888888889
Lefkoşa : Lefkosia , 80.0
Kontemenos : Lemesos , 58.82352941176471
Düsseldorf : Dusseldorf , 90.0
Málaga : Malaga , 83.33333333333334
Lubowidz : Lodz , 66.66666666666667
Chrzanów : Cracow , 57.14285714285714
Greece : Giroc , 54.54545454545454
Malmö : Malmo , 80.0
<title>Limassol - Wikipedia</title> Lemesos
<title>Madrid - Wikipedia</title> Madrid
<title>Łódź - Wikipedia</title> Lodz
<title>Kraków - Wikipedia</title> Cracow
<title>Giroc - Wikipedia</title> Giroc


In [3]:
search_bar = soup6.find("form",id="searchform")["action"]
for city in unreached_cities["City"]:
    url7 = f"https://en.wikipedia.org{search_bar}"
    params = {"search": f"{city}"}

    r = session.get(url7, params=params)
    soup7 = BeautifulSoup(r.text,"lxml")
        
    print(soup7.find("title"))

    lat_city = soup7.find("span", class_="latitude")
    long_city = soup7.find("span", class_="longitude")

    coords.loc[coords["City"] == city, "Latitude"] = lat_city.get_text() if lat_city else np.nan
    coords.loc[coords["City"] == city, "Longitude"] = long_city.get_text() if long_city else np.nan

<title>Limassol - Wikipedia</title>
<title>Madrid - Wikipedia</title>
<title>Łódź - Wikipedia</title>
<title>Kraków - Wikipedia</title>
<title>Giroc - Wikipedia</title>


In [4]:
coords[(coords["Latitude"].isna())|(coords["Longitude"].isna())]


Unnamed: 0,Country,City,Latitude,Longitude
27,Spain,Madrid,,


In [5]:
coords.loc[:,"Latitude"] = coords["Latitude"].apply(lambda x : parse_coordinate(x))
coords.loc[:,"Longitude"] = coords["Longitude"].apply(lambda x : parse_coordinate(x))

ValueError: Coordinate nan is outside reasonable range [-180, 180]

In [None]:
data = data.merge(coords,on=["Country","City"],how="inner")

In [None]:



map = px.scatter_geo(
    data,
    lat="Latitude",
    lon="Longitude",
    hover_name="City",
    hover_data=["Country", "Population"],           # Big title on hover  
    scope="europe",
    color="Country"
    
)
map.update_layout(
    geo = {
    "showframe": False,        # remove outer frame
    "showcoastlines": True,    # draw coastlines
    "coastlinecolor": "black", # color of coastlines
    "showcountries": True,     # draw country borders
    "countrycolor": "black",   # color of country borders
    "showsubunits": True,      # draw internal subdivisions (states, provinces)
    "subunitcolor": "gray"     # color of subdivisions
}
)



# Hide coordinates in the hover (they appear by default)
map.update_traces(hovertemplate="<b>%{hovertext}</b><br>%{customdata[0]}<br>Population: %{customdata[1]}<extra></extra>")

map.show()

In [None]:
coords

Unnamed: 0,Country,City,Latitude,Longitude
0,Austria,Vienna,48°12′30″N,16°22′21″E
1,Austria,Salzburg,47°48′00″N,13°02′42″E
2,Belgium,Brussels,50°50′48″N,04°21′09″E
3,Belgium,Antwerp,51°13′04″N,04°24′01″E
4,Belgium,Gent,51°03′13″N,03°43′31″E
5,Belgium,Bruges,51°12′N,3°12′E
6,Bulgaria,Sofia,42°42′N,23°20′E
7,Bulgaria,Dobrich,43°34′N,27°50′E
8,Switzerland,Zurich,47°22′N,8°33′E
9,Switzerland,Geneva,46°12′06″N,06°08′49″E
