In [None]:

import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

%pip install coordinate_parser
from coordinate_parser import parse_coordinate
import requests
from bs4 import BeautifulSoup
import time
from rapidfuzz import process,fuzz


# ---- Reading file ----
csv_path = 'city_data.csv'
data = pd.read_csv(csv_path, delimiter='|')

# ---- Replacing the columns with first row ----
# The first parsed row is used as the column names; only the rows below it
# are kept as data.
new_header = data.iloc[0]
# .copy() detaches the kept rows from the frame returned by read_csv. The
# following steps mutate `data` (drop / insert / column assignment), and doing
# that on a bare slice can raise chained-assignment warnings and leaves it
# unclear whether a view or a copy is being written to.
data = data[1:].copy()
data.columns = new_header

# ---- Split the City column into City + Country ----
def _split_city_country(raw_value):
    """Split one raw ``City`` cell into a ``(city, country)`` pair.

    The separator is picked in priority order: a comma if the text contains
    one, otherwise a period if it contains one, otherwise a semicolon.
    Only the first two pieces are used and both are stripped of surrounding
    whitespace. When the text contains none of the separators the country
    is the empty string.
    """
    separator = next((sep for sep in (",", ".") if sep in raw_value), ";")
    parts = raw_value.split(separator)
    city = parts[0].strip()
    country = parts[1].strip() if len(parts) > 1 else ""
    return city, country


# Iterate over the column itself: looking each cell up through
# data.iloc[i]["City"] materialises a full row Series on every access.
split_pairs = [_split_city_country(value) for value in data["City"]]
cities = [pair[0] for pair in split_pairs]
states = [pair[1] for pair in split_pairs]

# Replace the combined column with the two separated ones at the front.
data = data.drop(columns=['City'])
data.insert(0, 'City', cities)
data.insert(1, 'Country', states)

# ---- Remove column with too many missing values ----
data = data.drop(columns=['Average Price Groceries'])

# ---- Convert numeric columns from object -> float ----
for col in data.columns:
    try:
        data[col] = pd.to_numeric(data[col])
    except (ValueError, TypeError):
        # pd.to_numeric raises for values it cannot parse; such columns
        # (City, Country) are kept as text. Catching only these two types
        # lets unrelated failures (and KeyboardInterrupt) surface instead of
        # being silently swallowed.
        pass

# ---- KNN imputation of the numeric columns ----
numeric_cols = data.select_dtypes(include=[np.number]).columns

# Both estimators are created up front; the scaler is fitted first so the
# imputer measures neighbour distances on standardized values.
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=7)

# Standardize, then fill the gaps from the 7 nearest rows
scaled = scaler.fit_transform(data[numeric_cols])
imputed_scaled = imputer.fit_transform(scaled)

# Undo the standardization and write the completed values back into the frame
imputed = scaler.inverse_transform(imputed_scaled)
data[numeric_cols] = imputed

# Drop rows that exactly repeat an earlier row
data = data.drop_duplicates()

# Display frames without truncating rows or columns
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

# Rows whose City is "Greece": record Greece as the country, then set the
# city to Athens. The mask is computed before either column is changed.
is_greece_row = data["City"] == "Greece"
data.loc[is_greece_row, "Country"] = "Greece"
data.loc[is_greece_row, "City"] = "Athens"








Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:

# ---- Scrape city coordinates from Wikipedia ----
WIKI_BASE = "https://en.wikipedia.org"
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get may wait forever

headers = {'user-agent': 'my-app/0.0.1'}

dict_countries = {}   # country name in `data` -> matching link text on the Geography page
country1 = []
city1 = []
lat = []
long = []


def fetch_soup(path):
    """Download ``WIKI_BASE + path`` and return the page parsed with lxml.

    ``path`` is an href taken from a Wikipedia page (e.g. ``/wiki/Main_Page``).
    """
    response = requests.get(f"{WIKI_BASE}{path}", headers=headers, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, "lxml")


def link_index(soup):
    """Map the stripped text of every ``<a href=...>`` in ``soup`` to its href.

    The first anchor wins when several share the same text. Fuzzy matching is
    run against the keys of this mapping, so the chosen text always resolves
    to a link. Re-finding the anchor with ``find(string=...)`` instead can
    return None, because that compares against the unstripped ``.string``.
    """
    index = {}
    for anchor in soup.find_all("a", href=True):
        index.setdefault(anchor.get_text(strip=True), anchor["href"])
    return index


def find_city_list_href(country_soup, country):
    """Return the href of the first city-list style link on a country page.

    Countries list their cities in different ways, so three patterns are
    tried in order: link text containing "Cities", then a title containing
    "List of metropolitan areas", then a title containing "List of cities".

    Raises:
        LookupError: if none of the patterns matches a link.
    """
    patterns = (
        {"string": lambda text: bool(text) and "Cities" in text},
        {"title": lambda text: bool(text) and "List of metropolitan areas" in text},
        {"title": lambda text: bool(text) and "List of cities" in text},
    )
    for pattern in patterns:
        link = country_soup.find("a", href=True, **pattern)
        if link is not None:
            return link["href"]
    raise LookupError(f"No city list link found on the Wikipedia page for {country!r}")


def coordinate_text(page_soup, css_class):
    """Return the text of the first ``<span class=css_class>``, or NaN if absent."""
    span = page_soup.find("span", class_=css_class)
    return span.get_text() if span is not None else np.nan


# Walk Main Page -> Contents -> Geography, pausing one second between requests.
main_soup = fetch_soup("/wiki/Main_Page")
tag = main_soup.find("a", string="Contents")               # Looking for the contents tab

time.sleep(1)

contents_soup = fetch_soup(tag["href"])
tag2 = contents_soup.find("a", string="Geography")         # Looking for the Geography page hyperlink

time.sleep(1)

geography_soup = fetch_soup(tag2["href"])
geography_links = link_index(geography_soup)
choices = list(geography_links)

# Fix mismatches between the dataframe's country names and Wikipedia's
# (Czechia != Czech Republic) by taking the closest link text.
for country in data["Country"].unique():
    dict_countries[country] = process.extractOne(country, choices)[0]

# Visit each country's page, follow its city list, then each city's page.
for country in data["Country"].unique():
    country_soup = fetch_soup(geography_links[dict_countries[country]])
    cities_soup = fetch_soup(find_city_list_href(country_soup, country))

    city_links = link_index(cities_soup)
    fuzzy = list(city_links)
    city_dict = {}
    for city in data.loc[data["Country"] == country, "City"]:
        # Fix mismatches between the dataframe's city names and Wikipedia's (Gent != Ghent)
        city_dict[city] = process.extractOne(city, fuzzy, scorer=fuzz.ratio)[0]
        city_soup = fetch_soup(city_links[city_dict[city]])

        # Pages without coordinate spans yield NaN and are reviewed afterwards.
        lat.append(coordinate_text(city_soup, "latitude"))
        long.append(coordinate_text(city_soup, "longitude"))
        country1.append(country)
        city1.append(city)

coords = pd.DataFrame({"Country": country1, "City": city1, "Latitude": lat, "Longitude": long})

In [286]:
# Rows with a missing latitude or longitude
coords[coords[["Latitude", "Longitude"]].isna().any(axis=1)]


Unnamed: 0,Country,City,Latitude,Longitude
76,Romania,Giroc,,


In [287]:
# Manual imputation for Giroc
# The row is selected by its Country/City values instead of the index label
# 76: that label depends on the order in which rows were scraped, so a
# hard-coded label would silently edit (or append) the wrong row once the
# order changes.
is_giroc = (coords["Country"] == "Romania") & (coords["City"] == "Giroc")
coords.loc[is_giroc, "Latitude"] = "45°42′N"
coords.loc[is_giroc, "Longitude"] = "21°14′E"

In [288]:
# Run parse_coordinate over each coordinate column, one column at a time
for coord_col in ("Latitude", "Longitude"):
    coords.loc[:, coord_col] = coords[coord_col].apply(parse_coordinate)

In [289]:
# Attach the coordinates to the city data.
# - Latitude/Longitude columns left over from an earlier run of this cell are
#   dropped first; otherwise merge would produce _x/_y suffixed duplicates and
#   the cell could not be re-run.
# - coords is reduced to one row per (Country, City) so that a repeated key
#   cannot multiply the rows of `data`.
coords_unique = coords.drop_duplicates(subset=["Country", "City"])
data = (
    data
    .drop(columns=["Latitude", "Longitude"], errors="ignore")
    .merge(coords_unique, on=["Country", "City"], how="inner")
)

In [290]:



# Scatter map of the cities over Europe, one colour per country.
# NOTE(review): the name `map` shadows the Python builtin; it is kept because
# other cells may refer to it.
map = px.scatter_geo(
    data,
    lat="Latitude",
    lon="Longitude",
    scope="europe",
    color="Country",
    hover_name="City",                     # referenced as %{hovertext} in the hover template
    hover_data=["Country", "Population"],  # referenced as customdata[0] / customdata[1]
)

# Map styling: no outer frame, black coastlines and country borders,
# gray internal subdivisions (states, provinces).
geo_style = dict(
    showframe=False,
    showcoastlines=True,
    coastlinecolor="black",
    showcountries=True,
    countrycolor="black",
    showsubunits=True,
    subunitcolor="gray",
)
map.update_layout(geo=geo_style)

# Custom hover text: city in bold, then country and population.
# The coordinates, which appear by default, are left out.
hover_template = (
    "<b>%{hovertext}</b>"
    "<br>%{customdata[0]}"
    "<br>Population: %{customdata[1]}"
    "<extra></extra>"
)
map.update_traces(hovertemplate=hover_template)

map.show()