The Open Buildings data set (https://sites.research.google/gr/open-buildings) contains building outlines derived from satellite imagery. Using this dataset together with the attached file giving the locations of capital cities, the tasks for three countries are to:
(a) Determine the number of buildings within 100 kilometers of the capital cities of Kenya, Tanzania and Uganda.
(b) For the countries above, determine the average area of buildings within 100 kilometers of the capital city.

Because the buildings data is very large, I uploaded it to my Google Drive so that the datasets could be accessed from Google Colab.

In [1]:
import pandas as pd

This problem requires libraries for geospatial analysis and distance calculations.

In [None]:
pip install haversine

In [None]:
pip install geopandas

In [4]:
from haversine import haversine, Unit  #Calculates the great-circle distance between two points on the Earth's surface.
import geopandas as gpd #Extends pandas with support for spatial operations on geometries.
from shapely import wkt #Parses Well-Known Text (WKT) strings into geometry objects for spatial analysis.

In [5]:
import gdown #used to download files from Google Drive
import zipfile
import os

In [6]:
#Read the capital-city reference table and preview its first rows
capitals_csv_path = 'country-capital-lat-long-population.csv'
cities_df = pd.read_csv(capitals_csv_path)
cities_df.head()

Unnamed: 0,Country,Capital City,Latitude,Longitude,Population,Capital Type
0,Afghanistan,Kabul,34.5289,69.1725,4011770,Capital
1,Albania,Tiranë (Tirana),41.3275,19.8189,475577,Capital
2,Algeria,El Djazaïr (Algiers),36.7525,3.042,2693542,Capital
3,American Samoa,Pago Pago,-14.2781,-170.7025,48526,Capital
4,Andorra,Andorra la Vella,42.5078,1.5211,22614,Capital


In [7]:
#Country names of interest, each kept as a single-element list for isin()
Country_Kenya = ['Kenya']
Country_Tanzania = ['Tanzania']
Country_Uganda = ['Uganda']

#Pick out the capital-city row of each country by exact name match
kenya_mask = cities_df['Country'].isin(Country_Kenya)
uganda_mask = cities_df['Country'].isin(Country_Uganda)
Kenya_df = cities_df.loc[kenya_mask]
Uganda_df = cities_df.loc[uganda_mask]

#Show what was selected
for capital_df in (Kenya_df, Uganda_df):
    print(capital_df)

#Write each selection to its own CSV file, leaving out the index column
for capital_df, out_path in ((Kenya_df, "Nairobi.csv"), (Uganda_df, "Kampala.csv")):
    capital_df.to_csv(out_path, index=False)

    Country Capital City  Latitude  Longitude  Population Capital Type
111   Kenya      Nairobi   -1.2833    36.8167     4385853      Capital
    Country Capital City  Latitude  Longitude  Population Capital Type
217  Uganda      Kampala    0.3163    32.5822     2986352      Capital


In [8]:
#No Tanzania record is taken from the capitals table, so a one-row frame is
#built by hand from figures found on the internet, using the same columns
tanzania_capital_record = {
    'Country': 'Tanzania',
    'Capital City': 'Dodoma',
    'Latitude': -6.1630,
    'Longitude': 35.7516,
    'Population': 765179,
    'Capital Type': 'Capital',
}
Tanzania_df = pd.DataFrame([tanzania_capital_record])

In [9]:
#Show the hand-built Tanzania row to check it has the same layout as the other capitals
print(Tanzania_df)

    Country Capital City  Latitude  Longitude  Population Capital Type
0  Tanzania       Dodoma    -6.163    35.7516      765179      Capital


In [10]:
#Save the Tanzania capital record to its own CSV file, leaving out the index column
Tanzania_df.to_csv("Dodoma.csv", index=False)

i. Kenya

In [11]:
#Define the file ID and file name
file_id = "1CyQ-xz0pPz5NPSfn1oo3R26KwSfYAjQv"  #file ID for Nairobi in google drive
file_name = "Nairobi_buildings.csv.gz"  # Name for the downloaded file

#Download the file from Google Drive only when it is not already on disk, so that
#re-running this cell does not fetch the ~1.76 GB archive again
if not os.path.exists(file_name):
    gdown.download(f"https://drive.google.com/uc?id={file_id}", file_name, quiet=False)

# Load the .csv.gz file into a pandas DataFrame
df_1 = pd.read_csv(file_name, compression="gzip")

print(df_1.head())

Downloading...
From (original): https://drive.google.com/uc?id=1CyQ-xz0pPz5NPSfn1oo3R26KwSfYAjQv
From (redirected): https://drive.google.com/uc?id=1CyQ-xz0pPz5NPSfn1oo3R26KwSfYAjQv&confirm=t&uuid=f4d03435-94a3-466b-b0ee-b5ab7c72bae3
To: /content/Nairobi_buildings.csv.gz
100%|██████████| 1.76G/1.76G [00:14<00:00, 120MB/s]


   latitude  longitude  area_in_meters  confidence  \
0 -0.193490  35.566465         13.6828      0.7385   
1 -0.040530  37.636051         39.3581      0.7824   
2 -0.795178  37.462740         21.6735      0.6809   
3 -1.096869  37.202562          9.9701      0.7498   
4 -2.990276  37.536638         17.2219      0.6932   

                                            geometry full_plus_code  
0  POLYGON((35.5664883709283 -0.193494911934182, ...  6GFQRH48+JH4H  
1  POLYGON((37.6360904683212 -0.040522638535365, ...  6GFVXJ5P+QCMP  
2  POLYGON((37.4627557578247 -0.795203223093872, ...  6GFV6F37+W3HW  
3  POLYGON((37.2025801959259 -1.09687771157486, 3...  6GCVW633+7239  
4  POLYGON((37.5366622482796 -2.9902867428837, 37...  6G9V2G5P+VMJV  


In [12]:
#Summarise the loaded Kenya frame: row count, column dtypes and memory use
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17871082 entries, 0 to 17871081
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   latitude        float64
 1   longitude       float64
 2   area_in_meters  float64
 3   confidence      float64
 4   geometry        object 
 5   full_plus_code  object 
dtypes: float64(4), object(2)
memory usage: 818.1+ MB


In [13]:
#Parse the WKT strings into geometry objects. The dtype guard only parses while the
#column still holds raw strings, so re-running the cell does not parse twice.
#GeoSeries.from_wkt parses the whole column in one vectorised call instead of
#invoking wkt.loads once per row.
if df_1['geometry'].dtype == 'object':  # If geometry is stored as strings
    df_1['geometry'] = gpd.GeoSeries.from_wkt(df_1['geometry'])

#Convert to GeoDataFrame (no CRS is assigned, coordinates stay in raw lon/lat degrees)
df_1 = gpd.GeoDataFrame(df_1, geometry='geometry')

#Calculate centroids and extract coordinates; this overwrites the latitude and
#longitude columns that came with the file with the polygon-centroid values
df_1['centroid'] = df_1.geometry.centroid
df_1['latitude'] = df_1['centroid'].y
df_1['longitude'] = df_1['centroid'].x

#Radius calculation function
def count_buildings_near_capital(lat, lon, radius_km, buildings=None):
    """Count the buildings lying within ``radius_km`` of a reference point.

    The great-circle (haversine) distance from ``(lat, lon)`` to every row is
    computed in one vectorised numpy pass rather than with a row-by-row
    ``DataFrame.apply``, which is very slow on a frame with millions of rows.

    Side effect: the distances are stored in a ``distance_km`` column of the
    frame, which the following average-area cell filters on.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the reference point, in decimal degrees.
    radius_km : float
        Search radius in kilometres (inclusive).
    buildings : DataFrame, optional
        Frame with ``latitude`` and ``longitude`` columns in decimal degrees.
        Defaults to the notebook-level ``df_1`` (the Kenya buildings).

    Returns
    -------
    numpy integer
        Number of rows whose distance is ``<= radius_km``.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    if buildings is None:
        buildings = df_1

    # Mean Earth radius in km; intended to match the haversine package's
    # kilometre default - verify if exact parity with it matters
    earth_radius_km = 6371.0088

    capital_lat = np.radians(lat)
    capital_lon = np.radians(lon)
    building_lat = np.radians(buildings['latitude'].to_numpy(dtype=float))
    building_lon = np.radians(buildings['longitude'].to_numpy(dtype=float))

    half_chord_sq = (
        np.sin((building_lat - capital_lat) / 2.0) ** 2
        + np.cos(capital_lat) * np.cos(building_lat)
        * np.sin((building_lon - capital_lon) / 2.0) ** 2
    )
    # clip guards against rounding pushing the value marginally outside [0, 1]
    buildings['distance_km'] = (
        2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(half_chord_sq, 0.0, 1.0)))
    )

    return (buildings['distance_km'] <= radius_km).sum()

#Read Nairobi's coordinates from the first row of the Kenya frame
capital_row = Kenya_df.iloc[0]
latitude = capital_row['Latitude']
longitude = capital_row['Longitude']

#Search radius around the capital, in kilometres
radius_km = 100

#Number of buildings that fall inside that radius
count = count_buildings_near_capital(lat=latitude, lon=longitude, radius_km=radius_km)

print(f"Buildings within {radius_km} km of {Kenya_df.iloc[0]['Capital City']} ({Kenya_df.iloc[0]['Country']}): {count}")


Buildings within 100 km of Nairobi (Kenya): 6030129


In [14]:
#Keep only the rows whose distance to Nairobi is within the radius
within_radius_mask = df_1['distance_km'] <= radius_km
buildings_within_radius = df_1[within_radius_mask]

#Mean footprint area of the selected buildings
selected_areas = buildings_within_radius['area_in_meters']
average_area = selected_areas.mean()

print(f"Average area of these buildings: {average_area:.2f} square meters")


Average area of these buildings: 61.76 square meters


ii. Tanzania

In [15]:
#Define the file ID and file name
file_id = "1e8276ygPS_ZxkecAIn6Bzvlj7lULsYdM"  #file ID in google drive
file_name = "Dodoma_buildings.csv.gz"  # Name for the downloaded file

#Download the file from Google Drive only when it is not already on disk, so that
#re-running this cell does not fetch the ~871 MB archive again
if not os.path.exists(file_name):
    gdown.download(f"https://drive.google.com/uc?id={file_id}", file_name, quiet=False)

# Load the .csv.gz file into a pandas DataFrame
df_2 = pd.read_csv(file_name, compression="gzip")

print(df_2.head())

Downloading...
From (original): https://drive.google.com/uc?id=1e8276ygPS_ZxkecAIn6Bzvlj7lULsYdM
From (redirected): https://drive.google.com/uc?id=1e8276ygPS_ZxkecAIn6Bzvlj7lULsYdM&confirm=t&uuid=ec0ddbb9-bc36-4c0b-9d90-47faa89e9d73
To: /content/Dodoma_buildings.csv.gz
100%|██████████| 871M/871M [00:09<00:00, 90.5MB/s]


   latitude  longitude  area_in_meters  confidence  \
0 -8.155979  36.692185         57.7881      0.8417   
1 -4.332005  39.147017         27.0097      0.6728   
2 -6.231007  36.563664         60.8869      0.8911   
3 -6.008637  34.982150         17.2327      0.7200   
4 -5.420280  37.178479         28.1533      0.7002   

                                            geometry full_plus_code  
0  POLYGON((36.6922195279254 -8.15594482714905, 3...  6G3RRMVR+JV3X  
1  POLYGON((39.1470455057509 -4.33202201827588, 3...  6G7XM49W+5RRP  
2  POLYGON((36.5637113993215 -6.23098370274496, 3...  6G5RQH97+HFVJ  
3  POLYGON((34.9821688932721 -6.00865538580887, 3...  6G5PXXRJ+GVRH  
4  POLYGON((37.1784950210917 -5.42031064981906, 3...  6G6VH5HH+V9QV  


In [16]:
#Summarise the loaded Tanzania frame: row count, column dtypes and memory use
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8968031 entries, 0 to 8968030
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   latitude        float64
 1   longitude       float64
 2   area_in_meters  float64
 3   confidence      float64
 4   geometry        object 
 5   full_plus_code  object 
dtypes: float64(4), object(2)
memory usage: 410.5+ MB


In [17]:
#Parse the WKT strings into geometry objects. The dtype guard only parses while the
#column still holds raw strings, so re-running the cell does not parse twice.
#GeoSeries.from_wkt parses the whole column in one vectorised call instead of
#invoking wkt.loads once per row.
if df_2['geometry'].dtype == 'object':  #If geometry is stored as strings
    df_2['geometry'] = gpd.GeoSeries.from_wkt(df_2['geometry'])

#Convert to GeoDataFrame (no CRS is assigned, coordinates stay in raw lon/lat degrees)
df_2 = gpd.GeoDataFrame(df_2, geometry='geometry')

#Calculate centroids and extract coordinates; this overwrites the latitude and
#longitude columns that came with the file with the polygon-centroid values
df_2['centroid'] = df_2.geometry.centroid
df_2['latitude'] = df_2['centroid'].y
df_2['longitude'] = df_2['centroid'].x

#Radius calculation function
def count_buildings_near_capital(lat, lon, radius_km, buildings=None):
    """Count the buildings lying within ``radius_km`` of a reference point.

    The great-circle (haversine) distance from ``(lat, lon)`` to every row is
    computed in one vectorised numpy pass rather than with a row-by-row
    ``DataFrame.apply``, which is very slow on a frame with millions of rows.

    Side effect: the distances are stored in a ``distance_km`` column of the
    frame, which the following average-area cell filters on.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the reference point, in decimal degrees.
    radius_km : float
        Search radius in kilometres (inclusive).
    buildings : DataFrame, optional
        Frame with ``latitude`` and ``longitude`` columns in decimal degrees.
        Defaults to the notebook-level ``df_2`` (the Tanzania buildings).

    Returns
    -------
    numpy integer
        Number of rows whose distance is ``<= radius_km``.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    if buildings is None:
        buildings = df_2

    # Mean Earth radius in km; intended to match the haversine package's
    # kilometre default - verify if exact parity with it matters
    earth_radius_km = 6371.0088

    capital_lat = np.radians(lat)
    capital_lon = np.radians(lon)
    building_lat = np.radians(buildings['latitude'].to_numpy(dtype=float))
    building_lon = np.radians(buildings['longitude'].to_numpy(dtype=float))

    half_chord_sq = (
        np.sin((building_lat - capital_lat) / 2.0) ** 2
        + np.cos(capital_lat) * np.cos(building_lat)
        * np.sin((building_lon - capital_lon) / 2.0) ** 2
    )
    # clip guards against rounding pushing the value marginally outside [0, 1]
    buildings['distance_km'] = (
        2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(half_chord_sq, 0.0, 1.0)))
    )

    return (buildings['distance_km'] <= radius_km).sum()

#Read Dodoma's coordinates from the first row of the Tanzania frame
capital_row = Tanzania_df.iloc[0]
latitude = capital_row['Latitude']
longitude = capital_row['Longitude']

#Search radius around the capital, in kilometres
radius_km = 100

#Number of buildings that fall inside that radius
count = count_buildings_near_capital(lat=latitude, lon=longitude, radius_km=radius_km)

print(f"Buildings within {radius_km} km of {Tanzania_df.iloc[0]['Capital City']} ({Tanzania_df.iloc[0]['Country']}): {count}")

Buildings within 100 km of Dodoma (Tanzania): 1056779


In [18]:
#Keep only the rows whose distance to Dodoma is within the radius
within_radius_mask = df_2['distance_km'] <= radius_km
buildings_within_radius = df_2[within_radius_mask]

#Mean footprint area of the selected buildings
selected_areas = buildings_within_radius['area_in_meters']
average_area = selected_areas.mean()

print(f"Average area of these buildings: {average_area:.2f} square meters")

Average area of these buildings: 47.91 square meters


iii. Uganda

In [19]:
#Define the file ID and file name
file_id = "1oS0e7XoY9F_LIfZXVEJRSgvADA6bYtoM"  #file ID in google drive
file_name = "Kampala_buildings.csv.gz"  # Name for the downloaded file

#Download the file from Google Drive only when it is not already on disk, so that
#re-running this cell does not fetch the ~1.91 GB archive again
if not os.path.exists(file_name):
    gdown.download(f"https://drive.google.com/uc?id={file_id}", file_name, quiet=False)

# Load the .csv.gz file into a pandas DataFrame
df_3 = pd.read_csv(file_name, compression="gzip")

print(df_3.head())

Downloading...
From (original): https://drive.google.com/uc?id=1oS0e7XoY9F_LIfZXVEJRSgvADA6bYtoM
From (redirected): https://drive.google.com/uc?id=1oS0e7XoY9F_LIfZXVEJRSgvADA6bYtoM&confirm=t&uuid=e2b4ca0a-31ff-4757-806e-899cf8b6342d
To: /content/Kampala_buildings.csv.gz
100%|██████████| 1.91G/1.91G [00:19<00:00, 97.4MB/s]


   latitude  longitude  area_in_meters  confidence  \
0  0.194803  34.190104         47.1710      0.7835   
1  0.143153  32.543894         16.0819      0.6964   
2  0.903326  32.568620         61.5599      0.8727   
3  1.934478  33.111609         78.4339      0.8006   
4  1.915876  33.143219         74.8745      0.8621   

                                            geometry full_plus_code  
0  POLYGON((34.1901308273851 0.194841434533873, 3...  6GGP55VR+W2H3  
1  POLYGON((32.5438814083812 0.143124278852618, 3...  6GGJ4GVV+7H64  
2  POLYGON((32.5686660933795 0.903344690538587, 3...  6GGJWH39+8CQ5  
3  POLYGON((33.1116485789278 1.93443731522919, 33...  6GHMW4M6+QJX4  
4  POLYGON((33.1432643079789 1.91584362306832, 33...  6GHMW48V+9752  


In [20]:
#Summarise the loaded Uganda frame: row count, column dtypes and memory use
df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19944639 entries, 0 to 19944638
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   latitude        float64
 1   longitude       float64
 2   area_in_meters  float64
 3   confidence      float64
 4   geometry        object 
 5   full_plus_code  object 
dtypes: float64(4), object(2)
memory usage: 913.0+ MB


In [21]:
#Parse the WKT strings into geometry objects. The dtype guard only parses while the
#column still holds raw strings, so re-running the cell does not parse twice.
#GeoSeries.from_wkt parses the whole column in one vectorised call instead of
#invoking wkt.loads once per row.
if df_3['geometry'].dtype == 'object':  # If geometry is stored as strings
    df_3['geometry'] = gpd.GeoSeries.from_wkt(df_3['geometry'])

#Convert to GeoDataFrame (no CRS is assigned, coordinates stay in raw lon/lat degrees)
df_3 = gpd.GeoDataFrame(df_3, geometry='geometry')

#Calculate centroids and extract coordinates; this overwrites the latitude and
#longitude columns that came with the file with the polygon-centroid values
df_3['centroid'] = df_3.geometry.centroid
df_3['latitude'] = df_3['centroid'].y
df_3['longitude'] = df_3['centroid'].x

#Radius calculation function
def count_buildings_near_capital(lat, lon, radius_km, buildings=None):
    """Count the buildings lying within ``radius_km`` of a reference point.

    The great-circle (haversine) distance from ``(lat, lon)`` to every row is
    computed in one vectorised numpy pass rather than with a row-by-row
    ``DataFrame.apply``, which is very slow on a frame with millions of rows.

    Side effect: the distances are stored in a ``distance_km`` column of the
    frame, which the following average-area cell filters on.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the reference point, in decimal degrees.
    radius_km : float
        Search radius in kilometres (inclusive).
    buildings : DataFrame, optional
        Frame with ``latitude`` and ``longitude`` columns in decimal degrees.
        Defaults to the notebook-level ``df_3`` (the Uganda buildings).

    Returns
    -------
    numpy integer
        Number of rows whose distance is ``<= radius_km``.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    if buildings is None:
        buildings = df_3

    # Mean Earth radius in km; intended to match the haversine package's
    # kilometre default - verify if exact parity with it matters
    earth_radius_km = 6371.0088

    capital_lat = np.radians(lat)
    capital_lon = np.radians(lon)
    building_lat = np.radians(buildings['latitude'].to_numpy(dtype=float))
    building_lon = np.radians(buildings['longitude'].to_numpy(dtype=float))

    half_chord_sq = (
        np.sin((building_lat - capital_lat) / 2.0) ** 2
        + np.cos(capital_lat) * np.cos(building_lat)
        * np.sin((building_lon - capital_lon) / 2.0) ** 2
    )
    # clip guards against rounding pushing the value marginally outside [0, 1]
    buildings['distance_km'] = (
        2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(half_chord_sq, 0.0, 1.0)))
    )

    return (buildings['distance_km'] <= radius_km).sum()

#Read Kampala's coordinates from the first row of the Uganda frame
capital_row = Uganda_df.iloc[0]
latitude = capital_row['Latitude']
longitude = capital_row['Longitude']

#Search radius around the capital, in kilometres
radius_km = 100

#Number of buildings that fall inside that radius
count = count_buildings_near_capital(lat=latitude, lon=longitude, radius_km=radius_km)

print(f"Buildings within {radius_km} km of {Uganda_df.iloc[0]['Capital City']} ({Uganda_df.iloc[0]['Country']}): {count}")

Buildings within 100 km of Kampala (Uganda): 4371589


In [22]:
#Keep only the rows whose distance to Kampala is within the radius
within_radius_mask = df_3['distance_km'] <= radius_km
buildings_within_radius = df_3[within_radius_mask]

#Mean footprint area of the selected buildings
selected_areas = buildings_within_radius['area_in_meters']
average_area = selected_areas.mean()

print(f"Average area of these buildings: {average_area:.2f} square meters")

Average area of these buildings: 63.53 square meters
