In [11]:
import gzip
import json
import pandas as pd
import numpy as np

# Load the gzip-compressed JSON export. Decode explicitly as UTF-8 (the
# standard JSON encoding) instead of relying on the platform's default
# locale encoding, which would garble non-ASCII tag values on some systems.
with gzip.open('total_data.json.gz', 'rt', encoding='utf-8') as f:
    data = json.load(f)

# The records of interest live under the 'elements' key
elements = data['elements']

# One row per element, indexed by the element's 'id'
df = pd.DataFrame(elements).set_index('id')

# Coerce the coordinates to numbers; values that cannot be parsed become NaN
df[['lat', 'lon']] = df[['lat', 'lon']].apply(pd.to_numeric, errors='coerce')

# Rows that carry both a latitude and a longitude
pure_nodes_df = df[df[['lat', 'lon']].notna().all(axis=1)]

# Define function to determine whether an element is a building
def is_building(tags):
    """Return True when ``tags`` is a dict holding a building-like key.

    An element counts as a building when its tag dict contains at least one
    of the keys 'building', 'amenity', 'leisure' or 'shop'.

    Anything that is not a dict (None, NaN for rows without tags, strings,
    lists, ...) yields False. Checking the type up front avoids substring
    matches on plain strings and the ambiguous truth value that
    ``pd.isna`` produces for list-like inputs.

    Parameters
    ----------
    tags : dict or any
        The value of an element's 'tags' field.

    Returns
    -------
    bool
    """
    if not isinstance(tags, dict):
        return False
    return any(key in tags for key in ('building', 'amenity', 'leisure', 'shop'))


# Flag every element based on its 'tags' value
df['is_building'] = df['tags'].apply(is_building)

# Keep the flagged rows only
building_elements_df = df.loc[df['is_building']]

# # Export to CSV
# pure_nodes_df.to_csv('pure_nodes_df.csv')
# building_elements_df.to_csv('building_elements_df.csv')

# Report the size of each stage
print(f'Total elements: {len(elements)}')
print(f'Building elements: {len(building_elements_df)}')
print(f'Pure nodes: {len(pure_nodes_df)}')


Total elements: 339850
Building elements: 46638
Pure nodes: 298188


In [12]:
# Narrow the node table down to its two coordinate columns
pure_nodes_df = pure_nodes_df.loc[:, ['lat', 'lon']]
pure_nodes_df


Unnamed: 0_level_0,lat,lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1
365687698,49.219983,-122.988481
365786259,49.280256,-122.928894
411900947,49.277918,-122.912361
411900948,49.278119,-122.912285
416105127,49.225474,-122.998558
...,...,...
10947838120,49.243718,-122.892909
10947838125,49.243573,-122.893039
10947838126,49.243859,-122.893031
1540447457,49.248360,-122.893075


In [13]:
categories = ['amenity', 'leisure', 'shop', 'building']

def extract_categories(tags):
    """Return ``(category, sub_category)`` for one element's tags.

    The keys listed in ``categories`` are tried in order; the first one
    present in ``tags`` is returned together with its value. When ``tags``
    is missing or none of the keys is present, ``(np.nan, np.nan)`` is
    returned.
    """
    if pd.isna(tags):
        return np.nan, np.nan

    # First key, in priority order, that the tags actually contain
    first_hit = next((key for key in categories if key in tags), None)
    if first_hit is None:
        return np.nan, np.nan
    return first_hit, tags[first_hit]

# Work on an independent copy so the column assignments below do not
# trigger SettingWithCopyWarning
building_elements_df = building_elements_df.copy()

# extract_categories returns a (category, sub_category) tuple per row; pull
# each position out with the .str accessor. Unlike unpacking zip(*...),
# this also works when the frame has no rows.
category_pairs = building_elements_df['tags'].apply(extract_categories)
building_elements_df['category'] = category_pairs.str[0]
building_elements_df['sub_category'] = category_pairs.str[1]

# The boolean helper column is no longer needed ('nodes' is kept)
building_elements_df = building_elements_df.drop(columns=['is_building'])

# Drop ways that have no node list; every other element is kept
way_without_nodes = (building_elements_df['type'] == 'way') & building_elements_df['nodes'].isna()
building_elements_df = building_elements_df[~way_without_nodes]

building_elements_df


Unnamed: 0_level_0,type,lat,lon,tags,nodes,members,category,sub_category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
365687698,node,49.219983,-122.988481,"{'amenity': 'post_box', 'collection_times': 'M...",,,amenity,post_box
365786259,node,49.280256,-122.928894,{'amenity': 'telephone'},,,amenity,telephone
411900947,node,49.277918,-122.912361,"{'addr:housenumber': '8901', 'addr:street': 'C...",,,amenity,restaurant
411900948,node,49.278119,-122.912285,"{'addr:housenumber': '8910', 'addr:street': 'U...",,,amenity,cafe
416105127,node,49.225474,-122.998558,"{'access': 'customers', 'amenity': 'parking_en...",,,amenity,parking_entrance
...,...,...,...,...,...,...,...,...
6903757060,node,49.278053,-122.909996,"{'amenity': 'bench', 'backrest': 'no', 'direct...",,,amenity,bench
6903757061,node,49.278099,-122.910300,"{'amenity': 'bench', 'direction': '100'}",,,amenity,bench
6903757062,node,49.278094,-122.910332,"{'amenity': 'bench', 'direction': '190'}",,,amenity,bench
6903757063,node,49.278118,-122.910367,"{'amenity': 'bench', 'direction': '10'}",,,amenity,bench


In [14]:
# Derive a name, category and type for each element; unnamed elements get a name built from their address
def extract_category_type(tags):
    """Return ``pd.Series([name, category, type])`` derived from ``tags``.

    * category / type: the matching tag key and its value. When several of
      the keys are present the precedence is leisure, then shop, then
      building, then amenity (note this differs from the order used by
      ``extract_categories``).
    * name: the 'name' tag; if it is absent and both 'addr:housenumber' and
      'addr:street' exist, the two are joined with a space.

    All three entries are None when ``tags`` is not a dict.
    """
    if not isinstance(tags, dict):
        return pd.Series([None, None, None])

    # Highest-priority key first; the first one found wins
    matched_key = next(
        (key for key in ('leisure', 'shop', 'building', 'amenity') if key in tags),
        None,
    )
    matched_value = tags[matched_key] if matched_key is not None else None

    # Fall back to the street address for houses/apartments without a name
    label = tags.get('name')
    if label is None and 'addr:housenumber' in tags and 'addr:street' in tags:
        label = tags['addr:housenumber'] + ' ' + tags['addr:street']

    return pd.Series([label, matched_key, matched_value])

# Derive the 'Name', 'Category' and 'Type' columns from each element's tags
building_elements_df[['Name', 'Category', 'Type']] = (
    building_elements_df['tags'].apply(extract_category_type)
)



In [15]:
# Types to keep.
# To inspect the available values: print(building_elements_df['sub_category'].unique())

relevant_types = [
    'alcohol', 'apartments', 'arts_centre', 'atm',
    'bakery', 'bank', 'bar', 'beauty', 'books', 'bus_station', 'butcher',
    'cafe', 'car_repair', 'car_parts', 'car_wash', 'childcare', 'church',
    'cinema', 'clinic', 'clothes', 'community_centre', 'confectionery', 'convenience',
    'dentist', 'deli', 'doctors',
    'electronics',
    'fast_food', 'fitness_centre', 'fitness_station', 'florist', 'food_court', 'fuel', 'furniture',
    'gift', 'greengrocer', 'grocery',
    'hairdresser', 'hospital', 'house',
    'ice_cream',
    'jewelry',
    'kindergarten',
    'laundry', 'library',
    'mall', 'marketplace', 'mobile_phone',
    'optician',
    'park', 'pet', 'pharmacy', 'playground', 'post_office', 'pub',
    'residential', 'restaurant',
    'school', 'sports_centre', 'supermarket', 'swimming_pool',
    'theatre', 'toys', 'townhall', 'transportation',
    'university',
    'variety_store', 'veterinary',
]

# Keep only the rows whose 'Type' is one of the relevant types
building_elements_df = building_elements_df.loc[
    building_elements_df['Type'].isin(relevant_types)
]
# building_elements_df.to_csv('building_elements_df.csv')
building_elements_df

Unnamed: 0_level_0,type,lat,lon,tags,nodes,members,category,sub_category,Name,Category,Type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
411900947,node,49.277918,-122.912361,"{'addr:housenumber': '8901', 'addr:street': 'C...",,,amenity,restaurant,Pho 99,amenity,restaurant
411900948,node,49.278119,-122.912285,"{'addr:housenumber': '8910', 'addr:street': 'U...",,,amenity,cafe,Starbucks,amenity,cafe
471726425,node,49.260359,-123.005519,"{'amenity': 'fast_food', 'brand': 'McDonald's'...",,,amenity,fast_food,McDonald's,amenity,fast_food
482696829,node,49.279475,-122.969358,"{'addr:housenumber': '6508', 'addr:street': 'H...",,,amenity,pharmacy,Shoppers Drug Mart,amenity,pharmacy
482696840,node,49.279419,-122.967118,"{'addr:housenumber': '6564', 'addr:street': 'H...",,,shop,supermarket,Safeway,shop,supermarket
...,...,...,...,...,...,...,...,...,...,...,...
15994410,relation,,,"{'addr:city': 'Burnaby', 'addr:housenumber': '...",,"[{'type': 'way', 'ref': 308352593, 'role': 'ou...",building,apartments,Burnaby Center,building,apartments
2633389647,node,49.223734,-122.998752,"{'addr:city': 'Burnaby', 'addr:housenumber': '...",,,amenity,clinic,Total Therapy,amenity,clinic
8986309931,node,49.266657,-123.001316,"{'amenity': 'atm', 'brand': 'Vancity', 'brand:...",,,amenity,atm,Vancity,amenity,atm
1724622640,node,49.225477,-122.990987,"{'addr:city': 'Burnaby', 'addr:housenumber': '...",,,shop,deli,Metrotown Polish Deli,shop,deli


In [17]:
# Function to calculate the center point of a building based on its nodes
def calculate_building_center(row, nodes_df=None):
    """Return the (latitude, longitude) to use for one element.

    When ``row['nodes']`` is a non-empty list, the result is the mean 'lat'
    and mean 'lon' of those node ids as looked up in ``nodes_df``. Node ids
    that are not present in ``nodes_df`` are skipped instead of raising
    ``KeyError``; if none of them is present both means are NaN.

    Otherwise the element's own ``row['lat']`` / ``row['lon']`` are returned
    unchanged.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'nodes', 'lat' and 'lon' (e.g. a DataFrame row).
    nodes_df : pandas.DataFrame, optional
        Table indexed by node id with 'lat' and 'lon' columns. Defaults to
        the module-level ``pure_nodes_df``.

    Returns
    -------
    pandas.Series
        Two values: latitude, longitude.
    """
    if nodes_df is None:
        nodes_df = pure_nodes_df

    nodes = row['nodes']
    # Type check first: a missing node list shows up as NaN, which is truthy
    if isinstance(nodes, list) and nodes:
        # Only look up ids that exist, so an unknown id cannot abort the run
        known_nodes = [node_id for node_id in nodes if node_id in nodes_df.index]
        building_nodes = nodes_df.loc[known_nodes]

        # Calculate the mean latitude and longitude
        mean_lat = building_nodes['lat'].mean()
        mean_lon = building_nodes['lon'].mean()
    else:
        mean_lat = row['lat']
        mean_lon = row['lon']

    return pd.Series([mean_lat, mean_lon])

# Keep only the elements that have a Name (an explicit name or one built
# from the address), then take an independent copy. The copy has to come
# after the filter: filtering a copy yields a new slice, and assigning
# columns to that slice is what raises SettingWithCopyWarning.
building_elements_df = building_elements_df[building_elements_df['Name'].notna()].copy()

# Compute the centre of every element; this creates the 'Latitude' and
# 'Longitude' columns, so no NaN pre-initialisation is needed
building_elements_df[['Latitude', 'Longitude']] = building_elements_df.apply(calculate_building_center, axis=1)

# Select only the relevant columns
building_elements_df = building_elements_df[['Latitude', 'Longitude', 'Name', 'Category', 'Type', 'tags']]

building_elements_df


Unnamed: 0_level_0,Latitude,Longitude,Name,Category,Type,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
411900947,49.277918,-122.912361,Pho 99,amenity,restaurant,"{'addr:housenumber': '8901', 'addr:street': 'C..."
411900948,49.278119,-122.912285,Starbucks,amenity,cafe,"{'addr:housenumber': '8910', 'addr:street': 'U..."
471726425,49.260359,-123.005519,McDonald's,amenity,fast_food,"{'amenity': 'fast_food', 'brand': 'McDonald's'..."
482696829,49.279475,-122.969358,Shoppers Drug Mart,amenity,pharmacy,"{'addr:housenumber': '6508', 'addr:street': 'H..."
482696840,49.279419,-122.967118,Safeway,shop,supermarket,"{'addr:housenumber': '6564', 'addr:street': 'H..."
...,...,...,...,...,...,...
15994410,,,Burnaby Center,building,apartments,"{'addr:city': 'Burnaby', 'addr:housenumber': '..."
2633389647,49.223734,-122.998752,Total Therapy,amenity,clinic,"{'addr:city': 'Burnaby', 'addr:housenumber': '..."
8986309931,49.266657,-123.001316,Vancity,amenity,atm,"{'amenity': 'atm', 'brand': 'Vancity', 'brand:..."
1724622640,49.225477,-122.990987,Metrotown Polish Deli,shop,deli,"{'addr:city': 'Burnaby', 'addr:housenumber': '..."


In [18]:

# Discard elements that have no latitude or no longitude, then save the result
building_elements_df = building_elements_df[
    building_elements_df[['Latitude', 'Longitude']].notna().all(axis=1)
]
building_elements_df.to_csv('building_elements_df.csv')
building_elements_df

Unnamed: 0_level_0,Latitude,Longitude,Name,Category,Type,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
411900947,49.277918,-122.912361,Pho 99,amenity,restaurant,"{'addr:housenumber': '8901', 'addr:street': 'C..."
411900948,49.278119,-122.912285,Starbucks,amenity,cafe,"{'addr:housenumber': '8910', 'addr:street': 'U..."
471726425,49.260359,-123.005519,McDonald's,amenity,fast_food,"{'amenity': 'fast_food', 'brand': 'McDonald's'..."
482696829,49.279475,-122.969358,Shoppers Drug Mart,amenity,pharmacy,"{'addr:housenumber': '6508', 'addr:street': 'H..."
482696840,49.279419,-122.967118,Safeway,shop,supermarket,"{'addr:housenumber': '6564', 'addr:street': 'H..."
...,...,...,...,...,...,...
1188355190,49.228410,-122.941400,7855 Elwell Street,building,house,"{'addr:housenumber': '7855', 'addr:street': 'E..."
2633389647,49.223734,-122.998752,Total Therapy,amenity,clinic,"{'addr:city': 'Burnaby', 'addr:housenumber': '..."
8986309931,49.266657,-123.001316,Vancity,amenity,atm,"{'amenity': 'atm', 'brand': 'Vancity', 'brand:..."
1724622640,49.225477,-122.990987,Metrotown Polish Deli,shop,deli,"{'addr:city': 'Burnaby', 'addr:housenumber': '..."


In [19]:
# Map each fine-grained type onto a broader category.
# Start from an independent copy of 'building_elements_df'.
building_elements_df = building_elements_df.copy(deep=True)

# Types grouped under the broader category they roll up to
_types_by_category = {
    'food_beverage': (
        'restaurant', 'cafe', 'fast_food', 'alcohol', 'pub', 'bar',
        'food_court', 'bakery', 'deli', 'confectionery', 'ice_cream',
    ),
    'market': (
        'pharmacy', 'supermarket', 'greengrocer', 'butcher',
        'convenience', 'grocery', 'marketplace', 'variety_store',
    ),
    'service': (
        'fuel', 'bank', 'post_office', 'atm', 'laundry', 'car_repair',
        'car_wash', 'dentist', 'doctors', 'clinic', 'hospital', 'veterinary',
        'optician', 'hairdresser', 'beauty', 'bus_station', 'transportation',
    ),
    'shops': (
        'books', 'furniture', 'clothes', 'pet', 'mobile_phone',
        'jewelry', 'electronics', 'toys', 'florist', 'gift',
    ),
    'leisure': (
        'park', 'sports_centre', 'fitness_centre', 'fitness_station',
        'playground', 'swimming_pool',
    ),
    'entertainment': (
        'library', 'arts_centre', 'theatre', 'cinema', 'mall',
    ),
    'education': (
        'university', 'school', 'kindergarten', 'childcare',
    ),
    'public': (
        'community_centre', 'townhall', 'church',
    ),
}

# Invert the grouping into the flat type -> category lookup
type_to_category = {
    type_name: category
    for category, type_names in _types_by_category.items()
    for type_name in type_names
}

# Translate each type to its broader category; a type without a mapping
# keeps its own name
building_elements_df['New_Category'] = (
    building_elements_df['Type']
    .map(type_to_category)
    .fillna(building_elements_df['Type'])
)

# Write the result to CSV (the index is not written)
building_elements_df.to_csv('relevant_buildings.csv', index=False)

building_elements_df


Unnamed: 0_level_0,Latitude,Longitude,Name,Category,Type,tags,New_Category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
411900947,49.277918,-122.912361,Pho 99,amenity,restaurant,"{'addr:housenumber': '8901', 'addr:street': 'C...",food_beverage
411900948,49.278119,-122.912285,Starbucks,amenity,cafe,"{'addr:housenumber': '8910', 'addr:street': 'U...",food_beverage
471726425,49.260359,-123.005519,McDonald's,amenity,fast_food,"{'amenity': 'fast_food', 'brand': 'McDonald's'...",food_beverage
482696829,49.279475,-122.969358,Shoppers Drug Mart,amenity,pharmacy,"{'addr:housenumber': '6508', 'addr:street': 'H...",market
482696840,49.279419,-122.967118,Safeway,shop,supermarket,"{'addr:housenumber': '6564', 'addr:street': 'H...",market
...,...,...,...,...,...,...,...
1188355190,49.228410,-122.941400,7855 Elwell Street,building,house,"{'addr:housenumber': '7855', 'addr:street': 'E...",house
2633389647,49.223734,-122.998752,Total Therapy,amenity,clinic,"{'addr:city': 'Burnaby', 'addr:housenumber': '...",service
8986309931,49.266657,-123.001316,Vancity,amenity,atm,"{'amenity': 'atm', 'brand': 'Vancity', 'brand:...",service
1724622640,49.225477,-122.990987,Metrotown Polish Deli,shop,deli,"{'addr:city': 'Burnaby', 'addr:housenumber': '...",food_beverage
