# Importing and preparing supermarkets data

## Libraries and settings

In [2]:
# Libraries
import os
import fnmatch
import pandas as pd

# Ignore warnings
# NOTE: called without a category or module filter, this suppresses every
# warning raised through the `warnings` module for the rest of the session,
# so deprecation notices from libraries will not be shown either.
import warnings
warnings.filterwarnings("ignore")

# Report the directory the notebook is executed from; the relative paths used
# further down are resolved against it
print('Current working directory:', os.getcwd())

# Show .json files in the current working directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the printed list identical on every run and on every machine.
flist = sorted(fnmatch.filter(os.listdir('.'), '*.json'))
for fname in flist:
    print(fname)

Current working directory: /workspaces/remo-caprice-project/Grundlagen
supermarkets.json


## Importing data

In [8]:
# Load the records stored in supermarkets.json (UTF-8 encoded) into a pandas data frame
df1 = pd.read_json(path_or_buf='./supermarkets.json', encoding='utf-8')

# Preview the first five records
df1.head(n=5)

Unnamed: 0,type,id,lat,lon,tags
0,node,33126515,47.155616,9.037915,"{'addr:city': 'Schänis', 'addr:country': 'CH',..."
1,node,36726161,47.226191,8.980329,"{'addr:city': 'Uznach', 'addr:housenumber': '2..."
2,node,39768209,47.225154,8.969868,"{'addr:city': 'Uznach', 'addr:postcode': '8730..."
3,node,39947904,47.376732,8.542161,"{'addr:city': 'Zürich', 'addr:country': 'CH', ..."
4,node,48932835,47.37502,8.522895,"{'addr:city': 'Zürich', 'addr:housenumber': '7..."


## Count number of rows and columns in the data frame

In [9]:
# df1.shape is a (rows, columns) tuple; unpack it once and report the whole
# dimension followed by each component
n_rows, n_cols = df1.shape

print('Dimension:', (n_rows, n_cols))
print('Number of rows:', n_rows)
print('Number of columns:', n_cols)

Dimension: (3469, 5)
Number of rows: 3469
Number of columns: 5


## Column 'tags' is a pandas Series with dictionaries -> change to data frame

In [10]:
# Access the column with brackets and its first item by position, so the
# inspection below does not depend on the index labels of df1
tags = df1['tags']

# Type of the column and of its first item
print(type(tags))
print(type(tags.iloc[0]))

# Keys available in the first item of column tags
print(tags.iloc[0].keys())

# Tag keys to keep, mapped to the column names used in the rest of the notebook
tag_columns = {'brand': 'brand',
               'shop': 'shop',
               'addr:city': 'city',
               'addr:street': 'street',
               'addr:housenumber': 'housenumber',
               'addr:postcode': 'postcode',
               'opening_hours': 'opening_hours'}

# Change to data frame: one row per dictionary, one column per key.
# - index=df1.index keeps the rows aligned with df1, which the later
#   index-based merge relies on.
# - reindex() selects the wanted keys; a key that is absent from every record
#   becomes an all-missing column instead of raising a KeyError.
# - rename() applies the shorter column names.
df2 = (
    pd.DataFrame(tags.tolist(), index=df1.index)
    .reindex(columns=list(tag_columns))
    .rename(columns=tag_columns)
)

# Show first records of data frame
df2.head()

<class 'pandas.core.series.Series'>
<class 'dict'>
dict_keys(['addr:city', 'addr:country', 'addr:housenumber', 'addr:postcode', 'addr:street', 'brand', 'brand:wikidata', 'brand:wikipedia', 'check_date:currency:XBT', 'currency:XBT', 'email', 'name', 'opening_hours', 'payment:lightning', 'payment:lightning_contactless', 'payment:onchain', 'phone', 'shop', 'website'])


Unnamed: 0,brand,shop,city,street,housenumber,postcode,opening_hours
0,Spar,supermarket,Schänis,Biltnerstrasse,32.0,8718,Mo-Th 07:30-19:00; Fr 07:30-20:00; Sa 07:30-17:00
1,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17..."
2,Coop,supermarket,Uznach,,,8730,
3,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001,Mo-Sa 06:00-22:00
4,Migros,supermarket,Zürich,Wengistrasse,7.0,8004,Mo-Sa 08:00-21:00; PH off


## Merge df1 and df2

In [11]:
# Merge df1 and df2: put the location columns of df1 next to the tag-derived
# columns of df2, matching rows by their index
location_part = df1[['type', 'id', 'lat', 'lon']]
tag_part = df2[['brand', 'shop', 'city', 'street', 'housenumber', 'postcode', 'opening_hours']]
df = location_part.merge(tag_part, left_index=True, right_index=True)

# Preview the first five merged records
df.head(5)

Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode,opening_hours
0,node,33126515,47.155616,9.037915,Spar,supermarket,Schänis,Biltnerstrasse,32.0,8718,Mo-Th 07:30-19:00; Fr 07:30-20:00; Sa 07:30-17:00
1,node,36726161,47.226191,8.980329,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17..."
2,node,39768209,47.225154,8.969868,Coop,supermarket,Uznach,,,8730,
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001,Mo-Sa 06:00-22:00
4,node,48932835,47.37502,8.522895,Migros,supermarket,Zürich,Wengistrasse,7.0,8004,Mo-Sa 08:00-21:00; PH off


## Count and identify the number of missing values (if any)

In [12]:
# Number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Rows with a missing value in a given column, here 'city' as an example
city_is_missing = df['city'].isna()
df[city_is_missing]

type                0
id                  0
lat                 0
lon                 0
brand             756
shop                0
city             1691
street           1544
housenumber      1601
postcode         1623
opening_hours     872
dtype: int64


Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode,opening_hours
5,node,70656485,47.491253,8.733981,,supermarket,,,,,
9,node,81321513,47.532917,9.066408,Landi,supermarket,,,,,"Mo-Sa 08:00-12:00, 13:30-18:00"
14,node,112148390,47.448772,8.215075,,supermarket,,,,,
16,node,121994447,47.545863,7.583605,Pam,supermarket,,,,,
20,node,128269749,47.381684,8.238918,,supermarket,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
3461,node,13275370001,46.995038,6.943915,,supermarket,,,,,
3462,node,13285536907,46.213355,6.129913,MM Migros,supermarket,,,,,
3464,node,13302166763,47.369876,8.508189,Müller,supermarket,,,,,
3465,node,13302401281,47.203519,8.758658,,supermarket,,,,,Mo-Sa 07:00-20:00


## Count and identify duplicated values (if any)

In [13]:
# Number of rows that repeat an earlier row in every column
n_duplicated_rows = df.duplicated().sum()
print(n_duplicated_rows)

# Rows whose 'id' already occurred in an earlier row
id_is_repeated = df.duplicated(subset=['id'])
df[id_is_repeated]

0


Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode,opening_hours


## Get data types of all variables

In [14]:
# Show the data type of every column of the merged data frame `df`
# (note that in pandas, a string is referred to as 'object').
# The bare expression is the last statement of the cell, so the notebook
# renders the resulting Series as the cell output.
df.dtypes

type              object
id                 int64
lat              float64
lon              float64
brand             object
shop              object
city              object
street            object
housenumber       object
postcode          object
opening_hours     object
dtype: object

## Filter the supermarkets

In [15]:
# Subset 1: Migros supermarkets located in Zürich
df_filtered_1 = df.loc[(df['brand'] == 'Migros') & (df['city'] == 'Zürich')]

# Subset 2: Coop supermarkets located in Zürich, Bern or Basel
df_filtered_2 = df.loc[(df['brand'] == 'Coop') & (df['city'].isin(['Zürich', 'Bern', 'Basel']))]

# A notebook cell only renders its last bare expression, so a plain
# `df_filtered_1.head()` in the middle of the cell is evaluated and discarded.
# display() is used to render both previews.
display(df_filtered_1.head())
display(df_filtered_2.head())

Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode,opening_hours
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,Bahnhofbrücke,1,8001,Mo-Sa 06:00-22:00
8,node,79977755,47.34007,8.530546,Coop,supermarket,Zürich,Alte Kalchbühlstrasse,15,8038,Mo-Sa 07:30-21:00; PH off
58,node,265776668,47.376417,8.559594,Coop,supermarket,Zürich,Zürichbergstrasse,75,8044,Mo-Fr 07:00-21:00; Sa 07:30-20:00
62,node,266630559,47.377716,8.511219,Coop,supermarket,Zürich,Badenerstrasse,333,8003,Mo-Sa 07:00-22:00; PH off
70,node,267468996,47.364872,8.521006,Coop,supermarket,Zürich,Uetlibergstrasse,20,8045,Mo-Sa 07:30-21:00


## Count all Coop supermarkets in Zurich, Bern & Basel

In [16]:
# Build the two conditions separately: brand is Coop, city is one of the three cities
is_coop = df['brand'].eq('Coop')
in_selected_city = df['city'].isin(['Zürich', 'Bern', 'Basel'])

# Keep the rows satisfying both conditions and report how many there are
df_coop = df[is_coop & in_selected_city]
print('Number of Coop supermarkets in Zurich, Bern & Basel:', len(df_coop))

Number of Coop supermarkets in Zurich, Bern & Basel: 64


## Filter supermarkets with available brand, city, house number, postcode and opening hours

In [17]:
# Keep only supermarkets with available brand, city, house number, postcode and
# opening hours: a row is dropped as soon as any of these columns is missing
required_columns = ['brand', 'city', 'housenumber', 'postcode', 'opening_hours']
df_cleaned = df.dropna(axis=0, how='any', subset=required_columns)

n_clean_rows, n_clean_cols = df_cleaned.shape
print('Dimension of cleaned data frame:', (n_clean_rows, n_clean_cols))

Dimension of cleaned data frame: (1195, 11)


## Count all supermarkets with all available data

In [18]:
# Number of rows left in df_cleaned, i.e. supermarkets that passed the
# completeness filter
n_complete = len(df_cleaned)
print('Number of supermarkets with all available data:', n_complete)


Number of supermarkets with all available data: 1195


### Save data to file

In [20]:
# Save the filtered data to a comma-separated, UTF-8 encoded file.
# The target file is named '..._filtered.csv', so the frame written is
# df_cleaned (rows with brand, city, house number, postcode and opening hours
# present) rather than the unfiltered merge result df.
# index=False leaves the row index out of the file.
df_cleaned.to_csv('./supermarkets_data_filtered.csv',
                  sep=",",
                  encoding='utf-8',
                  index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [21]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: operating system family, platform and release, current timestamp and
# Python version, framed by two separator lines
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
print(os.name.upper())
print(f"{platform.system()} | {platform.release()}")
print(f"Datetime: {timestamp}")
print(f"Python Version: {python_version()}")
print(separator)

-----------------------------------
POSIX
Linux | 6.8.0-1030-azure
Datetime: 2025-11-28 10:00:56
Python Version: 3.11.14
-----------------------------------
