# Importing and preparing supermarkets data (Task 3)

## Libraries and settings

In [1]:
# Libraries
import os
import fnmatch
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Report the directory the notebook is running in
print('Current working directory:', os.getcwd())

# List the .json files that sit next to the notebook, one name per line
flist = fnmatch.filter(os.listdir(os.curdir), '*.json')
for json_name in flist:
    print(json_name)

Current working directory: /Users/john/School/Sem5/DA/data_analytics/Week_02
supermarkets.json


## Importing data

In [2]:
# Load the supermarkets JSON file into a pandas data frame
df1 = pd.read_json(path_or_buf='supermarkets.json', encoding='utf-8')

# Preview the first five records (five is the default of head())
df1.head()

Unnamed: 0,type,id,lat,lon,tags
0,node,33126515,47.155616,9.037915,"{'brand': 'Spar', 'brand:wikidata': 'Q610492',..."
1,node,36726161,47.226191,8.980329,"{'addr:city': 'Uznach', 'addr:housenumber': '2..."
2,node,39768209,47.225069,8.969981,"{'addr:city': 'Uznach', 'addr:postcode': '8730..."
3,node,39947904,47.376732,8.542161,"{'addr:city': 'Zürich', 'addr:country': 'CH', ..."
4,node,48932835,47.37502,8.522895,"{'addr:city': 'Zürich', 'addr:housenumber': '7..."


## Count number of rows and columns in the data frame

In [3]:
# Dimension as a (rows, columns) tuple
print('Dimension:', df1.shape)

# Row count
print('Number of rows:', len(df1))

# Column count
print('Number of columns:', len(df1.columns))

Dimension: (3392, 5)
Number of rows: 3392
Number of columns: 5


## Column 'tags' is a pandas Series of dictionaries -> convert it to a data frame

In [5]:
# Types of the 'tags' column and of its first item
print(type(df1['tags']))
print(type(df1['tags'][0]))

# Keys available in the first item of the 'tags' column
print(df1['tags'][0].keys())

# Expand the dictionaries into columns, keep the brand/shop/address fields
# and give the 'addr:*' columns shorter names
df2 = (
    pd.DataFrame.from_records(df1['tags'])
    [['brand', 'shop', 'addr:city', 'addr:street', 'addr:housenumber', 'addr:postcode']]
    .rename(columns={
        'addr:city': 'city',
        'addr:street': 'street',
        'addr:housenumber': 'housenumber',
        'addr:postcode': 'postcode',
    })
)

# Preview the first records of the new data frame
df2.head()

<class 'pandas.core.series.Series'>
<class 'dict'>
dict_keys(['brand', 'brand:wikidata', 'brand:wikipedia', 'name', 'opening_hours', 'shop'])


Unnamed: 0,brand,shop,city,street,housenumber,postcode
0,Spar,supermarket,,,,
1,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730.0
2,Coop,supermarket,Uznach,,,8730.0
3,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001.0
4,Migros,supermarket,Zürich,Wengistrasse,7.0,8004.0


## Merge df1 and df2

In [6]:
# Combine the node columns of df1 with the tag columns of df2,
# matching rows on the index of both frames
df = df1[['type', 'id', 'lat', 'lon']].merge(
    df2[['brand', 'shop', 'city', 'street', 'housenumber', 'postcode']],
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,
1,node,36726161,47.226191,8.980329,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730.0
2,node,39768209,47.225069,8.969981,Coop,supermarket,Uznach,,,8730.0
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001.0
4,node,48932835,47.37502,8.522895,Migros,supermarket,Zürich,Wengistrasse,7.0,8004.0


## Count and identify the number of missing values (if any)

In [7]:
# Number of missing values per column
print(df.isna().sum())

# Rows with a missing value, e.g. in the 'city' column:
df.loc[df['city'].isna()]

type              0
id                0
lat               0
lon               0
brand          1065
shop              0
city           1777
street         1608
housenumber    1680
postcode       1709
dtype: int64


Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,
5,node,60271452,47.406671,9.305450,,supermarket,,,,
6,node,70656485,47.491253,8.733981,,supermarket,,,,
10,node,81321513,47.532917,9.066408,Landi,supermarket,,,,
13,node,95582038,47.050385,9.059214,,supermarket,,,,
...,...,...,...,...,...,...,...,...,...,...
3384,node,11083317088,46.862184,9.531169,Lidl,supermarket,,,,
3386,node,11098091830,46.205111,6.130174,Coop,supermarket,,,,
3387,node,11099817248,46.928691,7.561873,,supermarket,,,,
3388,node,11103235832,46.166742,8.771970,Migros,supermarket,,,,


## Count and identify duplicated values (if any)

In [8]:
# Number of rows that are exact duplicates of an earlier row
print(df.duplicated().sum())

# Rows whose 'id' repeats an earlier row's 'id', e.g.:
df[df.duplicated(subset=['id'])]

0


Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode


## Get data types of all variables

In [9]:
# Show the data type of every column (pandas reports string columns with the dtype 'object')
df.dtypes

type            object
id               int64
lat            float64
lon            float64
brand           object
shop            object
city            object
street          object
housenumber     object
postcode        object
dtype: object

### Save data to file

In [10]:
# Write the prepared data frame to a comma-separated, UTF-8 encoded file
# without the index column
df.to_csv(path_or_buf='supermarkets_data_prepared.csv', sep=",", encoding='utf-8', index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [11]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: OS family, platform and release, current timestamp and Python version,
# framed by two separator lines. Each tuple is printed as one line.
for parts in (
    ('-----------------------------------',),
    (os.name.upper(),),
    (platform.system(), '|', platform.release()),
    ('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
    ('Python Version:', python_version()),
    ('-----------------------------------',),
):
    print(*parts)

-----------------------------------
POSIX
Darwin | 23.0.0
Datetime: 2023-09-29 18:58:50
Python Version: 3.9.6
-----------------------------------


# Additional filters on supermarkets

### Filter only Migros supermarkets in the city of Zürich. (Subtask f)

In [16]:
# Keep only the rows whose brand is Migros and whose city is Zürich
df_filtered = df.loc[df['brand'].eq('Migros') & df['city'].eq('Zürich')]

# Show the matching rows as plain text
print(df_filtered)

      type          id        lat       lon   brand         shop    city  \
4     node    48932835  47.375020  8.522895  Migros  supermarket  Zürich   
11    node    83330862  47.344749  8.529981  Migros  supermarket  Zürich   
16    node   119249170  47.375255  8.536107  Migros  supermarket  Zürich   
50    node   262400822  47.364072  8.530945  Migros  supermarket  Zürich   
71    node   267346993  47.385598  8.531471  Migros  supermarket  Zürich   
82    node   270958272  47.358367  8.554074  Migros  supermarket  Zürich   
83    node   271028298  47.365678  8.548041  Migros  supermarket  Zürich   
85    node   271029206  47.364596  8.553846  Migros  supermarket  Zürich   
89    node   273942728  47.357610  8.571369  Migros  supermarket  Zürich   
192   node   310133197  47.419522  8.548286  Migros  supermarket  Zürich   
208   node   312838980  47.379200  8.508799  Migros  supermarket  Zürich   
224   node   321361643  47.392553  8.538428  Migros  supermarket  Zürich   
271   node  

### Filter and count all Coop supermarkets in the cities of Zürich, Basel & Bern. (Subtask g)

In [22]:
# Keep only Coop supermarkets located in Zürich, Basel or Bern.
# The filter is computed in this cell so that the count below does not depend
# on whatever `df_filtered` happens to hold from a previously executed cell.
df_filtered = df.loc[(df['brand'] == 'Coop') & df['city'].isin(['Zürich', 'Basel', 'Bern'])]

# Count the number of Coop supermarkets in the filtered DataFrame
coop_count = len(df_filtered)

# Print the filtered DataFrame and the count
print("Filtered DataFrame:")
print(df_filtered)

print("\nNumber of Coop supermarkets in Zürich, Basel, and Bern:", coop_count)

Filtered DataFrame:
      type           id        lat       lon brand         shop    city  \
3     node     39947904  47.376732  8.542161  Coop  supermarket  Zürich   
9     node     79977755  47.340070  8.530546  Coop  supermarket  Zürich   
59    node    265776668  47.376417  8.559594  Coop  supermarket  Zürich   
63    node    266630559  47.377716  8.511219  Coop  supermarket  Zürich   
70    node    267345511  47.385809  8.516574  Coop  supermarket  Zürich   
72    node    267468996  47.364872  8.521006  Coop  supermarket  Zürich   
75    node    268603429  47.367360  8.546174  Coop  supermarket  Zürich   
81    node    270692983  47.357940  8.554646  Coop  supermarket  Zürich   
84    node    271028686  47.366773  8.548079  Coop  supermarket  Zürich   
86    node    271029581  47.364300  8.555129  Coop  supermarket  Zürich   
96    node    276363821  47.418888  8.505699  Coop  supermarket  Zürich   
121   node    283103824  47.393648  8.529543  Coop  supermarket  Zürich   
122  

### Filter supermarkets with available brand, city, house number and postcode. (Subtask h)

In [31]:
# Keep the rows in which brand, city, house number and postcode are all present
filtered_df = df[df[['brand', 'city', 'housenumber', 'postcode']].notna().all(axis=1)]

# Show the matching rows as plain text
print("Filtered DataFrame:")
print(filtered_df)


Filtered DataFrame:
      type           id        lat       lon   brand         shop  \
1     node     36726161  47.226191  8.980329  Migros  supermarket   
3     node     39947904  47.376732  8.542161    Coop  supermarket   
4     node     48932835  47.375020  8.522895  Migros  supermarket   
7     node     70656488  47.491874  8.706448  Migros  supermarket   
8     node     75749133  47.340967  8.530601    ALDI  supermarket   
...    ...          ...        ...       ...     ...          ...   
3350  node  10814018169  47.353857  8.436716    Coop  supermarket   
3362  node  10982669725  47.349782  8.258690    ALDI  supermarket   
3370  node  11025130806  47.059301  7.620697  Denner  supermarket   
3378  node  11049758254  47.338327  8.520261    Spar  supermarket   
3385  node  11096932868  46.309025  7.969107  Denner  supermarket   

                  city          street housenumber postcode  
1               Uznach  Zürcherstrasse          25     8730  
3               Zürich   Ba

### Include opening hours as additional variable in the data frame. (Subtask i)

In [32]:
# Types of the 'tags' column and of its first item
print(type(df1['tags']))
print(type(df1['tags'][0]))

# Keys available in the first item of the 'tags' column
print(df1['tags'][0].keys())

# Expand the dictionaries into columns, keep the brand/shop/address fields
# plus 'opening_hours', and give the 'addr:*' columns shorter names
df2 = (
    pd.DataFrame.from_records(df1['tags'])
    [['brand', 'shop', 'addr:city', 'addr:street', 'addr:housenumber', 'addr:postcode', 'opening_hours']]
    .rename(columns={
        'addr:city': 'city',
        'addr:street': 'street',
        'addr:housenumber': 'housenumber',
        'addr:postcode': 'postcode',
    })
)

# Preview the first records of the new data frame
df2.head()

<class 'pandas.core.series.Series'>
<class 'dict'>
dict_keys(['brand', 'brand:wikidata', 'brand:wikipedia', 'name', 'opening_hours', 'shop'])


Unnamed: 0,brand,shop,city,street,housenumber,postcode,opening_hours
0,Spar,supermarket,,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00
1,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730.0,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17..."
2,Coop,supermarket,Uznach,,,8730.0,
3,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001.0,Mo-Sa 06:00-22:00
4,Migros,supermarket,Zürich,Wengistrasse,7.0,8004.0,Mo-Sa 08:00-21:00; PH off


### Filter supermarkets with available opening hours. (Subtask j)

In [39]:
# Merge df1 and df2 (df2 now also carries 'opening_hours'), matching rows on the index
df = pd.merge(df1[['type', 'id', 'lat', 'lon']], 
              df2[['brand', 'shop', 'city', 'street', 'housenumber', 'postcode', 'opening_hours']],
              left_index=True, 
              right_index=True)

# Filter supermarkets with non-missing values for brand, city, house number,
# postcode and opening hours
filtered_df = df[df['brand'].notna() & df['city'].notna() & df['housenumber'].notna() & df['postcode'].notna() & df['opening_hours'].notna()]

# Print the filtered DataFrame
print("Filtered DataFrame:")
print(filtered_df)


Filtered DataFrame:
      type           id        lat       lon   brand         shop  \
1     node     36726161  47.226191  8.980329  Migros  supermarket   
3     node     39947904  47.376732  8.542161    Coop  supermarket   
4     node     48932835  47.375020  8.522895  Migros  supermarket   
7     node     70656488  47.491874  8.706448  Migros  supermarket   
8     node     75749133  47.340967  8.530601    ALDI  supermarket   
...    ...          ...        ...       ...     ...          ...   
3349  node  10814018168  47.354212  8.436737  Migros  supermarket   
3350  node  10814018169  47.353857  8.436716    Coop  supermarket   
3362  node  10982669725  47.349782  8.258690    ALDI  supermarket   
3370  node  11025130806  47.059301  7.620697  Denner  supermarket   
3378  node  11049758254  47.338327  8.520261    Spar  supermarket   

                  city          street housenumber postcode  \
1               Uznach  Zürcherstrasse          25     8730   
3               Zürich   