# < Meaningful title >

##### Imports

In [2]:
# import requests
# import googlemaps
# import pandas as pd
# import numpy as np
# #from geopy.distance import geodesic
# import json
# #import folium
# import time
# from datetime import datetime
# from itertools import permutations
# #from geopy.distance import great_circle
# from io import StringIO

import imports as i
import importlib
importlib.reload(i)

# Custom util functions
# import sys; sys.path.append("./libraries/")
# from utils import *

  from .autonotebook import tqdm as notebook_tqdm


<module 'imports' from 'c:\\Users\\Veronii\\Desktop\\data-wild-west\\code\\imports.py'>

### Settings

##### Reproducibility settings

In [3]:
# Random seed
i.np.random.seed = 7

# Relative Paths
raw_data = "../data/raw_data/"
process_data = "../data/process_data"

# Flags
collect = False # Flag to collect data or load existent raw_data

##### Google API

In [4]:
key = open("./Google_API_key.txt").readline()
gmaps = i.googlemaps.Client(key=key)

FileNotFoundError: [Errno 2] No such file or directory: './Google_API_key.txt'

# 1. Data Collection

We start by creating a list of query values that relate to the dataset. We are interested in getting mostly reviews (and some other metadata) on specific fitness facilities (i.e. popular chains) from main cities in Denmark. To do this, we will compute the query list as a combination of cities and fitness chains. 

In [5]:
# List of cities
cities = ['Copenhagen', 'Aalborg', 'Arhus', 'Odense']
 
# Popular fitness chains
gyms = ["PureGym", "SATS", "Vesterbronx"]

# Query list
query_list = [g + " " + c for g in gyms for c in cities]

print(query_list)

['PureGym Copenhagen', 'PureGym Aalborg', 'PureGym Arhus', 'PureGym Odense', 'SATS Copenhagen', 'SATS Aalborg', 'SATS Arhus', 'SATS Odense', 'Vesterbronx Copenhagen', 'Vesterbronx Aalborg', 'Vesterbronx Arhus', 'Vesterbronx Odense']


## 1.1 Google Maps API

The Google maps API takes a single query string to search for results (similar to the User Interface searchbox). Therefore, we combine popular fitness facilities with main Danish cities as our query keys.

### 1.1.1 Reviews
We start by getting the reviews for our query list.

Get responses for all the queries from the API

In [7]:
if collect:
    # Get response for queries
    dfs = []

    # For each query in the query list
    for query in query_list:  
        # Get the response using our custom made querier
        dfs.append(i.google_querier(gmaps, query))

    google_reviews = i.pd.concat(dfs)

    # Save to disk
    google_reviews.to_csv(raw_data + "google_reviews.csv", index=False, encoding="utf-8")

else:
    google_reviews = i.pd.read_csv(raw_data + "google_reviews.csv")

Check the results.

In [8]:
check_dataframe_results(google_reviews)

NameError: name 'check_dataframe_results' is not defined

### 1.1.2 Nearby Transportation
We are interested in collecting the nearby transportation to the fitness centers.

In [9]:
if collect:

    # Radius of search in meters
    radius = 500

    # Transportation type key (similar to what one would input in Google Maps search box)
    transportation_type = ['bus_station', 'train_station', 'transit_station'] # Avaliable transportation: only bus station, train station and transit station (which includes metro)

    # Container
    nearby_transportation = []

    # We iterate through all our fitness centers, and retrieve nearby transportations
    for ix, row in google_reviews.iterrows():
        # Extract info from fitness center
        place_id = row.place_id
        location = {"lat": row.lat, "lng": row.lng}
        # Look at nearby transportation
        df = google_nearby(gmaps, place_id = place_id, keys = transportation_type, location = location, radius = radius)
        # Append results
        nearby_transportation.append(df)

    # Join all results
    nearby_transportation = i.pd.concat(nearby_transportation)

    # Save to disk
    nearby_transportation.to_csv(raw_data + "transportation.csv", index=False, encoding="utf-8")

else:
    nearby_transportation = i.pd.read_csv(raw_data + "transportation.csv")

Check the results.

In [10]:
check_dataframe_results(nearby_transportation)

NameError: name 'check_dataframe_results' is not defined

## 1.2 Trustpilot WebCrawler

Trustpilot is a Danish consumer review website very popular in Denmark. It is publicly available and easy to access, but it does not provide any API integration. Therefore, we use a simple webcrawler to extract the reviews of interest.

In [11]:
if collect:
    dfs = []

    # Reuse the gyms
    for g in gyms:
        df = trustpilot_crawler(key=g, verbose=False)

        # Append the facility DF to main df
        dfs.append(df)

    # Join all DFs
    trustpilot_reviews = i.pd.concat(dfs)

    # Save to disk
    trustpilot_reviews.to_csv(raw_data + "trustpilot_reviews.csv", index=False, encoding="utf-8")

else:
    trustpilot_reviews = i.pd.read_csv(raw_data + "trustpilot_reviews.csv")

Check the results.

In [12]:
check_dataframe_results(trustpilot_reviews)

NameError: name 'check_dataframe_results' is not defined

## 1.3 Københavns Kommune WebCrawler

The Københavns Kommune website provides an extensive list of training facilities, both indoors and outdoors. Since this is a dynamic site built on JavaScript, the traditional webcrawler approach is not suitable, and thus we will use an approach that simulates human-like interactions using Selenium.

In [13]:
if collect:

    # Create crawler instance
    kbh_crawler = KBHFacilitiesWebCrawler()
    # Get dataframe with entries
    kbh_facilities = kbh_crawler.get()

    # Save to disk
    kbh_facilities.to_csv(raw_data + "kbh_facilities.csv", index=False, encoding="utf-16") # Since some Danish characters don't map to utf-8, we use utf-16
    

else:
    kbh_facilities = i.pd.read_csv(raw_data + "kbh_facilities.csv", encoding="utf-16")

Check the results.

In [14]:
check_dataframe_results(kbh_facilities)

NameError: name 'check_dataframe_results' is not defined

We observe that this dataset only contains addresses, but not geolocation (latitude and longitude). We then try to collect that data from Google Maps.

In [15]:
if collect:
    # We create the new columns
    kbh_facilities['lat'] = None
    kbh_facilities['lng'] = None

    # Subset only facilities with either address OR location
    for index, row in kbh_facilities[~kbh_facilities.isna()].iterrows():

        #look first if we can find lat and lng for the address
        lat_lng = get_lat_lng(gmaps, row.address)
        
        # if there is no address or we can't find the coordiantes using address we try the location
        if not lat_lng:
            lat_lng = get_lat_lng(gmaps, row.location)
            
        # if none of location and address works we continue with the next (leave it at None)
        if not lat_lng:
            continue

        kbh_facilities.at[index, 'lat'] = lat_lng[0]
        kbh_facilities.at[index, 'lng'] = lat_lng[1]

    # Save to disk
    kbh_facilities.to_csv(raw_data + "kbh_facilities.csv", index=False, encoding="utf-16") # Since some Danish characters don't map to utf-8, we use utf-16

else:
    kbh_facilities = i.pd.read_csv(raw_data + "kbh_facilities.csv", encoding="utf-16")

Check the results.

In [16]:
check_dataframe_results(kbh_facilities)

NameError: name 'check_dataframe_results' is not defined