# Fitness facility reviews in Denmark: data collection and annotation

##### Imports

In [1]:
# Google Maps API
import googlemaps

# Basic libraries
import pandas as pd
import numpy as np
import os

# Custom util functions
import sys; sys.path.append("./libraries/")
from utils import *

import warnings; warnings.filterwarnings('ignore')

### Settings

##### Reproducibility settings

In [2]:
# Random seed
# np.random.seed is a function and has to be called. Assigning a value to it
# replaces the function and leaves NumPy's global generator unseeded.
SEED = 7
np.random.seed(SEED)

# Relative paths (all relative to the notebook's working directory)
RAW_DATA = "../data/raw_data/"
PROCESSED_DATA = "../data/processed_data/"
ANNOTATIONS_DATA = "../annotations/"

# Flags
collect = False # Flag to collect data or load existent raw_data
process = False # Flag to process data or load existent processed data

##### Google API

In [3]:
# Read the API key from the first line of a local text file.
# The context manager closes the file handle, and strip() removes the trailing
# newline that readline() would otherwise leave attached to the key.
with open("./Google_API_key.txt") as key_file:
    key = key_file.readline().strip()

# Client used by every Google Maps request in this notebook
gmaps = googlemaps.Client(key=key)

# Alternative: provide the key directly instead of reading it from the file
#api = None # enter API key here
#gmaps = googlemaps.Client(api)

# 1. Data Collection

We start by creating a list of query values that relate to the dataset. We are interested in getting mostly reviews (and some other metadata) on specific fitness facilities (i.e. popular chains) from main cities in Denmark. To do this, we will compute the query list as a combination of cities and fitness chains. 

In [4]:
# Cities to search in
cities = ['Copenhagen', 'Aalborg', 'Arhus', 'Odense']

# Fitness chains of interest
gyms = ["PureGym", "SATS", "Vesterbronx"]

# One "<chain> <city>" search string per combination; the chain varies slowest
query_list = []
for chain in gyms:
    for city in cities:
        query_list.append(chain + " " + city)

print(query_list)

['PureGym Copenhagen', 'PureGym Aalborg', 'PureGym Arhus', 'PureGym Odense', 'SATS Copenhagen', 'SATS Aalborg', 'SATS Arhus', 'SATS Odense', 'Vesterbronx Copenhagen', 'Vesterbronx Aalborg', 'Vesterbronx Arhus', 'Vesterbronx Odense']


## 1.1 Google Maps API

The Google maps API takes a single query string to search for results (similar to the User Interface searchbox). Therefore, we combine popular fitness facilities with main Danish cities as our query keys.

### 1.1.1 Reviews
We start by getting the reviews for our query list.

Get responses for all the queries from the API

In [5]:
if not collect:
    # Reuse the reviews stored by an earlier collection run
    google_reviews = pd.read_csv(RAW_DATA + "google_reviews.csv")

else:
    # Ask the API for every query, using our custom made querier,
    # and stack the per-query frames into a single one
    responses = [google_querier(gmaps, query) for query in query_list]
    google_reviews = pd.concat(responses)

    # Persist the raw result so later runs can skip the collection
    google_reviews.to_csv(RAW_DATA + "google_reviews.csv", index=False, encoding="utf-8")

Check the results.

In [6]:
# Sanity check of the Google reviews frame: per the recorded output, the helper
# reports the shape, the `info()` summary and the first rows.
check_dataframe_results(google_reviews)

Resulting dataframe has shape (360, 9)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   place_id       360 non-null    object 
 1   type           360 non-null    object 
 2   name           360 non-null    object 
 3   lat            360 non-null    float64
 4   lng            360 non-null    float64
 5   author_name    360 non-null    object 
 6   rating         360 non-null    int64  
 7   text           360 non-null    object 
 8   opening_hours  360 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 25.4+ KB
None


Unnamed: 0,place_id,type,name,lat,lng,author_name,rating,text,opening_hours
0,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,madi sharp,4,"Sweet small gym, staff are kind when you see t...","{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."
1,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,Lewis Atkins,2,"Just a very bad gym. Staff don’t really care, ...","{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."
2,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,Eric,1,"terrible facilities\nbathrooms are gross, dirt...","{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."
3,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,Rune Perstrup,1,An Unhygienic Coronavirus Petri Dish.\n\nI hav...,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."
4,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,Mario Piazza,1,In a huge gym there is only one hair dryer and...,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."


### 1.1.2 Nearby Transportation
We are interested in collecting the nearby transportation to the fitness centers.

In [7]:
if collect:

    # Radius of search in meters
    radius = 500

    # Transportation type key (similar to what one would input in Google Maps search box)
    # Available transportation: only bus station, train station and transit station (which includes metro)
    transportation_type = ['bus_station', 'train_station', 'transit_station']

    # google_reviews holds one row per review, so the same fitness center shows up
    # several times. Each center is queried once to avoid repeated API calls and
    # duplicated rows in the result.
    fitness_centers = google_reviews.drop_duplicates(subset="place_id")

    # Container for the per-center results
    nearby_frames = []

    # We iterate through all our fitness centers, and retrieve nearby transportations
    for ix, row in fitness_centers.iterrows():
        # Extract info from fitness center
        place_id = row.place_id
        location = {"lat": row.lat, "lng": row.lng}
        # Look at nearby transportation
        df = google_nearby(gmaps, place_id = place_id, keys = transportation_type, location = location, radius = radius)
        # Append results
        nearby_frames.append(df)

    # Join all results
    nearby_transportation = pd.concat(nearby_frames)

    # Save to disk
    nearby_transportation.to_csv(RAW_DATA + "transportation.csv", index=False, encoding="utf-8")

else:
    # Reuse the result stored by an earlier collection run
    nearby_transportation = pd.read_csv(RAW_DATA + "transportation.csv")

Check the results.

In [8]:
# Sanity check of the transportation frame (shape, `info()` summary and first rows).
# NOTE(review): the `transport_name` values in the recorded sample look like
# businesses/associations rather than stations; verify the place-type filter.
check_dataframe_results(nearby_transportation)

Resulting dataframe has shape (6195, 7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6195 entries, 0 to 6194
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   place_id                6195 non-null   object 
 1   transport_id            6195 non-null   object 
 2   transport_name          6195 non-null   object 
 3   transport_type          6195 non-null   object 
 4   transport_lat           6195 non-null   float64
 5   transport_lng           6195 non-null   float64
 6   distance_gym_transport  6195 non-null   int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 338.9+ KB
None


Unnamed: 0,place_id,transport_id,transport_name,transport_type,transport_lat,transport_lng,distance_gym_transport
0,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJ-y92w3VTUkYRY8NOJNuwQkQ,Kridt v/Rikke Frisk,bus_station,55.668036,12.551084,305
1,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJ76k9hJ9TUkYRRaHYcLRX3XE,Lysholdet v/Jakob Holst,bus_station,55.672786,12.547279,331
2,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJydMagp9TUkYRjfgBz2sKErQ,"Ejerforeningen Sigbrits Allé 3, 5 og 5a",bus_station,55.672865,12.546713,343
3,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJydMagp9TUkYRcgSv8LkczNs,Grundejerforeningen Carl Jacobsens Vej 33-41,bus_station,55.672865,12.546713,343
4,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJl46JnZ5TUkYRnW93t25gFhQ,Optiperform v/Maja Juel-Hansen,bus_station,55.668011,12.542917,346


## 1.2 Trustpilot WebScraper

Trustpilot is a Danish consumer review website very popular in Denmark. It is publicly available and easy to access, but it does not provide any API integration. Therefore, we use a simple webcrawler to extract the reviews of interest.

In [9]:
if not collect:
    # Reuse the reviews stored by an earlier crawl
    trustpilot_reviews = pd.read_csv(RAW_DATA + "trustpilot_reviews.csv", encoding="utf-8")

else:
    # Crawl one chain at a time (same chains as for the Google queries)
    # and stack the per-chain frames into a single one
    per_chain = [trustpilot_crawler(key=chain, verbose=False) for chain in gyms]
    trustpilot_reviews = pd.concat(per_chain)

    # Persist the raw result so later runs can skip the crawl
    trustpilot_reviews.to_csv(RAW_DATA + "trustpilot_reviews.csv", index=False, encoding="utf-8")

Check the results.

In [10]:
# Sanity check of the Trustpilot frame: per the recorded output, the helper
# reports the shape, the `info()` summary and the first rows.
check_dataframe_results(trustpilot_reviews)

Resulting dataframe has shape (2802, 7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2802 entries, 0 to 2801
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   datetime    2802 non-null   object
 1   name        2802 non-null   object
 2   rating      2802 non-null   int64 
 3   title       2802 non-null   object
 4   review      2802 non-null   object
 5   event_time  2802 non-null   object
 6   enterprise  2802 non-null   object
dtypes: int64(1), object(6)
memory usage: 153.4+ KB
None


Unnamed: 0,datetime,name,rating,title,review,event_time,enterprise
0,2023-11-13T14:03:40.000Z,Jan Winther,4,Godt fitness-center,Gennemgående er jeg godt tilfreds med mit fitn...,13. november 2023,PureGym
1,2023-11-14T13:07:20.000Z,Tina Holst,5,Syntes altid det er dejligt at komme i…,Syntes altid det er dejligt at komme i centret...,14. november 2023,PureGym
2,2023-11-13T09:22:36.000Z,Pfændtner,5,Jeg har gået i Fitness centeret i 22år…,Jeg har gået i Fitness centeret i 22år og efte...,12. november 2023,PureGym
3,2023-11-13T17:18:33.000Z,Gitte,5,Puregym Ikast,Puregym Ikast er et fantastisk center. Man føl...,13. november 2023,PureGym
4,2023-11-13T10:01:35.000Z,GITTE MIKKELSEN,2,Der mangler Stram op hold,Der mangler Stram op hold (eller ligende fx Pu...,11. november 2023,PureGym


## 1.3 Københavns Kommune Scraper

The Københavns Kommune website provides an extensive list of training facilities, both indoors and outdoors. Since this is a dynamic site built on JavaScript, the traditional webcrawler approach is not suitable, and thus we will use an approach that simulates human-like interactions using Selenium.

In [11]:
if collect:

    # Create crawler instance
    kbh_scraper = KBHFacilitiesWebScraper()
    # Get dataframe with entries
    kbh_facilities = kbh_scraper.get()

    # Save to disk
    kbh_facilities.to_csv(RAW_DATA + "kbh_facilities.csv", index=False, encoding="utf-16") # Written as utf-16; the load branch below must read it with the same encoding. NOTE(review): utf-8 can also represent Danish characters, so utf-16 is a choice rather than a necessity
    

else:
    # Load the stored facilities; the encoding matches the one used when the file is written above
    kbh_facilities = pd.read_csv(RAW_DATA + "kbh_facilities.csv", encoding="utf-16")

Check the results.

In [12]:
# Sanity check of the scraped facilities frame: per the recorded output, the
# helper reports the shape, the `info()` summary and the first rows.
check_dataframe_results(kbh_facilities)

Resulting dataframe has shape (606, 8)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606 entries, 0 to 605
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   type      606 non-null    object
 1   activity  606 non-null    object
 2   location  606 non-null    object
 3   website   606 non-null    object
 4   gender    606 non-null    object
 5   age       606 non-null    object
 6   special   606 non-null    object
 7   address   606 non-null    object
dtypes: object(8)
memory usage: 38.0+ KB
None


Unnamed: 0,type,activity,location,website,gender,age,special,address
0,gym,Styrke- og grundtræning,SOS Motion,http://www.sosmotion.dk/,both,all,,"Sundhedshus Østerbro, Randersgade 60, 4 sal, 2..."
1,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V"
2,outdoors,Kondisti,Valbyparken,,both,all,,"Tudsemindevej, 2450 Valby"
3,gym,Nærgymnastik,LOFskolen,https://lofskolen.dk/kurser/motion-og-sundhed/...,both,all,Målrettet personer der har brug for træning me...,"Østerbrogade 240, 2100 København Ø"
4,ball_sports,Floorball for kvinder 65+ år,BK Skjold,https://www.bkskjold.dk/klub/boldklubben-skjol...,women,seniors,,"Nørrebrogade 208, 2200 Kbh. N"


### 1.3.1 Lookup reviews for KBH Facilities

We observe that this dataset only contains addresses, but not geolocation (latitude and longitude) or reviews for the places. We then try to collect that missing data from the Google Maps API.

In [13]:
if collect:
    # Use custom function to iterate through the facilities and retrieve coordinates and reviews for the places.
    kbh_facilities_reviews = review_finder(gmaps, kbh_facilities)

    # Save to disk under the name the load branch reads back. Writing to
    # "kbh_facilities.csv" would overwrite the raw facilities list instead.
    # utf-16 matches the encoding used when reading the file below.
    kbh_facilities_reviews.to_csv(RAW_DATA + "kbh_facilities_reviews.csv", index=False, encoding="utf-16")

else:
    # Reuse the lookup result stored by an earlier collection run
    kbh_facilities_reviews = pd.read_csv(RAW_DATA + "kbh_facilities_reviews.csv", encoding="utf-16")

Check the results.

In [14]:
# Sanity check of the facilities-with-reviews frame (shape, `info()` summary and first rows).
# NOTE(review): the recorded sample pairs an outdoor training pavilion with
# restaurant-style reviews, so the lookup may be matching the wrong place
# for some addresses; verify before relying on these reviews.
check_dataframe_results(kbh_facilities_reviews)

Resulting dataframe has shape (1841, 13)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1841 entries, 0 to 1840
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   type      1841 non-null   object 
 1   activity  1841 non-null   object 
 2   location  1756 non-null   object 
 3   website   1477 non-null   object 
 4   gender    1841 non-null   object 
 5   age       1841 non-null   object 
 6   special   250 non-null    object 
 7   address   1481 non-null   object 
 8   lat       1460 non-null   float64
 9   lng       1460 non-null   float64
 10  author    1841 non-null   object 
 11  review    1841 non-null   object 
 12  rating    1841 non-null   float64
dtypes: float64(3), object(10)
memory usage: 187.1+ KB
None


Unnamed: 0,type,activity,location,website,gender,age,special,address,lat,lng,author,review,rating
0,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,Ximena Ramos,This was the first time that we ordered this f...,3.0
1,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,David Olafsson,My wife and I have been coming here with our d...,5.0
2,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,Rune Madsen,Amazing new Chinese food in the area. We had M...,5.0
3,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,Richard Grieg Higginson,Nice food and staff,4.0
4,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,Hjalte Christiansen,We ordered lunch takeaway. But they had forgot...,1.0


## 1.4 Join the dataset

We are interested in constructing a dataset that includes the enterprise, rating and review text, so we need to ensure those attributes are accessible across the different data sources.

### Extract enterprise for Google reviews

In [15]:
# Label every Google review with the chain it belongs to
_enterprises_ = []
for ix, row in google_reviews.iterrows():
    # Chains (in `gyms` order) whose name appears in the "type" or "name" column,
    # compared case-insensitively
    candidates = (
        chain for chain in gyms
        if (chain.lower() in row["type"].lower()) or (chain.lower() in row["name"].lower())
    )
    # Keep the first match; anything that is not one of the main chains becomes "OTHER"
    _enterprises_.append(next(candidates, "OTHER"))

google_reviews["enterprise"] = _enterprises_

### Extract enterprise for KBH Facilities reviews

In [16]:
if process:
    # Extract enterprise for KBH Facilities reviews
    _enterprises_ = []

    # Some attributes are NaN; replace them with the string "nan" so the string
    # methods below work on every row. This is done once, on a copy, so the
    # original frame keeps its missing values.
    facilities_filled = kbh_facilities_reviews.fillna("nan")

    # Look at each row
    for ix, row in facilities_filled.iterrows():
        # If not one of the main chains, default to "OTHER"
        result = "OTHER"
        # Search for the enterprise in either "type" or "location" columns
        for enterprise in gyms:
            if (enterprise.lower() in row["type"].lower()) or (enterprise.lower() in row["location"].lower()):
                result = enterprise
                break
        _enterprises_.append(result)

    # Add the enterprise to the dataset
    kbh_facilities_reviews["enterprise"] = _enterprises_

    # Save the results to disk
    kbh_facilities_reviews.to_csv(PROCESSED_DATA + "kbh_facilities_reviews.csv", index=False, encoding="utf-8")

else:
    # Reuse the processed file stored by an earlier run
    kbh_facilities_reviews = pd.read_csv(PROCESSED_DATA + "kbh_facilities_reviews.csv")

### 1.4.1 Translation of Danish reviews
Our Trustpilot dataset contains content in both English and Danish. We want to translate everything to English, to work with a monolingual dataset.
To accomplish the translation task, we use a translation model from Hugging-Face: Helsinki-NLP/opus-mt-da-en.

In [17]:
if process:
    # First, remove all emojis to facilitate translation
    trustpilot_reviews["review"] = trustpilot_reviews["review"].apply(remove_emojis)

    # Use custom function to translate the Danish reviews
    trustpilot_reviews = translate(df = trustpilot_reviews, text_colname = "review", translation_colname="translated_review")

    # Save the results to disk so the load branch below finds them on later runs.
    # NOTE(review): if translate() already writes this file, this save is redundant but harmless.
    trustpilot_reviews.to_csv(PROCESSED_DATA + "trustpilot_reviews.csv", index=False, encoding="utf-8")
else:
    # Reuse the translated reviews stored by an earlier run
    trustpilot_reviews = pd.read_csv(PROCESSED_DATA + "trustpilot_reviews.csv")

### 1.4.2 Translation Assessment
We assess the quality of the model's translation by computing the WER (Word error rate) metric against human translators.

In [18]:
# Translations folder
filepath = "../translations/human_translations.csv"

# We load the human translations and strip the emojis
human = pd.read_csv(filepath)
human["review"] = human.review.apply(lambda x: remove_emojis(x))
human.rename(columns={"review": "text", "translation": "human"}, inplace=True)

# We extact the model translations
machine = trustpilot_reviews[["review", "translated_review"]]
machine.rename(columns={"review": "text", "translated_review": "machine"}, inplace=True)

# We match the translations to their human counterpart
translations = human.merge(machine, on="text", how="inner")

# We split both human and machine translations
references = translations.human
predictions = translations.machine

# We use our custom function to compute the average Word error rate for the whole sample
print(f"The WER for the translations sample is {compute_WER(references, predictions):.3f}")

The WER for the translations sample is 0.393


### Select the attributes to keep

In [19]:
# Rename columns to match across datasets
google_reviews = google_reviews.rename(columns={"author_name": "author", "text": "review"})
trustpilot_reviews = trustpilot_reviews.rename(columns={"name": "author", "translated_reviews": "review"})

# Columns to keep
cols = ['enterprise', 'author', 'rating', 'review']

# Keep useful columns
google_reviews = google_reviews[cols]
trustpilot_reviews = trustpilot_reviews[cols]
kbh_facilities_reviews = kbh_facilities_reviews[cols]

# Merge all reviews
reviews = pd.concat([google_reviews, trustpilot_reviews, kbh_facilities_reviews])

Check the resulting dataframe

In [20]:
# Sanity check of the merged reviews frame (shape, `info()` summary and first rows).
# NOTE(review): the recorded summary shows 5000 non-null reviews out of 5003
# rows, so a few rows have no review text.
check_dataframe_results(reviews)

Resulting dataframe has shape (5003, 4)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5003 entries, 0 to 1840
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   enterprise  5003 non-null   object 
 1   author      5003 non-null   object 
 2   rating      5003 non-null   float64
 3   review      5000 non-null   object 
dtypes: float64(1), object(3)
memory usage: 195.4+ KB
None


Unnamed: 0,enterprise,author,rating,review
0,PureGym,madi sharp,4.0,"Sweet small gym, staff are kind when you see t..."
1,PureGym,Lewis Atkins,2.0,"Just a very bad gym. Staff don’t really care, ..."
2,PureGym,Eric,1.0,"terrible facilities\nbathrooms are gross, dirt..."
3,PureGym,Rune Perstrup,1.0,An Unhygienic Coronavirus Petri Dish.\n\nI hav...
4,PureGym,Mario Piazza,1.0,In a huge gym there is only one hair dryer and...


# 2. Annotations

[TODO: ADD DESCRIPTION OF WHAT WE WANT TO ACHIEVE WITH THIS]

## 2.1 Annotation distribution
To avoid introducing bias to the task, we remove all columns except for the text to annotate, and we randomly distribute the samples across annotators.

In [21]:
# Join both datasets
review_text = pd.DataFrame(reviews["review"])

# Shuffle reviews
review_text = review_text.sample(frac=1)

# Give unique ID to reviews
review_text["ID"] = np.arange(1, len(reviews)+1)

# Drop the index
review_text.reset_index(drop=True, inplace=True)

# Size of sample annotated by all annotators
size = 100

# Keep a list of not assigned IDs
remaining_ids = list(review_text.ID)

# Randomly select some IDs
common_ids =np.random.choice(remaining_ids, size=size, replace=False)
# Assign those instances to "all" annotators
review_text.loc[review_text.ID.isin(common_ids), "annotator"] = "all"
# Remove the selected IDs from the remaining not assigned IDs
remaining_ids = [x for x in remaining_ids if x not in common_ids]

# List of annotators
annotators = ["Bogdan", "Chrisanna", "Christian", "Gino", "Veron"]

# Size of the samples
size = 600
# Assign to each annotator
for a in annotators:
    # Randomly select some IDs
    selected_ids = np.random.choice(remaining_ids, size=size, replace=False)
    # Assign those instances to the specific annotator
    review_text.loc[review_text.ID.isin(selected_ids), "annotator"] = a
    # Remove the selected IDs from the remaining not assigned IDs
    remaining_ids = [x for x in remaining_ids if x not in selected_ids]

# Show number of instances per annotator
display(review_text.groupby("annotator").size())

annotator
Bogdan       600
Chrisanna    600
Christian    600
Gino         600
Veron        600
all          100
dtype: int64

We can now distribute the samples to annotate across annotators.

In [22]:
if process:
    # Make sure the output folder exists before writing into it
    samples_dir = ANNOTATIONS_DATA + "annotators_samples/"
    os.makedirs(samples_dir, exist_ok=True)

    # For each annotator, create a file
    for a in annotators:
        # The assignment lives in review_text (columns "review", "ID", "annotator"),
        # not in reviews. Each annotator gets their own rows plus the shared "all" rows.
        is_assigned = review_text["annotator"].isin([a, "all"])
        # Export the review under the column name "text"
        annotators_sample = review_text.loc[is_assigned, ["ID", "review"]].rename(columns={"review": "text"})
        annotators_sample.to_csv(samples_dir + f"{a}.csv", index=False)

## 2.2 Load the annotation responses

In [38]:
# Container for individual annotation responses datasets
dfs = []

# Look at the JSON files, parse and join
for file in os.listdir(ANNOTATIONS_DATA + "annotators_results"):
    if file.endswith(".json"):
        # Use our custom function to parse the response file
        print(file)
        df = parse_label_studio_file(ANNOTATIONS_DATA + "annotators_results/" + file)
        # Append to the container
        dfs.append(df)

# Join all files
annotations = pd.concat(dfs).reset_index(drop=True)

print(f"A total of {annotations.shape[0]} are now joined.")

bogdan_annotated.json
Christian_annotations.json
Gino_annotations.json
sanna.json
veron_hoxha.json
A total of 1008 are now joined.


## 2.3 Calculate IAA
To assess the reliability of the annotations we calculate Fleiss' kappa inter-annotator agreement.

In [1]:
# Needs `annotations` from the cell that loads the annotation responses; the
# NameError recorded below comes from running this cell without it.
# The categories are in the columns after the leading metadata columns.
# NOTE(review): the slice skips the first THREE columns, while the original
# comment named only two ("ID" and "text"); confirm which is intended.
categories = annotations.columns[3:]
# The possible labels are 1.0 (Positive), 0.0 (Neutral), -1.0 (Negative) or NAN (if no sentiment)
labels = [1.0, 0.0, -1.0, np.nan]

# NOTE(review): `labels` defined above is never used; only [1.0] is passed
# here. Confirm whether `labels=labels` was intended.
IAA = fleiss_kappa(annotations, categories, labels=[1.0])

print(f"The Fleiss Kappa for IAA is {IAA:.2f}.")

NameError: name 'annotations' is not defined