# < Meaningful title >

##### Imports

In [1]:
import googlemaps
import pandas as pd
import numpy as np

# Custom util functions
import sys; sys.path.append("./libraries/")
from utils import *

### Settings

##### Reproducibility settings

In [2]:
# Random seed
np.random.seed = 7

# Relative Paths
raw_data = "../data/raw_data/"
process_data = "../data/process_data"
annotations_data = "../data/annotations"

# Flags
collect = False # Flag to collect data or load existent raw_data
process = False # Flag to process data or load existent processed data

##### Google API

In [3]:
key = open("./Google_API_key.txt").readline()
gmaps = googlemaps.Client(key=key)

# 1. Data Collection

We start by creating a list of query values that relate to the dataset. We are interested in getting mostly reviews (and some other metadata) on specific fitness facilities (i.e. popular chains) from main cities in Denmark. To do this, we will compute the query list as a combination of cities and fitness chains. 

In [4]:
# List of cities
cities = ['Copenhagen', 'Aalborg', 'Arhus', 'Odense']
 
# Popular fitness chains
gyms = ["PureGym", "SATS", "Vesterbronx"]

# Query list
query_list = [g + " " + c for g in gyms for c in cities]

print(query_list)

['PureGym Copenhagen', 'PureGym Aalborg', 'PureGym Arhus', 'PureGym Odense', 'SATS Copenhagen', 'SATS Aalborg', 'SATS Arhus', 'SATS Odense', 'Vesterbronx Copenhagen', 'Vesterbronx Aalborg', 'Vesterbronx Arhus', 'Vesterbronx Odense']


## 1.1 Google Maps API

The Google maps API takes a single query string to search for results (similar to the User Interface searchbox). Therefore, we combine popular fitness facilities with main Danish cities as our query keys.

### 1.1.1 Reviews
We start by getting the reviews for our query list.

Get responses for all the queries from the API

In [5]:
if collect:
    # Get response for queries
    dfs = []

    # For each query in the query list
    for query in query_list:  
        # Get the response using our custom made querier
        dfs.append(google_querier(gmaps, query))

    google_reviews = pd.concat(dfs)

    # Save to disk
    google_reviews.to_csv(raw_data + "google_reviews.csv", index=False, encoding="utf-8")

else:
    google_reviews = pd.read_csv(raw_data + "google_reviews.csv")

Check the results.

In [6]:
check_dataframe_results(google_reviews)

Resulting dataframe has shape (360, 9)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   place_id       360 non-null    object 
 1   type           360 non-null    object 
 2   name           360 non-null    object 
 3   lat            360 non-null    float64
 4   lng            360 non-null    float64
 5   author_name    360 non-null    object 
 6   rating         360 non-null    int64  
 7   text           360 non-null    object 
 8   opening_hours  360 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 25.4+ KB
None


Unnamed: 0,place_id,type,name,lat,lng,author_name,rating,text,opening_hours
0,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,madi sharp,4,"Sweet small gym, staff are kind when you see t...","{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."
1,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,Lewis Atkins,2,"Just a very bad gym. Staff don’t really care, ...","{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."
2,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,Eric,1,"terrible facilities\nbathrooms are gross, dirt...","{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."
3,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,Rune Perstrup,1,An Unhygienic Coronavirus Petri Dish.\n\nI hav...,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."
4,ChIJh3mB6UxSUkYREbiH4JDK-7M,PureGym Copenhagen,PureGym,55.669812,12.54739,Mario Piazza,1,In a huge gym there is only one hair dryer and...,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05..."


### 1.1.2 Nearby Transportation
We are interested in collecting the nearby transportation to the fitness centers.

In [7]:
if collect:

    # Radius of search in meters
    radius = 500

    # Transportation type key (similar to what one would input in Google Maps search box)
    transportation_type = ['bus_station', 'train_station', 'transit_station'] # Avaliable transportation: only bus station, train station and transit station (which includes metro)

    # Container
    nearby_transportation = []

    # We iterate through all our fitness centers, and retrieve nearby transportations
    for ix, row in google_reviews.iterrows():
        # Extract info from fitness center
        place_id = row.place_id
        location = {"lat": row.lat, "lng": row.lng}
        # Look at nearby transportation
        df = google_nearby(gmaps, place_id = place_id, keys = transportation_type, location = location, radius = radius)
        # Append results
        nearby_transportation.append(df)

    # Join all results
    nearby_transportation = pd.concat(nearby_transportation)

    # Save to disk
    nearby_transportation.to_csv(raw_data + "transportation.csv", index=False, encoding="utf-8")

else:
    nearby_transportation = pd.read_csv(raw_data + "transportation.csv")

Check the results.

In [8]:
check_dataframe_results(nearby_transportation)

Resulting dataframe has shape (6195, 7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6195 entries, 0 to 6194
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   place_id                6195 non-null   object 
 1   transport_id            6195 non-null   object 
 2   transport_name          6195 non-null   object 
 3   transport_type          6195 non-null   object 
 4   transport_lat           6195 non-null   float64
 5   transport_lng           6195 non-null   float64
 6   distance_gym_transport  6195 non-null   int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 338.9+ KB
None


Unnamed: 0,place_id,transport_id,transport_name,transport_type,transport_lat,transport_lng,distance_gym_transport
0,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJ-y92w3VTUkYRY8NOJNuwQkQ,Kridt v/Rikke Frisk,bus_station,55.668036,12.551084,305
1,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJ76k9hJ9TUkYRRaHYcLRX3XE,Lysholdet v/Jakob Holst,bus_station,55.672786,12.547279,331
2,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJydMagp9TUkYRjfgBz2sKErQ,"Ejerforeningen Sigbrits Allé 3, 5 og 5a",bus_station,55.672865,12.546713,343
3,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJydMagp9TUkYRcgSv8LkczNs,Grundejerforeningen Carl Jacobsens Vej 33-41,bus_station,55.672865,12.546713,343
4,ChIJh3mB6UxSUkYREbiH4JDK-7M,ChIJl46JnZ5TUkYRnW93t25gFhQ,Optiperform v/Maja Juel-Hansen,bus_station,55.668011,12.542917,346


## 1.2 Trustpilot WebScraper

Trustpilot is a Danish consumer review website very popular in Denmark. It is publicly available and easy to access, but it does not provide any API integration. Therefore, we use a simple webcrawler to extract the reviews of interest.

In [9]:
if collect:
    dfs = []

    # Reuse the gyms
    for g in gyms:
        df = trustpilot_crawler(key=g, verbose=False)

        # Append the facility DF to main df
        dfs.append(df)

    # Join all DFs
    trustpilot_reviews = pd.concat(dfs)

    # Save to disk
    trustpilot_reviews.to_csv(raw_data + "trustpilot_reviews.csv", index=False, encoding="utf-8")

else:
    trustpilot_reviews = pd.read_csv(raw_data + "trustpilot_reviews.csv", encoding="utf-8")

Check the results.

In [10]:
check_dataframe_results(trustpilot_reviews)

Resulting dataframe has shape (2802, 7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2802 entries, 0 to 2801
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   datetime    2802 non-null   object
 1   name        2802 non-null   object
 2   rating      2802 non-null   int64 
 3   title       2802 non-null   object
 4   review      2802 non-null   object
 5   event_time  2802 non-null   object
 6   enterprise  2802 non-null   object
dtypes: int64(1), object(6)
memory usage: 153.4+ KB
None


Unnamed: 0,datetime,name,rating,title,review,event_time,enterprise
0,2023-11-13T14:03:40.000Z,Jan Winther,4,Godt fitness-center,Gennemgående er jeg godt tilfreds med mit fitn...,13. november 2023,PureGym
1,2023-11-14T13:07:20.000Z,Tina Holst,5,Syntes altid det er dejligt at komme i…,Syntes altid det er dejligt at komme i centret...,14. november 2023,PureGym
2,2023-11-13T09:22:36.000Z,Pfændtner,5,Jeg har gået i Fitness centeret i 22år…,Jeg har gået i Fitness centeret i 22år og efte...,12. november 2023,PureGym
3,2023-11-13T17:18:33.000Z,Gitte,5,Puregym Ikast,Puregym Ikast er et fantastisk center. Man føl...,13. november 2023,PureGym
4,2023-11-13T10:01:35.000Z,GITTE MIKKELSEN,2,Der mangler Stram op hold,Der mangler Stram op hold (eller ligende fx Pu...,11. november 2023,PureGym


## 1.3 Københavns Kommune Scraper

The Københavns Kommune website provides an extensive list of training facilities, both indoors and outdoors. Since this is a dynamic site built on JavaScript, the traditional webcrawler approach is not suitable, and thus we will use an approach that simulates human-like interactions using Selenium.

In [11]:
if collect:

    # Create crawler instance
    kbh_scraper = KBHFacilitiesWebScraper()
    # Get dataframe with entries
    kbh_facilities = kbh_scraper.get()

    # Save to disk
    kbh_facilities.to_csv(raw_data + "kbh_facilities.csv", index=False, encoding="utf-16") # Since some Danish characters don't map to utf-8, we use utf-16
    

else:
    kbh_facilities = pd.read_csv(raw_data + "kbh_facilities.csv", encoding="utf-16")

Check the results.

In [12]:
check_dataframe_results(kbh_facilities)

Resulting dataframe has shape (606, 8)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606 entries, 0 to 605
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   type      606 non-null    object
 1   activity  606 non-null    object
 2   location  606 non-null    object
 3   website   606 non-null    object
 4   gender    606 non-null    object
 5   age       606 non-null    object
 6   special   606 non-null    object
 7   address   606 non-null    object
dtypes: object(8)
memory usage: 38.0+ KB
None


Unnamed: 0,type,activity,location,website,gender,age,special,address
0,gym,Styrke- og grundtræning,SOS Motion,http://www.sosmotion.dk/,both,all,,"Sundhedshus Østerbro, Randersgade 60, 4 sal, 2..."
1,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V"
2,outdoors,Kondisti,Valbyparken,,both,all,,"Tudsemindevej, 2450 Valby"
3,gym,Nærgymnastik,LOFskolen,https://lofskolen.dk/kurser/motion-og-sundhed/...,both,all,Målrettet personer der har brug for træning me...,"Østerbrogade 240, 2100 København Ø"
4,ball_sports,Floorball for kvinder 65+ år,BK Skjold,https://www.bkskjold.dk/klub/boldklubben-skjol...,women,seniors,,"Nørrebrogade 208, 2200 Kbh. N"


### 1.3.1 Lookup reviews for KBH Facilities

We observe that this dataset only contains addresses, but not geolocation (latitude and longitude) or reviews for the places. We then try to collect that missing data from the Google Maps API.

In [13]:
if collect:
    # Use custom function to iterate through the facilities and retrieve coordinates and reviews for the places.
    kbh_facilities_reviews = review_finder(gmaps, kbh_facilities)

    # Save to disk
    kbh_facilities_reviews.to_csv(raw_data + "kbh_facilities.csv", index=False, encoding="utf-16") # Since some Danish characters don't map to utf-8, we use utf-16

else:
    kbh_facilities_reviews = pd.read_csv(raw_data + "kbh_facilities_reviews.csv", encoding="utf-16")

Check the results.

In [14]:
check_dataframe_results(kbh_facilities_reviews)

Resulting dataframe has shape (1841, 13)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1841 entries, 0 to 1840
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   type      1841 non-null   object 
 1   activity  1841 non-null   object 
 2   location  1756 non-null   object 
 3   website   1477 non-null   object 
 4   gender    1841 non-null   object 
 5   age       1841 non-null   object 
 6   special   250 non-null    object 
 7   address   1481 non-null   object 
 8   lat       1460 non-null   float64
 9   lng       1460 non-null   float64
 10  author    1841 non-null   object 
 11  review    1841 non-null   object 
 12  rating    1841 non-null   float64
dtypes: float64(3), object(10)
memory usage: 187.1+ KB
None


Unnamed: 0,type,activity,location,website,gender,age,special,address,lat,lng,author,review,rating
0,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,Ximena Ramos,This was the first time that we ordered this f...,3.0
1,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,David Olafsson,My wife and I have been coming here with our d...,5.0
2,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,Rune Madsen,Amazing new Chinese food in the area. We had M...,5.0
3,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,Richard Grieg Higginson,Nice food and staff,4.0
4,outdoors,Træningspavillion,,,both,all,,"Kvægtorvsgade, 1710 KBH V",55.669719,12.56313,Hjalte Christiansen,We ordered lunch takeaway. But they had forgot...,1.0


## 1.4 Join the dataset

In [15]:
google_reviews.columns

Index(['place_id', 'type', 'name', 'lat', 'lng', 'author_name', 'rating',
       'text', 'opening_hours'],
      dtype='object')

### Extract enterprise for Google reviews

In [16]:
# Extract enterprise for Google reviews
_enterprises_ = []
# Look at each row
for ix, row in google_reviews.iterrows():
    # If not one of the main chains, default to "OTHER"
    result = "OTHER"
    # Search for the enterprise in either "type" or "name" columns
    for enterprise in gyms:
        if (enterprise.lower() in row["type"]) or (enterprise.lower() in row["name"]):
            result = enterprise
            break
    _enterprises_.append(result)

google_reviews["enterprise"] = _enterprises_

### Extract enterprise for KBH Facilities reviews

In [17]:
# Extract enterprise for Google reviews
_enterprises_ = []
# Look at each row
for ix, row in kbh_facilities_reviews.iterrows():
    # If not one of the main chains, default to "OTHER"
    result = "OTHER"
    # Search for the enterprise in either "type" or "name" columns
    for enterprise in gyms:
        if (enterprise.lower() in row["type"].lower()) or (enterprise.lower() in row["name"].lower()):
            result = enterprise
            break
    _enterprises_.append(result)

kbh_facilities_reviews["enterprise"] = _enterprises_

KeyError: 'name'

### Select the attributes to keep

In [None]:
kbh_facilities_reviews.loc[60]

type                                      ball_sports
activity                                       Volley
location                    Idrætsforeningen Kæmperne
website                 https://ifk98.dk/default.aspx
gender                                           both
age                                               all
special                                           NaN
address     Nørrebrohallen, Bragesgade 5, 2200 Kbh. N
lat                                          55.69917
lng                                         12.543148
author                                    Asger Bobek
review        Great association. Show up and join in!
rating                                            5.0
Name: 60, dtype: object

In [None]:
test = kbh_facilities_reviews.fillna("")
test[test["location"].str.contains("SATS")]

Unnamed: 0,type,activity,location,website,gender,age,special,address,lat,lng,author,review,rating
546,fitness,"Hold-, styrke- og konditionstræning",SATS,https://www.sats.dk/fitnesscenter/kbh-norrebro...,both,all,,"Frederikssundsvej 5, 2400 Kbh. NV",55.700693,12.536742,Sergey Kovalyov,"Probably, my favorite SATS in Copenhagen. You ...",4.0
547,fitness,"Hold-, styrke- og konditionstræning",SATS,https://www.sats.dk/fitnesscenter/kbh-norrebro...,both,all,,"Frederikssundsvej 5, 2400 Kbh. NV",55.700693,12.536742,Bruno Miguel de Matos Esteves casimiro,Quiet a nice place to workout huge windows wi...,5.0
548,fitness,"Hold-, styrke- og konditionstræning",SATS,https://www.sats.dk/fitnesscenter/kbh-norrebro...,both,all,,"Frederikssundsvej 5, 2400 Kbh. NV",55.700693,12.536742,PC CP,The view towards the canal in this gym is one ...,4.0
549,fitness,"Hold-, styrke- og konditionstræning",SATS,https://www.sats.dk/fitnesscenter/kbh-norrebro...,both,all,,"Frederikssundsvej 5, 2400 Kbh. NV",55.700693,12.536742,Lea RU,The view is amazing. I am satisfied with the c...,3.0
550,fitness,"Hold-, styrke- og konditionstræning",SATS,https://www.sats.dk/fitnesscenter/kbh-norrebro...,both,all,,"Frederikssundsvej 5, 2400 Kbh. NV",55.700693,12.536742,Esa Toivonen,"Sauna 😊 Could be slightly warmer, but better t...",4.0
583,fitness,Fitnesscenter,SATS,https://www.sats.dk/fitnesscenter/kbh-sydhavn,both,all,,"Scandiagade 15, 2450 Kbh. SV.",55.653006,12.542614,Sergey Kovalyov,"Probably, my favorite SATS in Copenhagen. You ...",4.0
584,fitness,Fitnesscenter,SATS,https://www.sats.dk/fitnesscenter/kbh-sydhavn,both,all,,"Scandiagade 15, 2450 Kbh. SV.",55.653006,12.542614,Bruno Miguel de Matos Esteves casimiro,Quiet a nice place to workout huge windows wi...,5.0
585,fitness,Fitnesscenter,SATS,https://www.sats.dk/fitnesscenter/kbh-sydhavn,both,all,,"Scandiagade 15, 2450 Kbh. SV.",55.653006,12.542614,PC CP,The view towards the canal in this gym is one ...,4.0
586,fitness,Fitnesscenter,SATS,https://www.sats.dk/fitnesscenter/kbh-sydhavn,both,all,,"Scandiagade 15, 2450 Kbh. SV.",55.653006,12.542614,Lea RU,The view is amazing. I am satisfied with the c...,3.0
587,fitness,Fitnesscenter,SATS,https://www.sats.dk/fitnesscenter/kbh-sydhavn,both,all,,"Scandiagade 15, 2450 Kbh. SV.",55.653006,12.542614,Esa Toivonen,"Sauna 😊 Could be slightly warmer, but better t...",4.0


In [None]:
# Rename columns to match across datasets
google_reviews = google_reviews.rename(columns={"author_name": "author", "text": "review"})
trustpilot_reviews = trustpilot_reviews.rename(columns={"name": "author", "translated_reviews": "review"})
kbh_facilities_reviews = kbh_facilities_reviews.rename(columns={""})
google_reviews = google_reviews[[]]

### Prepare different datasets attributes

# 2. Annotations

[TODO: ADD DESCRIPTION OF WHAT WE WANT TO ACHIEVE WITH THIS]

## 2.1 Translation of Danish reviews
Our Trustpilot dataset contains content in both English and Danish languages. We want to translate everything to english, to work with a monolingual dataset.
To accomplish the translation task, we use a translation model from Hugging-Face: Helsinki-NLP/opus-mt-da-en.

In [18]:
# First, remove all emojis to facilitate translation
trustpilot_reviews["review"] = trustpilot_reviews["review"].apply(lambda x: remove_emojis(x))

test = translate(df = trustpilot_reviews, text_colname = "review", translation_colname="translated_review")

99.96%

In [26]:
test.loc[1, "translated_review"]

'an'

In [21]:
test.to_csv("trustpilot_reviews_translated-utf8.csv", index=False, encoding="utf-8")

In [None]:
# First, remove all emojis to facilitate translation
trustpilot_reviews["review"] = trustpilot_reviews["review"].apply(lambda x: remove_emojis(x))

# Translate the reviews
test = translate(df = trustpilot_reviews, text_colname = "review", translation_colname="translated_review")
# NOTE: It takes several minutes to run ()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\ginof\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3442, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\ginof\AppData\Local\Temp\ipykernel_3452\3819103685.py", line 5, in <module>
    test = translate(df = trustpilot_reviews, text_colname = "review", translation_colname="translated_review")
  File "c:\Users\ginof\OneDrive - ITU\Documents\GitHub\data-wild-west\code\./libraries\utils.py", line 588, in translate
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\series.py", line 4357, in apply
    return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\apply.py", line 1043, in apply
    return self.apply_standard()
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\apply.py",

## 2.2 Annotation distribution
To avoid introducing bias to the task, we remove all columns except for the text to annotate, and we randomly distribute the samples across annotators.

In [None]:
# Join both datasets
reviews = pd.concat([google_reviews[["text"]], trustpilot_reviews[["translated_reviews"]]]).reset_index(drop=True)

# Give unique ID to reviews
reviews["ID"] = np.arange(1, len(reviews)+1)

# Size of sample annotated by all annotators
size = 100

# Keep a list of not assigned IDs
remaining_ids = list(reviews.ID)

# Randomly select some IDs
common_ids =np.random.choice(remaining_ids, size=size, replace=False)
# Assign those instances to "all" annotators
reviews.loc[reviews.ID.isin(common_ids), "annotator"] = "all"
# Remove the selected IDs from the remaining not assigned IDs
remaining_ids = [x for x in remaining_ids if x not in common_ids]

# List of annotators
annotators = ["Bogdan", "Chrisanna", "Christian", "Gino", "Veron"]

# Size of the samples
size = len(remaining_ids) // len(annotators)
# Assign to each annotator
for a in annotators:
    # Randomly select some IDs
    selected_ids = np.random.choice(remaining_ids, size=size, replace=False)
    # Assign those instances to the specific annotator
    reviews.loc[reviews.ID.isin(selected_ids), "annotator"] = a
    # Remove the selected IDs from the remaining not assigned IDs
    remaining_ids = [x for x in remaining_ids if x not in selected_ids]

# Show number of instances per annotator
display(reviews.groupby("annotator").size())

In [None]:
for a in annotators:

    annotators_sample = reviews.loc[(reviews.annotator == a) | (reviews.annotator == "all"), ["ID", "text"]]

In [None]:
# JSON formatted responses path
path = "../Annotations/Responses/"

# Container for individual annotation responses datasets
dfs = []

# Look at the JSON files, parse and join
for file in i.os.listdir(path):
    if file.endswith(".json"):
        # Use our custom function to parse the response file
        df = i.parse_file(path + file)
        # Append to the container
        dfs.append(df)

# Join all files
annotations = i.pd.concat(dfs).reset_index(drop=True)

print(f"A total of {annotations.shape[0]} are now joined.")