# Create Annotation Datasets Notebook

### The purpose of this notebook is to join the review datasets from the different sources and generate sample datasets for the annotation task.

Imports

In [27]:
import pandas as pd
import numpy as np

Paths

In [28]:
# Names of the review exports, one file per source
trustpilot_reviews_filename = "trustpilot_reviews_translated.csv"
google_reviews_filename = "google_reviews.csv"

# Folder holding the exports, relative to this notebook. The trailing slash is
# required because file names are appended to it by plain string concatenation.
data_path = "../data/"

Read review files

In [29]:
# Build the full location of each export from the shared data folder
google_reviews_file = f"{data_path}{google_reviews_filename}"
trustpilot_reviews_file = f"{data_path}{trustpilot_reviews_filename}"

# Load both sources; the Trustpilot export is read as UTF-16
google_reviews = pd.read_csv(google_reviews_file)
trustpilot_reviews = pd.read_csv(trustpilot_reviews_file, encoding="utf-16")

# Report how many rows and columns each source provides
print(f"Google reviews has shape {google_reviews.shape} and Trustpilot reviews has shape {trustpilot_reviews.shape}")

Google reviews has shape (405, 16) and Trustpilot reviews has shape (221, 8)


Inspect both datasets

In [30]:
# Preview the first rows of each source so their columns can be compared
google_preview = google_reviews.head()
trustpilot_preview = trustpilot_reviews.head()
display(google_preview, trustpilot_preview)

Unnamed: 0,place_id,type,name,lat,lng,author_name,rating,text,opening_hours,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,ChIJnebu2e7fTEYRl2jmqFgqVZo,PureGym Odense,PureGym,55.394857,10.366625,Lea Hansen,5,Can't complain. They do their job very well an...,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05...",05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 11:59PM,05:00AM - 12:00AM,05:00AM - 12:00AM
1,ChIJnebu2e7fTEYRl2jmqFgqVZo,PureGym Odense,PureGym,55.394857,10.366625,Marcus Nygård,2,Big gym spanning 3 floors. Good selection of b...,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05...",05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 11:59PM,05:00AM - 12:00AM,05:00AM - 12:00AM
2,ChIJnebu2e7fTEYRl2jmqFgqVZo,PureGym Odense,PureGym,55.394857,10.366625,Maj Vangsø Simonsen,5,Love this gym! Best instructors and facilities...,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05...",05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 11:59PM,05:00AM - 12:00AM,05:00AM - 12:00AM
3,ChIJnebu2e7fTEYRl2jmqFgqVZo,PureGym Odense,PureGym,55.394857,10.366625,Henrik Lambert,4,Fine place but watch your parking meter.,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05...",05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 11:59PM,05:00AM - 12:00AM,05:00AM - 12:00AM
4,ChIJnebu2e7fTEYRl2jmqFgqVZo,PureGym Odense,PureGym,55.394857,10.366625,Fresh Andrew,5,Lots of space and it's nice and clean,"{'Monday': '05:00AM - 12:00AM', 'Tuesday': '05...",05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 12:00AM,05:00AM - 11:59PM,05:00AM - 12:00AM,05:00AM - 12:00AM


Unnamed: 0,datetime,name,rating,title,review,event_time,enterprise,translated_reviews
0,2023-11-02T13:26:25.000Z,Sally Devantier,3,Priserne stiger,"Priserne stiger, mens centrene forfalder mere ...",02. november 2023,PureGym,Prices are rising while the centres are fallin...
1,2023-10-28T12:03:29.000Z,judith,4,Dejlig imødekommende personale,"Dejlig imødekommende personale, som altid form...",27. oktober 2023,PureGym,Nice welcoming staff who always manage to keep...
2,2023-11-02T16:08:10.000Z,Mila gouzi,5,Søde rar unge mennesker der arbejder,"Søde rar unge mennesker der arbejder, de har e...",02. november 2023,PureGym,"Sweet nice young people working, they have a s..."
3,2023-10-31T18:53:42.000Z,Franck Tronborg,5,Har Trænet i mangen år her får mest ud af det ...,"Har Trænet i mangen år her og Bornholmsvej, er...",31. oktober 2023,PureGym,Has trained for many years here and Bornholmsv...
4,2023-10-27T18:15:58.000Z,Esben Holst Christensen,4,Udmærket center. Value for money,Udmærket center med god sammenhæng mellem pris...,27. oktober 2023,PureGym,Excellent center with good connection between ...


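Upon inspection, we see that the only attributes the two datasets have in common are the reviewer's name (`author_name`, `name`), the rating (`rating`), the review text (`text`, `review`) and the enterprise (`name`, `enterprise`). For Trustpilot we keep the English `translated_reviews` column rather than the original `review` text.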

In [31]:
# Keep only the attributes both sources share, rename them to a common schema
# (enterprise, reviewer, rating, text) and drop exact duplicate rows.
# The steps are chained so each one returns a new frame; renaming or
# de-duplicating a column selection in place is what triggers pandas'
# SettingWithCopyWarning.
# NOTE: this cell overwrites its own inputs, so re-run the cell that reads the
# files before running it a second time.
google_reviews = (
    google_reviews[["name", "author_name", "rating", "text"]]
    .rename(columns={"name": "enterprise", "author_name": "reviewer"})
    .drop_duplicates()
)

# For Trustpilot the translated column is used as the review text
trustpilot_reviews = (
    trustpilot_reviews[["enterprise", "name", "rating", "translated_reviews"]]
    .rename(columns={"name": "reviewer", "translated_reviews": "text"})
    .drop_duplicates()
)

We can now join both datasets.

In [32]:
# Stack the two sources into a single frame with a fresh 0..n-1 index
reviews = pd.concat([google_reviews, trustpilot_reviews], ignore_index=True)

# Number the reviews 1..n so that every review has a unique ID
reviews["ID"] = np.arange(len(reviews)) + 1

Now it's time to split the dataset for the different annotators.

One batch will be annotated by all annotators, so that we can calculate the inter-annotator agreement. Then we will sample an equal number of instances for each annotator.

In [33]:
# Fixed seed so that Restart & Run All always reproduces the same split
seed = 42
rng = np.random.default_rng(seed)

# Size of sample annotated by all annotators
common_size = 100
# List of annotators
annotators = ["Bogdan", "Chrisanna", "Christian", "Gino", "Veron"]

# Fail loudly instead of silently producing a smaller common batch
if common_size > len(reviews):
    raise ValueError(
        f"Cannot sample {common_size} common reviews from only {len(reviews)} reviews"
    )

# Shuffle every ID once: consecutive slices of the shuffled array are disjoint
# random samples without replacement, so no membership filtering is needed
shuffled_ids = rng.permutation(reviews["ID"].to_numpy())

# Reset the column so that re-running the cell never keeps assignments from a previous run
reviews["annotator"] = None

# The first slice is assigned to "all" annotators
common_ids = shuffled_ids[:common_size]
reviews.loc[reviews["ID"].isin(common_ids), "annotator"] = "all"

# IDs that are still not assigned
remaining_ids = shuffled_ids[common_size:]
# Size of the samples (equal for every annotator)
size = len(remaining_ids) // len(annotators)
# Assign to each annotator
for position, a in enumerate(annotators):
    # Take this annotator's slice of the shuffled remaining IDs
    selected_ids = remaining_ids[position * size:(position + 1) * size]
    # Assign those instances to the specific annotator
    reviews.loc[reviews["ID"].isin(selected_ids), "annotator"] = a

# IDs left over when the remainder does not divide evenly stay unassigned
remaining_ids = remaining_ids[len(annotators) * size:].tolist()


We can check that all annotators received a similar number of instances.

In [34]:
# Count how many reviews ended up with each annotator label and show the result
instances_per_annotator = reviews.groupby("annotator").size()
display(instances_per_annotator)

annotator
Bogdan       101
Chrisanna    101
Christian    101
Gino         101
Veron        101
all          100
dtype: int64

Generate the files for the annotators with only the ID and the text, so as to avoid introducing any bias.

In [38]:
# Set to True to write one CSV per annotator into the working directory.
# Off by default so that running the notebook has no side effects on disk.
write_files = False

for a in annotators:
    # Each annotator gets their own instances plus the shared "all" batch;
    # only the ID and text columns are kept
    annotators_sample = reviews.loc[reviews["annotator"].isin([a, "all"]), ["ID", "text"]]
    # Report the size of every sample
    print(a, annotators_sample.shape)
    if write_files:
        annotators_sample.to_csv(f"{a}.csv", encoding="utf-8", index=False)

Bogdan (201, 2)
Chrisanna (201, 2)
Christian (201, 2)
Gino (201, 2)
Veron (201, 2)
