This notebook performs sentiment analysis on reviews of Airbnb locations.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Cities whose listing and review CSV files are read by this notebook.
city_names = [
    "tokyo",
    "sydney",
    "melbourne",
    "singapore",
    "hongkong",
    "taipei",
    "bangkok",
]

In [None]:
# Read every city's listings CSV into a DataFrame, keyed by city name.
cities_df = {
    name: pd.read_csv("./data/listings_{c}.csv".format(c=name))
    for name in city_names
}

In [None]:
# Read every city's reviews CSV into a DataFrame, keyed by city name.
cities_review_df = {
    name: pd.read_csv("./data/reviews_{c}.csv".format(c=name))
    for name in city_names
}

In [None]:
import pandas as pd
from transformers import pipeline, DistilBertTokenizer

def get_sentiment(text, classifier):
    """Score a single review comment with the sentiment classifier.

    The text is tokenized with the module-level ``tokenizer``, limited to
    510 tokens (512 minus the [CLS] and [SEP] tokens), converted back to a
    string and passed to ``classifier``.

    Args:
        text: The comment to score. Values that are not ``str`` (for example
            a NaN coming from an empty CSV cell) are not scored.
        classifier: Callable that, given one string, returns a sequence whose
            first element is an iterable of ``{"label": ..., "score": ...}``
            dictionaries.

    Returns:
        A dict mapping each label reported by the classifier to its score.
        When ``text`` is not a string, or when tokenizing/classifying raises,
        a dict with the keys ``"positive"``, ``"negative"`` and ``"neutral"``
        all set to NaN is returned instead.
    """
    max_tokens = 510  # 512 - 2 (for [CLS] and [SEP])

    # Built per call so callers never share (and accidentally mutate) one dict.
    nan_scores = {
        "positive": float("NaN"),
        "negative": float("NaN"),
        "neutral": float("NaN"),
    }

    # A missing comment is expected data, not an error: skip the model and
    # avoid printing an "Error: ..." line for every empty review.
    if not isinstance(text, str):
        return nan_scores

    try:
        tokens = tokenizer.tokenize(text)[:max_tokens]
        truncated_text = tokenizer.convert_tokens_to_string(tokens)

        results = classifier(truncated_text)
        return {result["label"]: result["score"] for result in results[0]}

    except Exception as e:
        # Best-effort batch scoring: report the failure and keep going with
        # NaN scores for this comment rather than aborting the whole run.
        print(f"Error: {e}")
        return nan_scores

# Model checkpoint shared by the classification pipeline and the tokenizer.
MODEL_NAME = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
# Score columns added to the reviews frame.
SENTIMENT_LABELS = ["positive", "negative", "neutral"]

# get_sentiment reads `tokenizer` as a module-level name, so both objects keep
# their original names.
classifier = pipeline("text-classification", model=MODEL_NAME, return_all_scores=True)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

tokyo_reviews = cities_review_df["tokyo"]
tokyo_comments = tokyo_reviews["comments"]
total_comments = len(tokyo_comments)

sentiment_result = []  # List of dictionaries containing the sentiment result
for i, comment in enumerate(tokyo_comments):
    if i % 10000 == 0:
        print(f"Processing comment {i+1} out of {total_comments}")
    sentiment_result.append(get_sentiment(comment, classifier))

# Update the dataframe with the sentiment result.
# One frame built from all results and assigned column-wise replaces three
# scalar .loc writes per row. Reusing the reviews' own index keeps result i
# attached to the i-th row in iteration order, whatever the index labels are.
sentiment_scores = pd.DataFrame(
    sentiment_result,
    columns=SENTIMENT_LABELS,
    index=tokyo_reviews.index,
)
for label in SENTIMENT_LABELS:
    tokyo_reviews[label] = sentiment_scores[label]

# Create a csv
tokyo_reviews.to_csv("./data/reviews_tokyo_sentiment.csv", index=False)


In [None]:
# Show the first rows of the Tokyo reviews frame.
tokyo_preview = cities_review_df["tokyo"].head()
print(tokyo_preview)
