# Yelp Business Reviews - Sentiment Analysis

## Importing Libraries (will be edited)

In [49]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
#import findspark
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import IDF, StopWordsRemover, Tokenizer, VectorAssembler, CountVectorizer
from pyspark.ml.linalg import Vector
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, length, lit, lower, regexp_replace, size, split, filter, udf
from pyspark.sql.functions import sum as spark_sum
from pyspark.sql.functions import when
from pyspark.sql.types import ArrayType,StringType
from pyspark.ml.classification import LogisticRegression

## Creating Spark Session

In [39]:
""" UNCOMMENT WHEN RUNNING ON LOCAL !!!
# Create a spark session
findspark.init()
spark = SparkSession.builder \
    .appName("Yelp_Sentiment_Analysis") \
    .master("local[8]") \
    .config("spark.executor.memory", "10g") \
    .config("spark.executor.cores", "3") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

### Patlarsa dene
#.config("spark.memory.offHeap.enabled","true") 
#.config("spark.memory.offHeap.size","10g")
"""

In [2]:
# Read back the configuration of the active Spark session.
# NOTE(review): assumes `spark` is provided by the runtime, since the cell that
# builds a local SparkSession above is commented out.
conf = spark.sparkContext.getConf()

# Print the settings that matter for sizing the job. conf.get() yields None
# for a key that was never set, so a missing setting prints as "None".
for key in (
    "spark.app.name",
    "spark.master",
    "spark.executor.memory",
    "spark.executor.cores",
    "spark.driver.memory",
):
    print(f"{key} = ", conf.get(key))

In [41]:
"""
# Source file pathways on local disk
path_business = "../yelp_academic_dataset_business.json"
path_review = "../yelp_academic_dataset_review.json"
positive_path = "../positive_words.txt"
negative_path = "../negative_words.txt"
"""

In [3]:

# Source file locations on ABFSS; all four files live directly under the same
# container root.
_abfss_root = "abfss://unity-catalog-storage@dbstoragepwtbcgut4qtn6.dfs.core.windows.net"
path_business = f"{_abfss_root}/yelp_academic_dataset_business.json"
path_review = f"{_abfss_root}/yelp_academic_dataset_review.json"
positive_path = f"{_abfss_root}/positive_words.txt"
negative_path = f"{_abfss_root}/negative_words.txt"


## Preprocessing of Business Dataset

In [4]:
# Import business data
business = spark.read.json(path_business)

In [44]:
# Print first 10 rows
business.show(n=10, truncate=10)

In [5]:
# Drop irrelevant columns
business = business.drop("hours").drop("attributes")

In [46]:
# Print current dataset schema
business.printSchema()

In [6]:
# Filter out rows with null 'categories'
business = business.filter(col("categories").isNotNull())

In [9]:
# Keep only businesses whose 'state' is one of the 50 US state codes or DC.
states = (
    "AL AK AZ AR CA CO CT DC DE FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS "
    "MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY"
).split()

usa = business.filter(col("state").isin(states))

In [10]:
# Print out the total number of businesses in dataset
print(f"Number of US businesses in the dataset: {usa.count()}")

In [11]:
# Filter restaurants and print out the total number
us_restaurants = usa.filter(usa["categories"].contains("Restaurants"))
print(f"Number of total US restaurants in the dataset: {us_restaurants.count()}")

In [12]:
# Give every restaurant a single cuisine label in a new 'category' column.
# The cuisines are tried in the order listed: the first one whose name occurs
# in 'categories' wins, and rows matching none of them get a null 'category'.
# NOTE(review): later cells in this file call get_dataset() with "Japanese",
# "Chinese", "Vietnamese" and "Italian", none of which this mapping produces.
_cuisine_label = None
for _cuisine in (
    "Korean",
    "Thai",
    "French",
    "Greek",
    "Indian",
    "Hawaiian",
    "African",
    "Spanish",
):
    _mentions_cuisine = col("categories").contains(_cuisine)
    if _cuisine_label is None:
        _cuisine_label = when(_mentions_cuisine, _cuisine)
    else:
        _cuisine_label = _cuisine_label.when(_mentions_cuisine, _cuisine)

us_restaurants = us_restaurants.withColumn("category", _cuisine_label)

In [13]:
# Drop the old category column and filter out null categories
us_restaurants = us_restaurants.drop("categories")
us_restaurants = us_restaurants.filter(col("category").isNotNull())

In [14]:
# Print out the number of restaurants that are labeled by their cuisine types
print(f"Number of labeled US restaurants in the dataset: {us_restaurants.count()}")

In [54]:
# Check if the dataset contains any duplicated businesses
us_restaurants.groupBy("business_id").count().filter(col("count") > 1).count()

## Preprocessing of Review Dataset

In [15]:
# Import review data
review = spark.read.json(path_review)

In [16]:
# Print first 10 rows
review.show(n=10, truncate=20)

In [57]:
# Print current dataset schema
review.printSchema()

## Merging Business and Review Datasets

In [17]:
# Change the names of 'stars' columns in both datasets to avoid confusion
us_restaurants = us_restaurants.withColumnRenamed("stars", "avg_star")
review = review.withColumnRenamed("stars", "review_star")

In [18]:
# Merge the dataframes on 'business_id'
restaurants_reviews = us_restaurants.join(review, on="business_id", how="inner")

In [60]:
# Print the schema of merged dataframe
restaurants_reviews.printSchema()

## Preprocessing of Restaurants_Reviews Dataset

In [19]:
# Derive a sentiment label from the star rating of each review:
#   4 or 5 stars -> "positive"
#   3 stars      -> "neutral"
#   1 or 2 stars -> "negative"
_stars = restaurants_reviews["review_star"]
_sentiment = (
    when(_stars >= 4, "positive")
    .when(_stars == 3, "neutral")
    .when(_stars < 3, "negative")
)
restaurants_reviews = restaurants_reviews.withColumn("labels", _sentiment)

In [20]:
# Keep only the reviews whose label is not "neutral", then report how many
# positive and negative reviews remain.
_not_neutral = restaurants_reviews["labels"] != "neutral"
restaurants_reviews = restaurants_reviews.filter(_not_neutral)
print(f'The number of positive and negative reviews : {restaurants_reviews.count()}')

In [21]:
# Show the distribution of reviews by category
restaurants_reviews.groupBy('category').count().sort('count',ascending = False).show()

## Data Transformation on Reviews

In [240]:
# Show 'text' column before transformation
restaurants_reviews.select("text").show(10, truncate=80)

In [22]:
# Convert 'text' to lowercase
restaurants_reviews = restaurants_reviews.withColumn("text", F.lower(F.col("text")))

In [23]:
# Normalise the review text with regular expressions and store the result in
# a new 'text_clean' column. The patterns are raw strings so that "\s" reaches
# the regex engine as written instead of being an invalid Python escape.

# 1. Replace every character that is neither alphanumeric nor whitespace
#    with a space.
_cleaned = F.regexp_replace(F.col("text"), r"[^a-zA-Z0-9\s]", " ")

# 2. Collapse each run of whitespace into a single space. Line breaks are
#    whitespace too, so they are handled here and need no separate pass.
_cleaned = F.regexp_replace(_cleaned, r"\s+", " ")

# 3. Remove whitespace left at the end of the string.
_cleaned = F.regexp_replace(_cleaned, r"\s+$", "")

restaurants_reviews = restaurants_reviews.withColumn("text_clean", _cleaned)

In [24]:
# Show 'text_clean' column after transformation
restaurants_reviews.select("text_clean").show(10, truncate=50)

In [25]:
#Drop unnecessary columns
restaurants_reviews = restaurants_reviews.select(['category','text_clean','labels'])

In [26]:
#Print the schema
restaurants_reviews.printSchema()

In [27]:
#Cache the processed DataFrame
restaurants_reviews.cache()
restaurants_reviews.storageLevel

## Positive and Negative Words

In [28]:
# Two plain-text files list English words with positive and negative meaning.
# They are the lexicon used to decide whether a word in a review is positive
# or negative.

# Load each file as an RDD of lines and bring the lines to the driver.
positive_rdd = spark.sparkContext.textFile(positive_path)
negative_rdd = spark.sparkContext.textFile(negative_path)
positive_words = positive_rdd.collect()
negative_words = negative_rdd.collect()

# Generic sentiment words that say nothing specific about a restaurant.
positive_useless = [
    "great", "amazing", "love", "best", "awesome", "excellent", "good",
    "favorite", "loved", "perfect", "gem", "perfectly", "wonderful", "happy",
    "enjoyed", "nice", "well", "super", "like", "better", "decent", "fine",
    "pretty", "enough", "excited", "impressed", "ready", "fantastic", "glad",
    "right", "fabulous",
]
negative_useless = [
    "bad", "disappointed", "unfortunately", "disappointing", "horrible",
    "lacking", "terrible", "sorry", "disappoint", "worst",
]

# Remove the generic words from both lexicons, keeping the original order.
_skip_positive = set(positive_useless)
_skip_negative = set(negative_useless)
positive_words = [word for word in positive_words if word not in _skip_positive]
negative_words = [word for word in negative_words if word not in _skip_negative]

# Peek at the first 10 entries of each list.
for _lexicon in (positive_words, negative_words):
    print(_lexicon[:10])

In [29]:
# Define a function that filters rows based on the 'category' parameter
def get_dataset(category):
    """Return the reviews of one cuisine.

    Takes a category name as a string and returns the rows of
    `restaurants_reviews` whose 'category' equals it, reduced to the
    'text_clean' and 'labels' columns.
    """
    belongs_to_category = col("category") == category
    return restaurants_reviews.where(belongs_to_category).select("text_clean", "labels")

In [30]:
def filter_words(col):
    """Keep only the sentiment-lexicon words of a review.

    Args:
        col: Review text as a whitespace-separated string. ``None`` or an
            empty string is treated as a review without words.

    Returns:
        The tokens of ``col`` that occur in ``positive_words`` or
        ``negative_words``, in their original order, duplicates included.
    """
    # A null review would otherwise fail on .split() inside the Spark UDF.
    if not col:
        return []

    # Build the lookup set once per call instead of concatenating the two
    # lists and scanning the result linearly for every single token.
    lexicon = set(positive_words).union(negative_words)
    return [token for token in col.split() if token in lexicon]

From this point on, every step is carried out for 'African' restaurants only.

In [132]:
African = get_dataset("African")

In [133]:
African = African.withColumn("text_filtered", udf(filter_words, returnType=ArrayType(StringType()))(col("text_clean")))
#African = African.drop('text_clean')

In [54]:
African.show(5, truncate = 40)

In [37]:
African.dtypes

In [134]:
vectorizer = CountVectorizer(inputCol = 'text_filtered', outputCol = 'vector')
print(type(vectorizer))
# Fit the CountVectorizer object to the training data
vectorizer_model = vectorizer.fit(African)

In [135]:
African_vec = vectorizer_model.transform(African)

In [136]:
African_vec = African_vec.withColumn('labels_bool', when(col('labels') == 'positive' , 1).otherwise(0))

In [121]:
African_vec.show(5,truncate = 180)

In [137]:
lr_model = LogisticRegression(featuresCol = 'vector', labelCol = 'labels_bool').fit(African_vec)

In [138]:
coeff_arr = lr_model.coefficients.toArray()
coeff_arr[:10]

In [139]:
coeff_arr = [float(coeff) for coeff in coeff_arr]
coeff_arr[:10]

In [140]:
vocab_arr = vectorizer_model.vocabulary
vocab_arr[:10]

In [141]:
# Pair each vocabulary word with its coefficient, position by position.
data = list(zip(vocab_arr, coeff_arr))
data[:3]

In [142]:
African_df = spark.createDataFrame(data,['word','coeff'])
African_df.show(3)

In [143]:
#Inspection
African_df.sort('coeff',ascending = False).show(10)

In [145]:
from pyspark.sql.functions import array_contains
African.filter(array_contains(col('text_filtered'),'bored')).show(truncate = 100)

In [120]:
print(African_df.count())

In [125]:
African.groupby('labels').count().show()

In [None]:
# How should the per-word frequency be computed?

In [95]:
"""
coeff = svm.coef_[0]
Korean_words_score = pd.DataFrame(
    {"score": coeff, "word": vectorizer.get_feature_names()}
)
"""

In [50]:
Korean_words_score

In [79]:
## get frequency of each word in all reviews in specific category
# Builds one row per review and one column per vocabulary word, appends the
# review labels as the last column, then sums the counts over the rows
# labelled "negative"; the [:-1] slice drops the summed 'labels' entry.
# NOTE(review): in the visible part of this file `vectorizer_model` is bound
# to the model returned by the pyspark CountVectorizer's fit(), and
# `class_train` is only assigned inside get_polarity_score. This cell looks
# like a leftover from a pandas/scikit-learn version of the notebook; confirm
# before running it.
Korean_reviews = pd.DataFrame(
    vectorizer_model.toarray(), columns=vectorizer.get_feature_names()
)
Korean_reviews["labels"] = class_train
Korean_frequency = Korean_reviews[Korean_reviews["labels"] == "negative"].sum()[:-1]

In [52]:
Korean_reviews

In [53]:
Korean_words_score.set_index("word", inplace=True)

In [80]:
Korean_polarity_score = Korean_words_score
Korean_polarity_score["frequency"] = Korean_frequency

In [55]:
print(Korean_polarity_score)

In [81]:
## calculate polarity score
Korean_polarity_score["polarity"] = (
    Korean_polarity_score.score
    * Korean_polarity_score.frequency
    / Korean_reviews.shape[0]
)

In [57]:
Korean_polarity_score

In [82]:
Korean_polarity_score.polarity = Korean_polarity_score.polarity.astype(float)
Korean_polarity_score.frequency = Korean_polarity_score.frequency.astype(float)

In [83]:
Koreann_dnm = Korean_polarity_score[Korean_polarity_score.polarity < 0].sort_values(
    "polarity", ascending=True
)

In [84]:
Koreann_dnm[:10]

In [85]:
negative_top10 = Koreann_dnm[:10]

In [78]:
# Ten words with the largest positive polarity, highest first.
# Koreann_dnm only holds the negative-polarity rows, so slicing it would just
# repeat negative_top10; the positive rows are selected from the full table.
positive_top10 = Korean_polarity_score[
    Korean_polarity_score.polarity > 0
].sort_values("polarity", ascending=False)[:10]
positive_top10

In [63]:
Koreann_dnm.info()

In [115]:
import matplotlib.pyplot as plt

# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = positive_top10.index[::-1]
polarity_scores = positive_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
plt.barh(words, polarity_scores, color="#0B4F6C")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title and axis captions.
plt.title("Polarity Scores of Words for Positive Reviews in Korean Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [114]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = negative_top10.index[::-1]
polarity_scores = negative_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure; the sign is flipped so the
# negative polarities are drawn as positive bar lengths.
plt.figure(figsize=(12, 6))
plt.barh(words, -polarity_scores, color="#B80C09")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title, axis captions and a fixed x-range.
plt.title("Polarity Scores of Words for Negative Reviews in Korean Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")
plt.xlim(0.003, 0.013)

# Tighten the layout, save the figure to disk, then render.
plt.tight_layout()
plt.savefig("red_plot.png")
plt.show()

In [128]:
from pyspark.sql.functions import col, split
from sklearn.feature_extraction.text import CountVectorizer


def get_polarity_score(dataset, review_type):
    """Score every sentiment-lexicon word found in a set of reviews.

    A linear SVM is trained on bag-of-words counts to separate the review
    labels. A word's polarity is its SVM coefficient multiplied by how often
    the word occurs in reviews of ``review_type``, divided by the number of
    reviews.

    Args:
        dataset: Spark DataFrame with a ``labels`` column and the review text
            in a ``text`` column or, as returned by ``get_dataset``, in
            ``text_clean``. All rows are collected to the driver.
        review_type: ``"positive"`` counts word frequencies over positive
            reviews; any other value counts them over negative reviews.

    Returns:
        pandas DataFrame indexed by ``word`` with float columns ``score``,
        ``frequency`` and ``polarity``. Generic sentiment words ("great",
        "bad", ...) are removed when present.
    """
    # Imported locally: the name CountVectorizer is also used for the pyspark
    # class in this file, and LinearSVC is not imported anywhere else.
    from sklearn.feature_extraction.text import CountVectorizer as SkCountVectorizer
    from sklearn.svm import LinearSVC

    text_col = "text" if "text" in dataset.columns else "text_clean"

    # Collect text and label together so both come from the same job and stay
    # aligned row by row.
    rows = dataset.select(text_col, "labels").collect()
    documents = [" ".join(filter_words(row[0] or "")) for row in rows]
    labels = [row[1] for row in rows]

    vectorizer = SkCountVectorizer()
    counts = vectorizer.fit_transform(documents)  # sparse: reviews x words

    svm = LinearSVC(max_iter=10000)
    svm.fit(counts, labels)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions.
    feature_names = getattr(vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    vocabulary = list(feature_names())

    # Per-word occurrence count over the reviews of the requested type,
    # computed on the sparse matrix (no dense reviews x words table).
    target = "positive" if review_type == "positive" else "negative"
    in_target = np.array([label == target for label in labels], dtype=bool)
    frequency = np.asarray(counts[in_target].sum(axis=0)).ravel().astype(float)

    cuisine_polarity_score = pd.DataFrame(
        {"score": svm.coef_[0], "frequency": frequency},
        index=pd.Index(vocabulary, name="word"),
    )
    cuisine_polarity_score["polarity"] = (
        cuisine_polarity_score.score
        * cuisine_polarity_score.frequency
        / counts.shape[0]
    ).astype(float)

    # Drop generic sentiment words. errors="ignore" because these words are
    # normally absent already (they are removed from the lexicon), and a
    # strict lookup would raise KeyError for every missing one.
    unuseful_words = [
        "great", "amazing", "love", "best", "awesome", "excellent", "good",
        "favorite", "loved", "perfect", "gem", "perfectly", "wonderful",
        "happy", "enjoyed", "nice", "well", "super", "like", "better",
        "decent", "fine", "pretty", "enough", "excited", "impressed", "ready",
        "fantastic", "glad", "right", "fabulous",
        "bad", "disappointed", "unfortunately", "disappointing", "horrible",
        "lacking", "terrible", "sorry", "disappoint", "worst",
    ]
    cuisine_polarity_score = cuisine_polarity_score.drop(
        index=unuseful_words, errors="ignore"
    )

    return cuisine_polarity_score

In [129]:
Japanese_reviews = get_dataset("Japanese")
Japanese_train, Japanese_test = Japanese_reviews.randomSplit([0.5, 0.5])

In [130]:
Japanese_polarity_score = get_polarity_score(Japanese_train, "positive")

In [143]:
Japanese_polarity_score = get_polarity_score(Japanese_train, "negative")

In [144]:
Japanese_dnm = Japanese_polarity_score[
    Japanese_polarity_score.polarity < 0
].sort_values("polarity")
Japanese_negative_top10 = Japanese_dnm[:10]

In [131]:
# Score against positive-review frequencies in this cell itself. In file
# order, Japanese_polarity_score has already been rebound to the "negative"
# variant, so ranking it here would not give the positive top 10. A separate
# name is used so the later negative cells still see the "negative" scores.
Japanese_positive_score = get_polarity_score(Japanese_train, "positive")

# Ten words with the largest positive polarity, highest first.
Japanese_dnm = Japanese_positive_score[
    Japanese_positive_score.polarity > 0
].sort_values("polarity", ascending=False)
Japanese_positive_top10 = Japanese_dnm[:10]

In [132]:
Japanese_positive_top10

In [133]:
Japanese_dnm = Japanese_polarity_score[
    Japanese_polarity_score.polarity < 0
].sort_values("polarity")
Japanese_negative_top10 = Japanese_dnm[:10]

In [134]:
Japanese_negative_top10

In [136]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Japanese_positive_top10.index[::-1]
polarity_scores = Japanese_positive_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
plt.barh(words, polarity_scores, color="#0B4F6C")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title and axis captions.
plt.title("Polarity Scores of Words for Positive Reviews in Japanese Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [145]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Japanese_negative_top10.index[::-1]
polarity_scores = Japanese_negative_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure; the sign is flipped so the
# negative polarities are drawn as positive bar lengths.
plt.figure(figsize=(12, 6))
plt.barh(words, -polarity_scores, color="#B80C09")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title, axis captions and a fixed x-range.
plt.title("Polarity Scores of Words for Negative Reviews in Japanese Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")
plt.xlim(0.003, 0.015)

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [141]:
# Split the Thai reviews into two random halves and score the words of the
# first half against positive-review frequencies.
Thai_reviews = get_dataset("Thai")
Thai_train, Thai_test = Thai_reviews.randomSplit([0.5, 0.5])
Thai_polarity_score = get_polarity_score(Thai_train, "positive")

# Words with positive polarity, highest first; keep the first ten.
_thai_is_positive = Thai_polarity_score["polarity"] > 0
Thai_dnm = Thai_polarity_score.loc[_thai_is_positive].sort_values(
    by="polarity", ascending=False
)
Thai_positive_top10 = Thai_dnm.iloc[:10]

In [142]:
Thai_positive_top10

In [146]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Thai_positive_top10.index[::-1]
polarity_scores = Thai_positive_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
plt.barh(words, polarity_scores, color="#0B4F6C")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title and axis captions.
plt.title("Polarity Scores of Words for Positive Reviews in Thai Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [147]:
# Re-score the Thai training half against negative-review frequencies.
Thai_polarity_score = get_polarity_score(Thai_train, "negative")

# Words with negative polarity, most negative first; keep the first ten.
_thai_is_negative = Thai_polarity_score["polarity"] < 0
Thai_dnm = Thai_polarity_score.loc[_thai_is_negative].sort_values(by="polarity")
Thai_negative_top10 = Thai_dnm.iloc[:10]

Thai_negative_top10

In [148]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Thai_negative_top10.index[::-1]
polarity_scores = Thai_negative_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure; the sign is flipped so the
# negative polarities are drawn as positive bar lengths.
plt.figure(figsize=(12, 6))
plt.barh(words, -polarity_scores, color="#B80C09")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title and axis captions (no fixed x-range is applied to this chart).
plt.title("Polarity Scores of Words for Negative Reviews in Thai Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [149]:
# Split the Chinese reviews into two random halves and score the words of the
# first half against positive-review frequencies.
# NOTE(review): the cuisine labelling visible earlier in this file has no
# "Chinese" branch, so this dataset is empty unless that mapping is extended.
Chinese_reviews = get_dataset("Chinese")
Chinese_train, Chinese_test = Chinese_reviews.randomSplit([0.5, 0.5])
Chinese_polarity_score = get_polarity_score(Chinese_train, "positive")

# Words with positive polarity, highest first; keep the first ten.
_chinese_is_positive = Chinese_polarity_score["polarity"] > 0
Chinese_dnm = Chinese_polarity_score.loc[_chinese_is_positive].sort_values(
    by="polarity", ascending=False
)
Chinese_positive_top10 = Chinese_dnm.iloc[:10]

Chinese_positive_top10

In [150]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Chinese_positive_top10.index[::-1]
polarity_scores = Chinese_positive_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
plt.barh(words, polarity_scores, color="#0B4F6C")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title and axis captions.
plt.title("Polarity Scores of Words for Positive Reviews in Chinese Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [152]:
# Re-score the Chinese training half against negative-review frequencies.
Chinese_polarity_score = get_polarity_score(Chinese_train, "negative")

# Words with negative polarity, most negative first; keep the first ten.
_chinese_is_negative = Chinese_polarity_score["polarity"] < 0
Chinese_dnm = Chinese_polarity_score.loc[_chinese_is_negative].sort_values(
    by="polarity"
)
Chinese_negative_top10 = Chinese_dnm.iloc[:10]

Chinese_negative_top10

In [153]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Chinese_negative_top10.index[::-1]
polarity_scores = Chinese_negative_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure; the sign is flipped so the
# negative polarities are drawn as positive bar lengths.
plt.figure(figsize=(12, 6))
plt.barh(words, -polarity_scores, color="#B80C09")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title, axis captions and a fixed x-range.
plt.title("Polarity Scores of Words for Negative Reviews in Chinese Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")
plt.xlim(0.0050, 0.018)

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [154]:
# Split the Vietnamese reviews into two random halves and score the words of
# the first half against positive-review frequencies.
# NOTE(review): the cuisine labelling visible earlier in this file has no
# "Vietnamese" branch, so this dataset is empty unless that mapping is extended.
Vietnamese_reviews = get_dataset("Vietnamese")
Vietnamese_train, Vietnamese_test = Vietnamese_reviews.randomSplit([0.5, 0.5])
Vietnamese_polarity_score = get_polarity_score(Vietnamese_train, "positive")

# Words with positive polarity, highest first; keep the first ten.
_vietnamese_is_positive = Vietnamese_polarity_score["polarity"] > 0
Vietnamese_dnm = Vietnamese_polarity_score.loc[_vietnamese_is_positive].sort_values(
    by="polarity", ascending=False
)
Vietnamese_positive_top10 = Vietnamese_dnm.iloc[:10]

Vietnamese_positive_top10

In [155]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Vietnamese_positive_top10.index[::-1]
polarity_scores = Vietnamese_positive_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
plt.barh(words, polarity_scores, color="#0B4F6C")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title and axis captions.
plt.title("Polarity Scores of Words for Positive Reviews in Vietnamese Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [156]:
# Re-score the Vietnamese training half against negative-review frequencies.
Vietnamese_polarity_score = get_polarity_score(Vietnamese_train, "negative")

# Words with negative polarity, most negative first; keep the first ten.
_vietnamese_is_negative = Vietnamese_polarity_score["polarity"] < 0
Vietnamese_dnm = Vietnamese_polarity_score.loc[_vietnamese_is_negative].sort_values(
    by="polarity"
)
Vietnamese_negative_top10 = Vietnamese_dnm.iloc[:10]

Vietnamese_negative_top10

In [157]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Vietnamese_negative_top10.index[::-1]
polarity_scores = Vietnamese_negative_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure; the sign is flipped so the
# negative polarities are drawn as positive bar lengths.
plt.figure(figsize=(12, 6))
plt.barh(words, -polarity_scores, color="#B80C09")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title, axis captions and a fixed x-range.
plt.title("Polarity Scores of Words for Negative Reviews in Vietnamese Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")
plt.xlim(0.002, 0.016)

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [158]:
# Split the French reviews into two random halves and score the words of the
# first half against positive-review frequencies.
French_reviews = get_dataset("French")
French_train, French_test = French_reviews.randomSplit([0.5, 0.5])
French_polarity_score = get_polarity_score(French_train, "positive")

# Words with positive polarity, highest first; keep the first ten.
_french_is_positive = French_polarity_score["polarity"] > 0
French_dnm = French_polarity_score.loc[_french_is_positive].sort_values(
    by="polarity", ascending=False
)
French_positive_top10 = French_dnm.iloc[:10]

French_positive_top10

In [159]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = French_positive_top10.index[::-1]
polarity_scores = French_positive_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
plt.barh(words, polarity_scores, color="#0B4F6C")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title and axis captions.
plt.title("Polarity Scores of Words for Positive Reviews in French Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [160]:
# Re-score the French training half against negative-review frequencies.
French_polarity_score = get_polarity_score(French_train, "negative")

# Words with negative polarity, most negative first; keep the first ten.
_french_is_negative = French_polarity_score["polarity"] < 0
French_dnm = French_polarity_score.loc[_french_is_negative].sort_values(
    by="polarity"
)
French_negative_top10 = French_dnm.iloc[:10]

French_negative_top10

In [161]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = French_negative_top10.index[::-1]
polarity_scores = French_negative_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure; the sign is flipped so the
# negative polarities are drawn as positive bar lengths.
plt.figure(figsize=(12, 6))
plt.barh(words, -polarity_scores, color="#B80C09")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title, axis captions and a fixed x-range.
plt.title("Polarity Scores of Words for Negative Reviews in French Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")
plt.xlim(0.003, 0.015)

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [162]:
# Split the Italian reviews into two random halves and score the words of the
# first half against positive-review frequencies.
# NOTE(review): the cuisine labelling visible earlier in this file has no
# "Italian" branch, so this dataset is empty unless that mapping is extended.
Italian_reviews = get_dataset("Italian")
Italian_train, Italian_test = Italian_reviews.randomSplit([0.5, 0.5])
Italian_polarity_score = get_polarity_score(Italian_train, "positive")

# Words with positive polarity, highest first; keep the first ten.
_italian_is_positive = Italian_polarity_score["polarity"] > 0
Italian_dnm = Italian_polarity_score.loc[_italian_is_positive].sort_values(
    by="polarity", ascending=False
)
Italian_positive_top10 = Italian_dnm.iloc[:10]

Italian_positive_top10

In [163]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Italian_positive_top10.index[::-1]
polarity_scores = Italian_positive_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
plt.barh(words, polarity_scores, color="#0B4F6C")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title and axis captions.
plt.title("Polarity Scores of Words for Positive Reviews in Italian Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")

# Tighten the layout and render.
plt.tight_layout()
plt.show()

In [164]:
# Re-score the Italian training half against negative-review frequencies.
Italian_polarity_score = get_polarity_score(Italian_train, "negative")

# Words with negative polarity, most negative first; keep the first ten.
_italian_is_negative = Italian_polarity_score["polarity"] < 0
Italian_dnm = Italian_polarity_score.loc[_italian_is_negative].sort_values(
    by="polarity"
)
Italian_negative_top10 = Italian_dnm.iloc[:10]

Italian_negative_top10

In [165]:
# Take the words (index) and their polarity from the top-10 table, flipping
# the row order so the table's first row is the last bar handed to barh.
words = Italian_negative_top10.index[::-1]
polarity_scores = Italian_negative_top10["polarity"][::-1]

# Horizontal bar chart on a fresh 12x6 figure; the sign is flipped so the
# negative polarities are drawn as positive bar lengths.
plt.figure(figsize=(12, 6))
plt.barh(words, -polarity_scores, color="#B80C09")
plt.xticks(rotation=45, ha="right")  # tilt the x tick labels

# Title, axis captions and a fixed x-range.
plt.title("Polarity Scores of Words for Negative Reviews in Italian Restaurants")
plt.xlabel("Polarity Score")
plt.ylabel("Words")
plt.xlim(0.003, 0.013)

# Tighten the layout and render.
plt.tight_layout()
plt.show()