<a href="https://colab.research.google.com/github/tanvimagdum/Customer-Feedback-Analysis-and-Summarization/blob/main/MiniProject_IDMP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("cynthiarempel/amazon-us-customer-reviews-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/cynthiarempel/amazon-us-customer-reviews-dataset?dataset_version_number=9...


100%|██████████| 21.0G/21.0G [04:21<00:00, 86.1MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/cynthiarempel/amazon-us-customer-reviews-dataset/versions/9


In [None]:
# Data Loading and Setup

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("AmazonReviewsAnalysis").getOrCreate()
df = spark.read.csv(path, sep="\t", header=True, inferSchema=True)
df.printSchema()

In [12]:
df.printSchema()

root
 |-- marketplace\tcustomer_id\treview_id\tproduct_id\tproduct_parent\tproduct_title\tproduct_category\tstar_rating\thelpful_votes\ttotal_votes\tvine\tverified_purchase\treview_headline\treview_body\treview_date: string (nullable = true)



In [11]:
# Data Preprocessing

from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Convert text to lowercase and remove punctuation
df_cleaned = df.withColumn("review_body", lower(col("review_body")))
df_cleaned = df_cleaned.withColumn("review_body", regexp_replace(col("review_body"), "[^a-zA-Z0-9\\s]", ""))

# Tokenization
tokenizer = Tokenizer(inputCol="review_body", outputCol="tokens")
df_tokenized = tokenizer.transform(df_cleaned)

# Remove stop words
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
df_filtered = remover.transform(df_tokenized)

# Filter Reviews by Ratings
df_positive = df_filtered.filter(col("star_rating") >= 4)
df_negative = df_filtered.filter(col("star_rating") <= 2)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `review_body` cannot be resolved. Did you mean one of the following? [`marketplace	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	vine	verified_purchase	review_headline	review_body	review_date`].;
'Project [marketplace	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	vine	verified_purchase	review_headline	review_body	review_date#27, lower('review_body) AS review_body#31]
+- Project [marketplace	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	vine	verified_purchase	review_headline	review_body	review_date#24 AS marketplace	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	vine	verified_purchase	review_headline	review_body	review_date#27]
   +- Project [marketplace	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	vine	verified_purchase	review_headline	review_body	review_date#17 AS marketplace	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	vine	verified_purchase	review_headline	review_body	review_date#24]
      +- Relation [marketplace	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	vine	verified_purchase	review_headline	review_body	review_date#17] csv


In [None]:
# Text Analysis and Processing

# Sentiment Analysis
from pyspark.sql.functions import when

# Add Sentiment Column
df_filtered = df_filtered.withColumn(
    "sentiment",
    when(col("star_rating") >= 4, "positive")
    .when(col("star_rating") <= 2, "negative")
    .otherwise("neutral")
)

# Key Phrase Extraction
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="filtered_tokens", outputCol="key_phrases", vocabSize=50)
model = cv.fit(df_filtered)
df_with_phrases = model.transform(df_filtered)


In [None]:
from pyspark.sql.functions import concat_ws

# Combine tokens back to sentences
df_filtered = df_filtered.withColumn("summary", concat_ws(" ", col("filtered_tokens")))

# Summaries for each sentiment
df_positive_summary = df_filtered.filter(col("sentiment") == "positive").select("summary").show(5, truncate=False)
df_negative_summary = df_filtered.filter(col("sentiment") == "negative").select("summary").show(5, truncate=False)


In [None]:
insights = df_filtered.groupBy("product_category", "sentiment").count()
insights.show()

In [None]:
from pyspark.sql.functions import round

total_reviews = df_filtered.groupBy("product_category").count()
sentiment_stats = insights.withColumn("percentage", round((col("count") / total_reviews["count"]) * 100, 2))
sentiment_stats.show()