In [1]:
# Install & Import Libraries
import os
import pandas as pd
import nltk
import string
import zipfile
import shutil
from itertools import combinations
from nltk.corpus import stopwords

# Download NLTK stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yanni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Kaggle API Setup (Requires kaggle.json or env variables)
os.environ['KAGGLE_USERNAME'] = "yannisleivaditis"  # Replace with your Kaggle username
os.environ['KAGGLE_KEY'] = "84ba8bb53bcf2e0d8ede71a6e9fba8fd"  # Replace with your Kaggle key


In [3]:
# Download Dataset from Kaggle
import os

KAGGLE_DATASET = "mohamedbakhet/amazon-books-reviews"
DOWNLOAD_DIR = "kaggle_dataset"
csv_file_name = "Books_rating.csv"
csv_file_path = os.path.join(DOWNLOAD_DIR, csv_file_name)

# Create directory if it doesn't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Download only if the CSV doesn't already exist
if not os.path.isfile(csv_file_path):
    print(f"{csv_file_name} not found. Downloading from Kaggle...")
    os.system(f'kaggle datasets download -d {KAGGLE_DATASET} -p {DOWNLOAD_DIR} --unzip')
else:
    print(f"{csv_file_name} already exists. Skipping download.")

# Load and Check Dataset
if not os.path.isfile(csv_file_path):
    raise FileNotFoundError(f"{csv_file_name} not found in {DOWNLOAD_DIR}")



Books_rating.csv already exists. Skipping download.


In [4]:
# Configuration
STOP_WORDS = set(stopwords.words('english'))
REVIEW_COLUMN = 'review/text'
MAX_REVIEWS = 100  # Limit reviews for performance
TOP_K = 3          # Top similar pairs to show


In [5]:
# Text Preprocessing
def preprocess_text(text):
    text = text.lower()
    for p in string.punctuation:
        text = text.replace(p, '')
    return [word for word in text.split() if word not in STOP_WORDS]


In [6]:
# Jaccard Similarity Function
def jaccard_similarity(tokens1, tokens2):
    intersection = []
    for w in tokens1:
        if w in tokens2 and w not in intersection:
            intersection.append(w)

    union = list(tokens1)
    for w in tokens2:
        if w not in union:
            union.append(w)

    if len(union) == 0:
        return 0.0
    return len(intersection) / len(union)


In [7]:
# Load and Preprocess Reviews
def load_reviews(path):
    df = pd.read_csv(path)
    if REVIEW_COLUMN not in df.columns:
        raise ValueError(f"Column '{REVIEW_COLUMN}' not found in the dataset.")
    df = df[[REVIEW_COLUMN]].dropna().drop_duplicates()
    df = df[df[REVIEW_COLUMN].str.strip() != '']
    df = df.head(MAX_REVIEWS).reset_index(drop=True)
    df['tokens'] = df[REVIEW_COLUMN].apply(preprocess_text)
    return df


In [8]:
# Compare All Review Pairs
def find_similar_reviews(df):
    similarities = []
    for i, j in combinations(range(len(df)), 2):
        sim = jaccard_similarity(df.at[i, 'tokens'], df.at[j, 'tokens'])
        similarities.append(((i, j), sim))
    
    top_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:TOP_K]
    return top_similarities


In [9]:
# Display Top Similar Pairs
def print_top_similar_pairs(df, top_pairs):
    print("\n=== Top Similar Review Pairs (Jaccard Similarity) ===")
    for (i, j), sim_score in top_pairs:
        print(f"\n[Pair: Review {i} and Review {j}]")
        print(f"Jaccard Similarity: {sim_score:.4f}")
        print("→ Review", i, ":", df.at[i, REVIEW_COLUMN][:300], "...")
        print("→ Review", j, ":", df.at[j, REVIEW_COLUMN][:300], "...")


In [10]:
#  Run the Full Pipeline
df = load_reviews(csv_file_path)
top_similar = find_similar_reviews(df)
print_top_similar_pairs(df, top_similar)



=== Top Similar Review Pairs (Jaccard Similarity) ===

[Pair: Review 24 and Review 29]
Jaccard Similarity: 0.1690
→ Review 24 : I just finished reading Whisper of the Wicked saints. I fell in love with the caracters. I expected an average romance read, but instead I found one of my favorite books of all time. Just when I thought I could predict the outcome I was shocked ! The writting was so descriptive that my heart broke w ...
→ Review 29 : I am an avid reader and I was shocked at how hooked I became on this book. I thought the first chapter was a little long and a little too discriptive, but truth be told after that I could not put this down. I read the other reviews on Whispers of the wicked saints before I wrote this and I saw one b ...

[Pair: Review 52 and Review 68]
Jaccard Similarity: 0.1429
→ Review 52 : This play was excellent. It's very smart, intellectually and morally meaty, and fast. I highly recommend it. Especially good material to ponder for people who in today's age