In [2]:
import json
import pandas as pd
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col,lower
import pyspark.sql.functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

## 1. Read the cleaned dataset

In [4]:
# Load the cleaned reviews: the file is parsed as JSON lines (one object per line).
# The handle deliberately is NOT called `f`: `f` is also the global SimHash bit
# width used by the deduplication cells, and re-running this cell must not clobber it.
with open('/Users/celine/Desktop/5430nlp/group/English_Only_More_than_10_reviews.json', 'r', encoding='utf-8') as reviews_file:
    # Stream the file line by line instead of materialising it with readlines();
    # blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
    data = [json.loads(line) for line in reviews_file if line.strip()]

reviews = [article['text'] for article in data]
# (total reviews, distinct review texts) -- a gap between the two means exact duplicates exist.
len(reviews), len(set(reviews))

(128419, 128224)

In [5]:
# Number of distinct businesses among the loaded reviews.
len({record['business_id'] for record in data})

2276

## 2. Making copies of the original articles to avoid changing the imported data

In [6]:
# Shallow-copy every imported record and stamp the copy with its position in
# `data` as 'review_id' (overwriting any existing 'review_id' key in the copy);
# the dicts held by `data` are left untouched.
feeds = [
    {**record, 'review_id': position}
    for position, record in enumerate(data)
]

## 3. Run deduplication within each business

#### 1. Construct a list of lists: the outer list has one entry per business_id, and each inner list holds the reviews for that business_id

In [8]:
# Group the reviews by business in a single pass over `feeds`.
# A dict keyed by business_id replaces the previous "rescan all feeds for every
# new business" approach; dicts keep insertion order, so businesses appear in
# order of first occurrence and the reviews inside each group keep their
# original relative order.
reviews_by_business = {}
for feed in feeds:
    reviews_by_business.setdefault(feed['business_id'], []).append(feed)

# business_id_list[i] is the business whose reviews are stored in BusinessReviews[i].
business_id_list = list(reviews_by_business)
BusinessReviews = list(reviews_by_business.values())

len(BusinessReviews)

2276

In [9]:
from simhash import Simhash, SimhashIndex

f = 64 # the number of bits, the default is 64

# Map each business_id (as a string) to the list of (review_id, Simhash) pairs
# of its reviews; both the id and the text are coerced to str before hashing.
# The business_id is read from the first review of each group.
objs = {
    str(group[0]['business_id']): [
        (str(item['review_id']), Simhash(str(item['text']), f=f))
        for item in group
    ]
    for group in BusinessReviews
}

len(objs)

2276

In [10]:
# Drop near-duplicate reviews within each business.
# Recorded result on the full dataset: 123861 reviews kept.
hamming_distance = 15  # tolerance `k` handed to SimhashIndex

deduplicated_feed_data = []

for all_reviews in BusinessReviews:
    businessid = str(all_reviews[0]['business_id'])
    # Reuse the (review_id, Simhash) pairs already stored in `objs` instead of
    # re-hashing every text: this avoids duplicate work and guarantees that the
    # query hashes use the same bit width `f` as the index (the previous code
    # hard-coded f=64 for the queries). A missing business now fails loudly
    # with a KeyError instead of silently building an empty index.
    hashed_reviews = objs[businessid]
    index = SimhashIndex(hashed_reviews, k=hamming_distance, f=f)

    for review_key, review_hash in hashed_reviews:
        # Ids of all indexed reviews of this business within the tolerance;
        # a single hit is taken to be the queried review itself.
        duplicates = index.get_near_dups(review_hash)

        # Keep a review when it has no near-duplicates, or when its id is the
        # smallest of its near-duplicate set. Ids are compared as strings
        # (lexicographically), exactly as before, so the kept set is unchanged.
        if len(duplicates) == 1 or review_key == min(duplicates):
            # review_id is the review's position in `feeds`.
            deduplicated_feed_data.append(feeds[int(review_key)])

print(len(deduplicated_feed_data))

Big bucket found. key:a:0, len:220
Big bucket found. key:a:4, len:204
Big bucket found. key:7:1, len:243
Big bucket found. key:6:2, len:270
Big bucket found. key:a:f, len:210
Big bucket found. key:a:0, len:220
Big bucket found. key:7:1, len:243
Big bucket found. key:6:2, len:270
Big bucket found. key:a:0, len:220
Big bucket found. key:a:4, len:204
Big bucket found. key:7:1, len:243
Big bucket found. key:a:4, len:204
Big bucket found. key:a:0, len:220
Big bucket found. key:7:1, len:243
Big bucket found. key:6:2, len:270
Big bucket found. key:a:4, len:204
Big bucket found. key:a:f, len:210
Big bucket found. key:6:2, len:270
Big bucket found. key:6:2, len:270
Big bucket found. key:7:1, len:243
Big bucket found. key:6:2, len:270
Big bucket found. key:a:f, len:210
Big bucket found. key:a:f, len:210
Big bucket found. key:a:4, len:204
Big bucket found. key:a:0, len:220
Big bucket found. key:a:0, len:220
Big bucket found. key:7:1, len:243
Big bucket found. key:a:4, len:204
Big bucket found. ke

123861


## 4. Evaluation of SimHash Performance

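##### In summary, for text-based reviews, cosine similarity is the more widely used similarity measure, especially when the overall content and the importance of individual words in the reviews matter. We therefore use TF-IDF cosine similarity below to check the SimHash deduplication result.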

In [11]:
# Turn the surviving reviews into a pandas DataFrame.
# NOTE: this rebinds `data`, which until now was the list of raw records;
# re-run the loading cell before re-running any earlier cell that iterates it.
data = pd.DataFrame(deduplicated_feed_data)
# Work on an independent copy restricted to the columns the evaluation needs.
selected_columns = ['business_id', 'text']
df = data.loc[:, selected_columns].copy()

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to calculate the maximum pairwise cosine similarity for each business_id
def calculate_max_similarity(df):
    """Return the highest pairwise TF-IDF cosine similarity per business.

    For every distinct ``business_id`` a TF-IDF model is fitted on that
    business's review texts only, the pairwise cosine similarity matrix is
    computed, self-similarity (the diagonal) is zeroed out, and the largest
    remaining value is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'business_id'`` and ``'text'``.

    Returns
    -------
    pandas.DataFrame
        One row per business with the columns ``'business_id'`` and
        ``'max_similarity'``, in order of first appearance of the business in
        ``df``. A business with a single review gets ``0.0``. The two columns
        are present even when ``df`` has no rows.
    """
    result_data = []

    # groupby splits the frame once instead of re-filtering the whole frame
    # with a boolean mask for every business; sort=False keeps the groups in
    # order of first appearance, matching df['business_id'].unique().
    for business_id, subset_df in df.groupby('business_id', sort=False):
        # Use TF-IDF vectorization for the review texts of this business only
        tfidf_matrix = TfidfVectorizer().fit_transform(subset_df['text'])

        # Pairwise cosine similarity between all reviews of the business
        similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # Set diagonal elements to zero to exclude self-similarity
        np.fill_diagonal(similarity_matrix, 0)

        result_data.append({'business_id': business_id,
                            'max_similarity': similarity_matrix.max()})

    # Explicit columns keep the schema stable for an empty result, so callers
    # can still sort by 'max_similarity'.
    return pd.DataFrame(result_data, columns=['business_id', 'max_similarity'])


# Score every business and rank them from most to least similar review pair.
result_df_max_similarity = (
    calculate_max_similarity(df)
    .sort_values(by='max_similarity', ascending=False)
)
result_df_max_similarity.head()

Unnamed: 0,business_id,max_similarity
1101,_Q3XmBesceFqn9Dk-w6tlA,0.89431
1894,6CUkqed0rhHtFwQ2i7Otsw,0.883174
1196,5Md0YaxD5HiOoBmsnmIu7A,0.867147
1084,CpV8X8DHk2VqWo11CEde-w,0.854197
1119,nMDteyIwo3zLbqm45xBwFw,0.844302


In [13]:
# Inspect the reviews of the business with the highest similarity score to
# confirm by eye that no duplicated reviews remain.
# Lift pandas' row and column-width display limits so the texts are shown in full.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None
df222 = (
    df.loc[df['business_id'] == '_Q3XmBesceFqn9Dk-w6tlA']
    .sort_values(by='text', ascending=False)
)
df222.head()

Unnamed: 0,business_id,text
61763,_Q3XmBesceFqn9Dk-w6tlA,wow. cream cheese croissant? almond sticks? egg sandwiches. This is the best bakery ever.
61787,_Q3XmBesceFqn9Dk-w6tlA,"Years ago French family and friends recommended this spot for the brioche, back at their old location.\nMy favorite order is the sausage egg & cheese on a croissant and an almond croissant. Very buttery and greasy if that's your thing.\nRecommend calling ahead to see if they're open because sometimes they are randomly closed."
61796,_Q3XmBesceFqn9Dk-w6tlA,"What a hidden gem!!! I discovered Artisan Boulanger from someone's Instagram story and their pastries looked amazing so I had to try for myself. I got the sausage egg and cheese on a croissant and I think it might be the best croissant breakfast sandwich that I've ever had. The croissant was perfect, flakey and incredibly delicious without being super messy and getting everywhere. I know sometimes there's a line here because it's a small shop, but it is definitely worth the wait."
61781,_Q3XmBesceFqn9Dk-w6tlA,"Tied for second so far for the hunt for best croissants, chocolate croissants (and affordable !) in the city. Will come back for more !"
61804,_Q3XmBesceFqn9Dk-w6tlA,"This spot has one of the best baguettes in the city. There is always a line but never too long and always worth it. Their bread is so flavorful. It is soft in the middle but the crust has a nice chew, which is how I like it. Their croissants are amazing. I prefer the plain but the chocolate is good too. Their beignets are delicious. Cant wait to try their bahn mis. Cash only."


## 5. Export Json File

In [11]:
# Write the deduplicated reviews as JSON lines (one object per line).
# The handle deliberately is NOT called `f`: rebinding the global `f` (the
# SimHash bit width) to a closed file would break the deduplication cells if
# they are re-run after this export.
with open('/Users/celine/Desktop/5430nlp/group/deduplicated_reviews.json', 'w', encoding='utf-8') as out_file:
    for article in deduplicated_feed_data:
        json.dump(article, out_file)
        out_file.write('\n')
print(f'Number of non-duplicate articles: {len(deduplicated_feed_data)}')

Number of non-duplicate articles: 123861
