Rating and Sentiment Score Threshold

In [6]:
import pandas as pd
from textblob import TextBlob

# Load the review dataset.
df = pd.read_csv('final_dataset.csv')

# Rating threshold: the mean star rating over all reviews.
combined_rating = df['stars'].mean()
rating_threshold = combined_rating

# Sentiment polarity of every review text, computed with TextBlob.
df['sentiment_score'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Sentiment threshold: the mean polarity over all reviews.
combined_sentiment_score = df['sentiment_score'].mean()
sentiment_threshold = combined_sentiment_score

# Rule 1 label: 0 only when a review is below BOTH thresholds, otherwise 1.
# Evaluated column-wise instead of with a row-by-row `df.apply(axis=1)`.
# A missing value compares as False, so such rows are labelled 1, which is
# also what the row-wise expression produced for them.
below_both = (df['stars'] < rating_threshold) & (df['sentiment_score'] < sentiment_threshold)
df['rule1_threshold'] = (~below_both).astype('int64')

# Write the frame, including the new columns, to threshold.csv.
df.to_csv('threshold.csv', index=False)

# No rows are removed in this cell: despite its name, `num_filtered_reviews`
# is simply the total number of reviews (the name is kept unchanged).
num_filtered_reviews = len(df)
rule_counts = df['rule1_threshold'].value_counts()

# Report the label distribution, both thresholds and the review count.
print("Count of reviews with rule1_threshold equal to 0 and 1:")
print(rule_counts)
print(f"Rating Threshold: {rating_threshold}")
print(f"Sentiment Score Threshold: {sentiment_threshold}")
print(f"Number of Reviews: {num_filtered_reviews}")


Count of reviews with rule1_threshold equal to 0 and 1:
rule1_threshold
1    41102
0    12358
Name: count, dtype: int64
Rating Threshold: 3.8563973063973065
Sentiment Score Threshold: 0.23259107181563363
Number of Reviews: 53460


Useful Count Threshold Calculation

In [9]:
import pandas as pd

# Load the reviews from threshold.csv
threshold_df = pd.read_csv('threshold.csv')

# The cut-off is the mean of the 'useful' column
average_useful = threshold_df['useful'].mean()
threshold = average_useful

# Rule 5 label: 1 when the useful count is strictly above the mean, else 0
is_above = threshold_df['useful'] > threshold
threshold_df['rule5_threshold'] = is_above.astype('int64')

# Write the frame, now carrying rule5_threshold, back to threshold.csv
threshold_df.to_csv('threshold.csv', index=False)

# Tally the reviews on each side of the cut-off
above_threshold_count = int(is_above.sum())
below_threshold_count = int((threshold_df['useful'] <= threshold).sum())

# Report the mean and both tallies
print(f'Average useful counts of all reviews: {average_useful}')
print(f'Number of reviews above threshold: {above_threshold_count}')
print(f'Number of reviews below threshold: {below_threshold_count}')


Average useful counts of all reviews: 2.151889263000374
Number of reviews above threshold: 15213
Number of reviews below threshold: 38247


Average Length Threshold

In [10]:
import pandas as pd

# Load the reviews from threshold.csv
threshold_df = pd.read_csv('threshold.csv')

# Character length of every review text, and the mean of those lengths
text_lengths = threshold_df['text'].map(len)
average_text_length = text_lengths.mean()

# The cut-off is the mean length
threshold = average_text_length

# Rule 2 label: 1 when the text is strictly longer than the mean, else 0
is_long = text_lengths > threshold
threshold_df['rule2_threshold'] = is_long.astype('int64')

# Write the frame, now carrying rule2_threshold, back to threshold.csv
threshold_df.to_csv('threshold.csv', index=False)

# Report the cut-off and how many reviews fall on each side of it
print("Average Text Length:", average_text_length)
print("Threshold Value:", threshold)
print("Number of Reviews Above Threshold:", is_long.sum())
print("Number of Reviews Below or Equal to Threshold:", (~is_long).sum())


Average Text Length: 773.362532734755
Threshold Value: 773.362532734755
Number of Reviews Above Threshold: 20194
Number of Reviews Below or Equal to Threshold: 33266


Sentiment Score for each review

In [12]:
import pandas as pd
from textblob import TextBlob

# Load the review dataset
df = pd.read_csv('final_dataset.csv')


def calculate_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity


# Score every review text
df['sentiment_score'] = df['text'].map(calculate_sentiment)

# Write the scored frame to its own CSV file
df.to_csv('CleanedDataset_with_sentiment_scores.csv', index=False)

# Show each review text next to its sentiment score
print(df.loc[:, ['text', 'sentiment_score']])


                                                    text  sentiment_score
0      Remarkable food with beach access for the whol...         0.405455
1      I loved everything about this lovely train sta...         0.598016
2      The Praline Connection makes a mean po' boy.  ...         0.024826
3      We walked over to Tennessee Brew Works, one of...         0.144998
4      Logan Circle, also known as Logan Square, is a...         0.140657
...                                                  ...              ...
53455  I thought this place was great!! We sat outsid...         0.281754
53456  Really cute set up and the food is super fresh...         0.338333
53457  This was my first time staying at a JW Marriot...         0.177450
53458  Ended up here twice on our recent trip to Phil...         0.203958
53459  If you are looking for greek food this is the ...         0.297286

[53460 rows x 2 columns]


In [2]:
# Download the spaCy model `en_core_web_md`, the model name that a later cell
# in this notebook passes to spacy.load().
# NOTE(review): `python` is resolved by the shell and may not be the kernel's
# interpreter -- confirm the model is installed into the kernel's environment.
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     --------------------------------------- 0.0/42.8 MB 388.9 kB/s eta 0:01:50
     --------------------------------------- 0.1/42.8 MB 655.4 kB/s eta 0:01:06
     ---------------------------------------- 0.2/42.8 MB 1.4 MB/s eta 0:00:32
     ---------------------------------------- 0.3/42.8 MB 1.4 MB/s eta 0:00:31
     ---------------------------------------- 0.3/42.8 MB 1.4 MB/s eta 0:00:30
     ---------------------------------------- 0.3/42.8 MB 1.4 MB/s eta 0:00:30
     ---------------------------------------- 0.3/42.8 MB 1.4 MB/s eta 0:00:30
     ---------------------------------------- 0.3/42.8 MB 1.4 MB/s eta 0:00:30
     ---------------------------------------- 

User id, Similarity Score, Time Threshold

In [3]:
# Print the column labels of `df`. This cell does not create `df` itself, so it
# shows whichever DataFrame is bound to that name in the kernel when it runs.
print(df.columns)

Index(['_id', 'user_id', 'name', 'review_count', 'useful', 'review_id',
       'stars', 'useful_review', 'text', 'date', 'time'],
      dtype='object')


Rating with Timeframe

In [11]:
import pandas as pd

# Load the reviews from threshold.csv
df = pd.read_csv('threshold.csv')

# Parse the 'date' column into datetimes
df['date'] = pd.to_datetime(df['date'])

# Number of reviews each user wrote on each calendar day
review_day = df['date'].dt.date
user_reviews_per_day = df.groupby(['user_id', review_day]).size()

# Reviews-per-day threshold: the mean of those per-user, per-day counts
combined_avg_reviews_per_day = user_reviews_per_day.mean()
threshold_reviews_per_day = combined_avg_reviews_per_day

# Rating threshold: the mean star rating over all reviews
combined_rating = df['stars'].mean()
threshold_rating = combined_rating  # Adjust the threshold as needed

# (user, day) combinations whose review count exceeds the per-day threshold
exceeds_daily_threshold = user_reviews_per_day > threshold_reviews_per_day
filtered_users = user_reviews_per_day[exceeds_daily_threshold].reset_index()

# Keep reviews written by any of those users whose rating is below the rating threshold
from_flagged_user = df['user_id'].isin(filtered_users['user_id'])
below_rating_threshold = df['stars'] < threshold_rating
filtered_reviews = df[from_flagged_user & below_rating_threshold]

# Write the selected reviews to their own CSV file
filtered_reviews.to_csv('filtered_reviews_threshold.csv', index=False)

# Report both thresholds and how many reviews were selected
num_filtered_reviews = len(filtered_reviews)
print(f"Reviews per Day Threshold: {threshold_reviews_per_day}")
print(f"Rating Threshold: {threshold_rating}")
print(f"Number of Filtered Reviews: {num_filtered_reviews}")


Reviews per Day Threshold: 1.0805655999832786
Rating Threshold: 3.863291261666425
Number of Filtered Reviews: 12483


Rule 3: Similarity Score

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the reviews to compare.
df = pd.read_csv('btp.csv')

# Turn every review text into a TF-IDF vector.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Pairwise cosine similarity between all reviews (an n x n array).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Minimum similarity for two reviews to count as similar.
threshold_similarity = 0.7  # You can adjust this value based on your requirements

# Every pair (i, j) with i < j whose similarity reaches the threshold.
# The comparison is done on the whole array at once; nonzero() reports the
# hits in row-major order, so the pairs come out sorted by (i, j) just as a
# nested loop over i and j would produce them.
row_positions, col_positions = (cosine_sim >= threshold_similarity).nonzero()
similar_reviews_indices = [
    (int(i), int(j)) for i, j in zip(row_positions, col_positions) if i < j
]

# A review that is similar to several others appears in several pairs. Keep
# each row position once (in first-appearance order) so that the saved file
# has no duplicate rows and the printed count is a count of distinct reviews.
similar_review_positions = list(dict.fromkeys(
    index for pair in similar_reviews_indices for index in pair
))
similar_reviews_df = df.iloc[similar_review_positions]

# Write the similar reviews to their own CSV file.
similar_reviews_df.to_csv('similar_reviews.csv', index=False)

# Report how many distinct reviews have at least one similar counterpart.
num_similar_reviews = len(similar_reviews_df)
print(f"Number of Similar Reviews: {num_similar_reviews}")


Number of Similar Reviews: 52


In [28]:
import pandas as pd
import spacy
from datetime import datetime

# Load the spaCy pipeline used to parse and compare the review texts.
nlp = spacy.load("en_core_web_md")

# Load the reviews.
df = pd.read_csv('CleanedDataset.csv')

# Lower-case the review texts.
df['text'] = df['text'].str.lower()

# Parse every review; each entry of 'text_vector' is the object yielded by
# nlp.pipe for that text.
df['text_vector'] = list(nlp.pipe(df['text']))


def calculate_similarity(text1, text2):
    """Return ``text1.similarity(text2)`` for two parsed reviews."""
    return text1.similarity(text2)


def process_batch(batch):
    """Score every review in ``batch`` against the first review of the batch.

    The scores are written to a ``similarity_score`` column on a copy of
    ``batch``, so the frame the batch was sliced from is never modified.
    An empty batch gets an empty ``similarity_score`` column instead of
    failing on the lookup of its first row.

    Returns:
        The copy of ``batch`` with the ``similarity_score`` column set.
    """
    batch = batch.copy()
    if batch.empty:
        batch['similarity_score'] = pd.Series(dtype='float64')
        return batch
    text1 = batch.iloc[0]['text_vector']
    batch['similarity_score'] = batch['text_vector'].apply(lambda text2: calculate_similarity(text1, text2))
    return batch


# Split the frame into consecutive batches of at most `batch_size` rows.
batch_size = 1000
# Ceiling division: an exact multiple of batch_size (or an empty frame) must
# not produce an extra, empty trailing batch.
num_batches = -(-len(df) // batch_size)

# Score each batch and collect its scores; they are assigned to `df` in one
# step below (aligned on the index) rather than written back slice by slice.
score_parts = []
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df))
    score_parts.append(process_batch(df.iloc[start_idx:end_idx])['similarity_score'])

if score_parts:
    df['similarity_score'] = pd.concat(score_parts)
else:
    df['similarity_score'] = pd.Series(dtype='float64')

# Mean similarity score per user.
user_similarity = df.groupby('user_id')['similarity_score'].mean()

# Span between each user's earliest and latest 'time' value, in seconds,
# averaged over all users.
df['time'] = pd.to_datetime(df['time'])
user_time_difference = df.groupby('user_id')['time'].apply(lambda x: (x.max() - x.min()).total_seconds()).mean()

# NOTE(review): this multiplies a mean similarity score by a mean time span in
# seconds, and the product is then compared with raw similarity scores below.
# The recorded run printed a threshold of about 22150 with zero reviews above
# it -- confirm the intended formula.
threshold = user_similarity.mean() * user_time_difference

# Count the reviews on each side of the threshold.
above_threshold_count = (df['similarity_score'] > threshold).sum()
below_threshold_count = (df['similarity_score'] <= threshold).sum()

# Report the threshold and both counts.
print(f'Threshold: {threshold}')

print(f'Number of reviews above threshold: {above_threshold_count}')
print(f'Number of reviews below threshold: {below_threshold_count}')


Threshold: 22150.528374546026
Number of reviews above threshold: 0
Number of reviews below threshold: 1000


  df['time'] = pd.to_datetime(df['time'])
