Rating and Sentiment Score Threshold

In [5]:
import pandas as pd
from textblob import TextBlob

# Read the CSV file into a DataFrame
df = pd.read_csv('btp.csv')

# Calculate the combined rating of all users
combined_rating = df['stars'].mean()

# Determine the threshold rating based on the combined rating
rating_threshold = combined_rating

# Calculate the sentiment score for each review
df['sentiment_score'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Calculate the combined sentiment score of all reviews
combined_sentiment_score = df['sentiment_score'].mean()

# Determine the threshold sentiment score based on the combined sentiment score
sentiment_threshold =  combined_sentiment_score

# Filter reviews based on both rating and sentiment score thresholds
filtered_reviews = df[(df['stars'] < rating_threshold) & (df['sentiment_score'] < sentiment_threshold)]

# Store the filtered reviews in a new CSV file
filtered_reviews.to_csv('filtered_reviews_combined.csv', index=False)

# Display the thresholds and the number of filtered reviews
num_filtered_reviews = len(filtered_reviews)
print(f"Rating Threshold: {rating_threshold}")
print(f"Sentiment Score Threshold: {sentiment_threshold}")
print(f"Number of Filtered Reviews: {num_filtered_reviews}")


Rating Threshold: 3.863291261666425
Sentiment Score Threshold: 0.2416720672520349
Number of Filtered Reviews: 24367


Useful Count Threshold Calculation

In [1]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('btp.csv')

# Calculate the average of the 'useful' column
average_useful = df['useful'].mean()

# Define the threshold
threshold = average_useful

# Count reviews above and below threshold
above_threshold_count = df[df['useful'] > threshold].shape[0]
below_threshold_count = df[df['useful'] <= threshold].shape[0]

# Print the results
print(f'Average useful counts of all reviews: {average_useful}')
print(f'Number of reviews above threshold: {above_threshold_count}')
print(f'Number of reviews below threshold: {below_threshold_count}')


Average useful counts of all reviews: 1134.9725808791527
Number of reviews above threshold: 17758
Number of reviews below threshold: 85637


Average Length Threshold

In [1]:
import pandas as pd

data = pd.read_csv('btp.csv')

data['text_length'] = data['text'].apply(len)

average_text_length = data['text_length'].mean()

threshold = average_text_length

above_threshold_count = (data['text_length'] > threshold).sum()

below_threshold_count = (data['text_length'] <= threshold).sum()

print("Average Text Length:", average_text_length)
print("Threshold Value:", threshold)
print("Number of User IDs with Text Length Above Threshold:", above_threshold_count)
print("Number of User IDs with Text Length Below or Equal to Threshold:", below_threshold_count)


Average Text Length: 718.2507084481841
Threshold Value: 718.2507084481841
Number of User IDs with Text Length Above Threshold: 38211
Number of User IDs with Text Length Below or Equal to Threshold: 65184


Sentiment Score for each review

In [4]:
import pandas as pd
from textblob import TextBlob

# Read the CSV file into a DataFrame
df = pd.read_csv('btp.csv')

# Define a function to calculate sentiment score using TextBlob
def calculate_sentiment(text):
    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity
    return sentiment_score

# Apply the function to calculate sentiment scores for each review text
df['sentiment_score'] = df['text'].apply(calculate_sentiment)

# Save the DataFrame with sentiment scores to a new CSV file
df.to_csv('CleanedDataset_with_sentiment_scores.csv', index=False)

# Display the DataFrame with sentiment scores
print(df[['text', 'sentiment_score']])


                                                     text  sentiment_score
0       Remarkable food with beach access for the whol...         0.405455
1       I loved everything about this lovely train sta...         0.598016
2       The Praline Connection makes a mean po' boy.  ...         0.024826
3       We walked over to Tennessee Brew Works, one of...         0.144998
4       Logan Circle, also known as Logan Square, is a...         0.140657
...                                                   ...              ...
103390  What an awesome place with a great vibe. Very ...         0.611429
103391  This is an old style café with a lot of menu c...         0.407963
103392  Great little coffee shop. Some nice pastries r...         0.358185
103393  This is a fun place for awesome drinks and awe...         0.451000
103394  Great little coffee shop. Some nice pastries r...         0.326935

[103395 rows x 2 columns]


User id, Similarity Score, Time Threshold

In [2]:
import pandas as pd
import spacy

# Load English tokenizer, tagger, parser, and word vectors
nlp = spacy.load("en_core_web_md")

# Read the CSV file
df = pd.read_csv('btp.csv')

# Calculate similarity score for each review text
def calculate_similarity(text1, text2):
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2)

# Calculate average similarity score per user
user_similarity = df.groupby('user_id').apply(lambda x: x['text'].apply(lambda y: calculate_similarity(x['text'], y)).mean())

# Calculate average time difference per user
user_time_difference = df.groupby('user_id')['time'].apply(lambda x: (x.max() - x.min()).total_seconds()).mean()

# Define a function to calculate the threshold
def calculate_threshold(user_similarity, user_time_difference):
    # You can define your custom logic here to calculate the threshold
    # For example, you can take the average similarity score
    # and user_time_difference, or use some other statistical measure.
    threshold = user_similarity.mean() * user_time_difference
    return threshold

# Calculate the threshold
threshold = calculate_threshold(user_similarity, user_time_difference)

# Count reviews above and below threshold
above_threshold_count = df[df['similarity_score'] > threshold].shape[0]
below_threshold_count = df[df['similarity_score'] <= threshold].shape[0]

# Print the results
print(f'Threshold: {threshold}')
print(f'Number of reviews above threshold: {above_threshold_count}')
print(f'Number of reviews below threshold: {below_threshold_count}')


ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'pandas.core.series.Series'>

In [3]:
print(df.columns)

Index(['_id', 'user_id', 'name', 'review_count', 'useful', 'review_id',
       'stars', 'useful_review', 'text', 'date', 'time'],
      dtype='object')


Rating with Timeframe

In [6]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('btp.csv')

# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Group the reviews by 'user_id' and 'date', and count the number of reviews for each user on each day

user_reviews_per_day = df.groupby(['user_id', df['date'].dt.date]).size()

# Calculate the combined average number of reviews per day
combined_avg_reviews_per_day = user_reviews_per_day.mean()

# Determine the threshold based on the combined average number of reviews per day
threshold_reviews_per_day = combined_avg_reviews_per_day

# Calculate the combined rating of all users

combined_rating = df['stars'].mean()

# Determine the rating threshold based on the combined rating

threshold_rating =  combined_rating  # Adjust the threshold as needed

# Filter the users based on whether the number of reviews on any single day exceeds the threshold
# and the rating is below the rating threshold

filtered_users = user_reviews_per_day[(user_reviews_per_day > threshold_reviews_per_day)].reset_index()

# Filter the original DataFrame based on the filtered users

filtered_reviews = df[df['user_id'].isin(filtered_users['user_id'])]

filtered_reviews = filtered_reviews[filtered_reviews['stars'] < threshold_rating]

# Store the filtered reviews in a new CSV file

filtered_reviews.to_csv('filtered_reviews_threshold.csv', index=False)

# Display the threshold and the number of filtered reviews

num_filtered_reviews = len(filtered_reviews)
print(f"Reviews per Day Threshold: {threshold_reviews_per_day}")
print(f"Rating Threshold: {threshold_rating}")
print(f"Number of Filtered Reviews: {num_filtered_reviews}")


Reviews per Day Threshold: 1.0805655999832786
Rating Threshold: 3.863291261666425
Number of Filtered Reviews: 12483


Rule 3 : Similarity Score

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('CleanedDataset.csv')

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define the threshold similarity value
threshold_similarity = 0.7  # You can adjust this value based on your requirements

# Function to classify reviews based on similarity
def classify_reviews(similarity_matrix, threshold):
    classified_reviews = []
    num_reviews = len(similarity_matrix)
    for i in range(num_reviews):
        for j in range(i+1, num_reviews):
            if similarity_matrix[i][j] >= threshold:
                classified_reviews.append((i, j))
    return classified_reviews

# Get classified reviews based on similarity
classified_reviews_indices = classify_reviews(cosine_sim, threshold_similarity)

# Convert classified reviews indices to DataFrame
classified_reviews_df = pd.DataFrame(classified_reviews_indices, columns=['review_index_1', 'review_index_2'])

# Merge with original DataFrame to get review details
classified_reviews_details = pd.merge(classified_reviews_df, df, left_on='review_index_1', right_index=True)
classified_reviews_details = pd.merge(classified_reviews_details, df, left_on='review_index_2', right_index=True, suffixes=('_1', '_2'))

# Save the classified reviews to a new CSV file
#classified_reviews_details.to_csv('classified_review.csv', index=False)

# Display the number of classified reviews
num_classified_reviews = len(classified_reviews_details)
print(f"Number of Classified Reviews: {num_classified_reviews}")

# Function to count reviews above and below the threshold
def count_reviews_above_below_threshold(similarity_matrix, threshold):
    above_threshold_count = 0
    below_threshold_count = 0
    num_reviews = len(similarity_matrix)
    for i in range(num_reviews):
        for j in range(i+1, num_reviews):
            if similarity_matrix[i][j] >= threshold:
                above_threshold_count += 1
            else:
                below_threshold_count += 1
    return above_threshold_count, below_threshold_count

# Calculate the number of reviews above and below the threshold
above_threshold_count, below_threshold_count = count_reviews_above_below_threshold(cosine_sim, threshold_similarity)

# Print the results
print("Number of reviews above threshold:", above_threshold_count)
print("Number of reviews below threshold:", below_threshold_count)


MemoryError: Unable to allocate 1.37 GiB for an array with shape (366551049,) and data type int32