rating

In [8]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('CleanedDataset.csv')

# Calculate the combined rating of all users based on 'stars' column
combined_rating = df['stars'].mean()

# Determine the threshold rating for below threshold and above threshold
threshold_rating_below = 0.4 * combined_rating
threshold_rating_above = 1.3 * combined_rating

# Filter reviews based on the condition: rating < threshold rating
reviews_below_threshold = df[df['stars'] < threshold_rating_below]

# Filter reviews based on the condition: rating > threshold rating
reviews_above_threshold = df[df['stars'] > threshold_rating_above]

# Concatenate both DataFrames
all_filtered_reviews = pd.concat([reviews_below_threshold, reviews_above_threshold])

# Store the filtered reviews in a single CSV file
all_filtered_reviews.to_csv('all_filtered_reviews.csv', index=False)

# Display the threshold rating and the number of reviews in the combined file
num_reviews_combined = len(all_filtered_reviews)
print(f"Threshold Rating for Below: {threshold_rating_below}")
print(f"Threshold Rating for Above: {threshold_rating_above}")
print(f"Number of Reviews in Combined File: {num_reviews_combined}")


Threshold Rating for Below: 1.5346785546691695
Threshold Rating for Above: 4.987705302674801
Number of Reviews in Combined File: 6875


In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('CleanedDataset.csv')

# Calculate the combined rating of all users based on 'stars' column
combined_rating = df['stars'].mean()

# Determine the threshold rating for below threshold and above threshold
threshold_rating_below = 0.4 * combined_rating
threshold_rating_above = 1.3 * combined_rating

# Filter reviews based on the condition: rating < threshold rating and review length < 10
reviews_below_threshold = df[(df['stars'] < threshold_rating_below) & (df['text'].str.split().str.len() < 10)]

# Filter reviews based on the condition: rating > threshold rating and review length < 10
reviews_above_threshold = df[(df['stars'] > threshold_rating_above) & (df['text'].str.split().str.len() < 10)]

# Concatenate both DataFrames
all_filtered_reviews = pd.concat([reviews_below_threshold, reviews_above_threshold])

# Store the filtered reviews in a single CSV file
all_filtered_reviews.to_csv('all_filtered_reviews.csv', index=False)

# Display the threshold rating and the number of reviews in the combined file
num_reviews_combined = len(all_filtered_reviews)
print(f"Threshold Rating for Below: {threshold_rating_below}")
print(f"Threshold Rating for Above: {threshold_rating_above}")
print(f"Number of Reviews in Combined File: {num_reviews_combined}")


Threshold Rating for Below: 1.5346785546691695
Threshold Rating for Above: 4.987705302674801
Number of Reviews in Combined File: 30


timeframe

In [12]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('CleanedDataset.csv')

# Filter the reviews within the specified time frame (00:00 to 6:00)
reviews_within_timeframe = df[(df['time'] >= '00:00:00') & (df['time'] < '06:00:00')].copy()

# Convert the 'time' column to datetime format
reviews_within_timeframe['time'] = pd.to_datetime(reviews_within_timeframe['time'], format='%H:%M:%S')
 
#Group the reviews by 'user_id' and count the number of reviews for each user within the time frame

user_reviews_within_timeframe = reviews_within_timeframe.groupby('user_id').size()

# Calculate the combined number of reviews within the time frame

combined_reviews_within_timeframe = user_reviews_within_timeframe.mean()

# Calculate the average time difference between consecutive reviews for each user within the time frame
reviews_within_timeframe = reviews_within_timeframe.sort_values(by=['user_id', 'time'])
reviews_within_timeframe['time_diff'] = reviews_within_timeframe.groupby('user_id')['time'].diff().dt.total_seconds().fillna(0)
user_avg_time_diff = reviews_within_timeframe.groupby('user_id')['time_diff'].mean()
combined_avg_time_diff = user_avg_time_diff.mean()

# Determine the threshold based on the combined average number of reviews and average time difference within the time frame

weight_reviews = 0.5

weight_time_diff = 0.5

threshold_reviews_within_timeframe = (weight_reviews * combined_reviews_within_timeframe + weight_time_diff * combined_avg_time_diff)

# Classify the reviews based on whether the user's number of reviews within the time frame exceeds the threshold

classified_reviews = df[df['user_id'].map(user_reviews_within_timeframe) > threshold_reviews_within_timeframe]

# Store the classified reviews in a new CSV file

classified_reviews.to_csv('classified_reviews.csv', index=False)

# Display the threshold and the number of classified reviews

num_classified_reviews = len(classified_reviews)

print(f"Threshold Reviews within 00:00 to 6:00 Time Frame: {threshold_reviews_within_timeframe}")

print(f"Number of Classified Reviews: {num_classified_reviews}")


Threshold Reviews within 00:00 to 6:00 Time Frame: 450.2482226060666
Number of Classified Reviews: 0


similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('CleanedDataset.csv')

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define the threshold similarity value
threshold_similarity = 0.7  # You can adjust this value based on your requirements

# Function to classify reviews based on similarity
def classify_reviews(similarity_matrix, threshold):
    classified_reviews = []
    num_reviews = len(similarity_matrix)
    for i in range(num_reviews):
        for j in range(i+1, num_reviews):
            if similarity_matrix[i][j] >= threshold:
                classified_reviews.append((i, j))
    return classified_reviews

# Get classified reviews based on similarity
classified_reviews_indices = classify_reviews(cosine_sim, threshold_similarity)

# Convert classified reviews indices to DataFrame
classified_reviews_df = pd.DataFrame(classified_reviews_indices, columns=['review_index_1', 'review_index_2'])

# Merge with original DataFrame to get review details
classified_reviews_details = pd.merge(classified_reviews_df, df, left_on='review_index_1', right_index=True)
classified_reviews_details = pd.merge(classified_reviews_details, df, left_on='review_index_2', right_index=True, suffixes=('_1', '_2'))

# Save the classified reviews to a new CSV file
classified_reviews_details.to_csv('classified_review.csv', index=False)

# Display the number of classified reviews
num_classified_reviews = len(classified_reviews_details)
print(f"Number of Classified Reviews: {num_classified_reviews}")


In [2]:
pip install textblob


Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
    --------------------------------------- 10.2/626.3 kB ? eta -:--:--
   -- ------------------------------------ 41.0/626.3 kB 393.8 kB/s eta 0:00:02
   ----- --------------------------------- 92.2/626.3 kB 655.4 kB/s eta 0:00:01
   -------- ----------------------------- 143.4/626.3 kB 853.3 kB/s eta 0:00:01
   --------------- ------------------------ 245.8/626.3 kB 1.1 MB/s eta 0:00:01
   ------------------ --------------------- 286.7/626.3 kB 1.0 MB/s eta 0:00:01
   ----------------------- ---------------- 368.6/626.3 kB 1.1 MB/s eta 0:00:01
   ------------------------------ --------- 471.0/626.3 kB 1.3 MB/s eta 0:00:01
   ------------------------------ --------- 471.0/626.3 kB 1.3 MB/s eta 0:0

In [3]:
from textblob import TextBlob
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('CleanedDataset.csv')

# Define a function to calculate sentiment score using TextBlob
def calculate_sentiment(text):
    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity
    return sentiment_score

# Apply the function to calculate sentiment scores for each review text
df['sentiment_score'] = df['text'].apply(calculate_sentiment)

# Display the DataFrame with sentiment scores
print(df[['text', 'sentiment_score']])


                                                    text  sentiment_score
0      Remarkable food with beach access for the whol...         0.405455
1      I loved everything about this lovely train sta...         0.598016
2      The Praline Connection makes a mean po' boy.  ...         0.024826
3      We walked over to Tennessee Brew Works, one of...         0.144998
4      Logan Circle, also known as Logan Square, is a...         0.140657
...                                                  ...              ...
19174  Worst Burger King on Earth. There will be some...        -0.260000
19175  Reliable classics and good take-out service.  ...         0.350000
19176  I ordered the whiting, candied yams and fries....         0.268056
19177  Fast, friendly, delicious, and affordable! Bes...         0.643750
19178  Suzie Hot Sauce is a great place -- if you lik...         0.084509

[19179 rows x 2 columns]
