In [2]:
import pandas as pd
import numpy as np
from textstat import flesch_reading_ease
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics.pairwise import cosine_similarity

Calculate Readability Score

In [3]:
def add_readability_score(df):
    """Add a ``readability_fre`` column to ``df``.

    Every value of ``df['text']`` is passed to ``flesch_reading_ease`` and the
    result is stored in the new column. The frame is modified in place and
    nothing is returned.
    """
    def _score(review_text):
        return flesch_reading_ease(review_text)

    scores = df['text'].apply(_score)
    df['readability_fre'] = scores

Calculate Rating Category

In [4]:
def add_rating_category(df, threshold):
    """Add a ``rating_category`` column derived from ``df['stars']``.

    A rating strictly greater than ``threshold`` is labelled ``'positive'``;
    every other rating is labelled ``'negative'``. The frame is modified in
    place and nothing is returned.
    """
    labels = df['stars'].apply(
        lambda stars: 'positive' if stars > threshold else 'negative')
    df['rating_category'] = labels

POS Tagging

In [5]:
def add_pos_tags(df):
    """Add per-review part-of-speech counts to ``df``.

    Each value of ``df['text']`` is parsed once with the module-level spaCy
    pipeline ``nlp``. Four columns are then written in place:
    ``num_nouns``, ``num_verbs``, ``num_adjectives`` and ``num_adverbs``.
    A tag that does not occur in a review is counted as 0.
    Nothing is returned.
    """
    def pos_counts(text):
        # Doc.count_by yields a dict of {POS attribute id: frequency}.
        return nlp(text).count_by(spacy.attrs.POS)

    # One dict of POS frequencies per row.
    poscounts = df['text'].apply(pos_counts)

    pos_columns = {
        'num_nouns': spacy.parts_of_speech.NOUN,
        'num_verbs': spacy.parts_of_speech.VERB,
        'num_adjectives': spacy.parts_of_speech.ADJ,
        'num_adverbs': spacy.parts_of_speech.ADV,
    }
    for column, pos_id in pos_columns.items():
        # Look the tag up in each row's own dict. The previous version called
        # .get() on the whole Series, which is an index-label lookup and gave
        # every row the same value.
        df[column] = [counts.get(pos_id, 0) for counts in poscounts]


Maximum Cosine Similarity with another review

In [6]:
def add_max_similarity(df):
    """Add a ``max_similarity`` column to ``df``.

    For each row, the value is the highest TF-IDF cosine similarity between
    that row's ``text`` and the ``text`` of any *other* row. The frame is
    modified in place and nothing is returned.

    When ``df`` has fewer than two rows there is no other review to compare
    against, so the column is filled with NaN (the earlier implementation
    raised ``ValueError`` from ``max()`` on an empty list in that case).

    Note: the full pairwise similarity matrix (rows x rows) is materialised,
    so memory use grows quadratically with the number of reviews.
    """
    if len(df) < 2:
        df['max_similarity'] = np.nan
        return

    tfidf_matrix = TfidfVectorizer().fit_transform(df['text'])
    similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Exclude each review's similarity with itself before taking the row max.
    np.fill_diagonal(similarity, -np.inf)
    df['max_similarity'] = similarity.max(axis=1)


Average Word Length

In [7]:
def add_avg_word_length(df):
    """Add an ``avg_word_length`` column to ``df``.

    Each ``text`` value is split on whitespace; the column holds the mean
    length of the resulting words, or 0 when the text contains no words.
    The frame is modified in place and nothing is returned.
    """
    def mean_token_length(text):
        tokens = text.split()
        if len(tokens) > 0:
            return sum(map(len, tokens)) / len(tokens)
        return 0

    df['avg_word_length'] = df['text'].apply(mean_token_length)


In [8]:
def preprocess_features(df, rating_threshold=3.0):
    """Add all engineered feature columns to ``df`` in place.

    Runs, in order: ``add_readability_score``, ``add_rating_category``,
    ``add_pos_tags`` and ``add_avg_word_length``. Nothing is returned.

    Parameters
    ----------
    df : DataFrame
        Passed unchanged to each of the ``add_*`` helpers.
    rating_threshold : float, default 3.0
        Forwarded to ``add_rating_category`` as ``threshold``. The default
        keeps the value that was previously hard-coded here.
    """
    add_readability_score(df)
    add_rating_category(df, threshold=rating_threshold)
    add_pos_tags(df)
    add_avg_word_length(df)

Text Preprocessing

In [9]:
from nltk.stem import PorterStemmer
import spacy
# Shared spaCy pipeline. add_pos_tags (defined in an earlier cell) and the text
# helpers in the next cell read this module-level `nlp`, so this cell has to be
# run before any of them is called.
nlp = spacy.load('en_core_web_sm')
# Stemmer instance read by stem_text.
stemmer = PorterStemmer()
# Default stop-word collection of the loaded pipeline; read by remove_stopwords.
stop_words = nlp.Defaults.stop_words

In [9]:
def separate_punc(doc_text):
    """Tokenise ``doc_text`` with ``nlp`` and return the kept tokens lower-cased.

    A token is dropped when its text occurs anywhere inside the exclusion
    string below. This is a substring test, not a per-character set lookup,
    so a token is only dropped if its exact text appears contiguously in that
    string. The comparison uses the token's original (not lower-cased) text.
    """
    excluded = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(doc_text):
        if token.text in excluded:
            continue
        kept.append(token.text.lower())
    return kept


def remove_stopwords(text):
    """Lower-case ``text``, split it on whitespace and drop stop words.

    Words found in the module-level ``stop_words`` collection are removed;
    the remaining words are returned joined by single spaces.
    """
    kept = []
    for word in text.lower().split():
        if word not in stop_words:
            kept.append(word)
    return ' '.join(kept)

def stem_text(text):
    """Stem every whitespace-separated word of ``text``.

    Each word is passed to the module-level ``stemmer``; the stems are
    returned joined by single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)


def lemmatize_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    ``text`` is parsed with the module-level spaCy pipeline ``nlp`` and the
    ``lemma_`` of every token is collected in order.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

def preprocess_text(text):
    """Normalise ``text``: lower-case, drop stop words, stem, then lemmatise.

    The steps are delegated to ``remove_stopwords``, ``stem_text`` and
    ``lemmatize_text``, which share the module-level ``nlp``, ``stemmer`` and
    ``stop_words`` objects. The earlier version reloaded the spaCy model and
    rebuilt the stemmer on every call, which is very costly when the function
    is applied to each row of a DataFrame. The returned string is the same.

    Returns the lemmas of the stemmed, stop-word-free text joined by single
    spaces.
    """
    return lemmatize_text(stem_text(remove_stopwords(text)))


In [15]:
# Load the cleaned review dataset.
df = pd.read_csv('CleanedDataset.csv')
# NOTE(review): text normalisation is not applied to df['text'] here. An earlier
# draft of this line called preprocess_text(<sample review>) inside .apply(),
# which would hand .apply() the resulting string rather than the function.
# To enable normalisation, pass the function itself: .apply(preprocess_text).
# Add the engineered feature columns to df in place.
preprocess_features(df)
# Show the resulting column names and the first rows.
print(df.columns)
print(df.head())

Index(['_id', 'user_id', 'name', 'review_count', 'useful', 'average_stars',
       'review_id', 'stars', 'useful_review', 'text', 'date', 'time',
       'readability_fre', 'rating_category', 'num_nouns', 'num_verbs',
       'num_adjectives', 'num_adverbs', 'avg_word_length'],
      dtype='object')
                        _id                 user_id    name  review_count  \
0  65ddcc66fa4711915dcb2fbd  qVc8ODYU5SZjKXVBgXdI7w  Walker           585   
1  65ddcc66fa4711915dcb2fbe  qVc8ODYU5SZjKXVBgXdI7w  Walker           585   
2  65ddcc66fa4711915dcb2fbf  j14WgRoU_-2ZE1aw1dXrJg  Daniel          4333   
3  65ddcc66fa4711915dcb2fc0  j14WgRoU_-2ZE1aw1dXrJg  Daniel          4333   
4  65ddcc66fa4711915dcb2fc1  j14WgRoU_-2ZE1aw1dXrJg  Daniel          4333   

   useful  average_stars               review_id  stars  useful_review  \
0    7217           3.91  Egy2a4qZeXGr2aY6KMxxbg      5              0   
1    7217           3.91  01vN0q6aMlFio6HAjLZz7Q      5             30   
2   43091       

In [1]:
import pandas as pd
# Load btp.csv and list its columns.
# NOTE(review): this rebinds `df`, which another cell binds to
# CleanedDataset.csv, so anything using `df` afterwards depends on run order.
df = pd.read_csv('btp.csv')
print(df.columns)

Index(['_id', 'user_id', 'name', 'review_count', 'useful', 'review_id',
       'stars', 'useful_review', 'text', 'date', 'time'],
      dtype='object')

In [4]:
import pandas as pd

# Read the CSV file
data = pd.read_csv('btp.csv')

# Length of the text in each row (one value per row, not per user)
data['text_length'] = data['text'].apply(len)

# Mean text length over all rows
average_text_length = data['text_length'].mean()

# The mean itself is used as the cut-off between "long" and "short" texts
threshold = average_text_length

# Number of rows whose text is strictly longer than the cut-off
above_threshold_count = (data['text_length'] > threshold).sum()

# Number of rows whose text is at most as long as the cut-off
below_threshold_count = (data['text_length'] <= threshold).sum()

# NOTE(review): the two counts are numbers of rows. The labels below say
# "User IDs"; presumably each row is one review, so a user with several
# reviews is counted several times. Confirm and reword the labels.
print("Average Text Length:", average_text_length)
print("Threshold Value:", threshold)
print("Number of User IDs with Text Length Above Threshold:", above_threshold_count)
print("Number of User IDs with Text Length Below or Equal to Threshold:", below_threshold_count)


Average Text Length: 718.2507084481841
Threshold Value: 718.2507084481841
Number of User IDs with Text Length Above Threshold: 38211
Number of User IDs with Text Length Below or Equal to Threshold: 65184
