In [68]:
import spacy
import pandas as pd
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease
from sklearn.preprocessing import LabelEncoder

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
df = pd.read_csv("Datasets/Labelled Yelp Dataset.csv")

In [4]:
df.columns

Index(['User_id', 'Product_id', 'Rating', 'Date', 'Review', 'Label'], dtype='object')

In [5]:
for col in df.columns:
    df[col.upper()] = df[col]
    del df[col]

In [6]:
df['REVIEW_TEXT'] = df['REVIEW']
del df['REVIEW']

In [7]:
df.columns

Index(['USER_ID', 'PRODUCT_ID', 'RATING', 'DATE', 'LABEL', 'REVIEW_TEXT'], dtype='object')

In [8]:
df.to_csv("Datasets/Labelled Yelp Dataset.csv")

In [7]:
df.head()

Unnamed: 0,USER_ID,PRODUCT_ID,RATING,DATE,REVIEW,LABEL
0,923,0,3,12/8/2014,The food at snack is a selection of popular Gr...,-1
1,924,0,3,5/16/2013,This little place in Soho is wonderful. I had ...,-1
2,925,0,4,7/1/2013,ordered lunch for 15 from Snack last Friday. Â...,-1
3,926,0,4,7/28/2011,This is a beautiful quaint little restaurant o...,-1
4,927,0,4,11/1/2010,Snack is great place for a Â casual sit down l...,-1


# df

In [8]:
df1 = pd.read_csv("Datasets/Labelled Yelp Dataset.csv")

In [9]:
df1.head()

Unnamed: 0,User_id,Product_id,Rating,Date,Review,Label
0,923,0,3,12/8/2014,The food at snack is a selection of popular Gr...,-1
1,924,0,3,5/16/2013,This little place in Soho is wonderful. I had ...,-1
2,925,0,4,7/1/2013,ordered lunch for 15 from Snack last Friday. Â...,-1
3,926,0,4,7/28/2011,This is a beautiful quaint little restaurant o...,-1
4,927,0,4,11/1/2010,Snack is great place for a Â casual sit down l...,-1


In [10]:
df['PRODUCT_ID'].value_counts()

PRODUCT_ID
247    7378
555    6632
542    4716
465    3938
468    3143
       ... 
159      16
390      16
747      15
217      13
94       11
Name: count, Length: 923, dtype: int64

In [9]:
#AVERAGE RATING OF PRODUCT
def add_average_rating(df):
    average_ratings = df.groupby('PRODUCT_ID')['RATING'].mean()
    df['AVERAGE_RATING'] = df['PRODUCT_ID'].map(average_ratings)
add_average_rating(df)

In [10]:
def add_rating_deviation(df):
    df['RATING_DEVIATION'] = abs(df['RATING'] - df['AVERAGE_RATING'])
add_rating_deviation(df)

In [11]:
def add_total_reviews(df):
    num_reviews = df.groupby('PRODUCT_ID').size()
    df['TOTAL_PRODUCT_REVIEWS'] = df['PRODUCT_ID'].map(num_reviews)
add_total_reviews(df)

In [12]:
def add_review_length(df):
    df['REVIEW_LENGTH'] = df['REVIEW_TEXT'].apply(lambda d: len(d))
add_review_length(df)

In [None]:
def add_vader_sentiment_score(df):
    sid = SentimentIntensityAnalyzer()

    df['SENTIMENT_SCORE'] = df['REVIEW_TEXT'].apply(
        lambda d: sid.polarity_scores(d)['compound'])
add_vader_sentiment_score(df)

In [None]:
def add_readability_score(df):
    df['READABILITY_FRE'] = df['REVIEW_TEXT'].apply(
        lambda d: flesch_reading_ease(d))
add_readability_score(df)

In [19]:
def add_pos_tags(df):
    def count_pos(Pos_counts, pos_type):
        pos_count = Pos_counts.get(pos_type, 0)
        return pos_count

    def pos_counts(text):
        doc = nlp(text)
        Pos_counts = doc.count_by(spacy.attrs.POS)
        return Pos_counts

    poscounts =  df['REVIEW_TEXT'].apply(pos_counts)
    df['NUM_NOUNS'] = df['REVIEW_TEXT'].apply(
        lambda text: count_pos(poscounts, spacy.parts_of_speech.NOUN))
    df['NUM_VERBS'] = df['REVIEW_TEXT'].apply(
        lambda text: count_pos(poscounts, spacy.parts_of_speech.VERB))
    df['NUM_ADJECTIVES'] = df['REVIEW_TEXT'].apply(
        lambda text: count_pos(poscounts, spacy.parts_of_speech.ADJ))
    df['NUM_ADVERBS'] = df['REVIEW_TEXT'].apply(
        lambda text: count_pos(poscounts, spacy.parts_of_speech.ADV))
add_pos_tags(df)

KeyboardInterrupt: 

In [13]:
#ADD TOTAL REVIEWS BY USER
def add_total_user_reviews(df):
    df['TOTAL_USER_REVIEWS'] = df.groupby('USER_ID')['REVIEW_ID'].transform('count')
add_total_user_reviews(df)

KeyError: 'Column not found: REVIEW_ID'

In [14]:
#ADD RATING CATEGORY
def add_rating_category(df, threshold):

    def assign_rating_category(rating):
        if rating > threshold:
            return 1
        else:
            return 0

    df['RATING_CATEGORY'] = df['RATING'].apply(assign_rating_category)
add_rating_category(df, 2.5)

In [None]:
def add_coherence_column(df):
    df['COHERENT'] = 1 if df['SENTIMENT_CATEGORY'] == df['RATING_CATEGORY'] else 0
add_coherence_column(df)

In [15]:
#BINARY: USER GIVES ONLY ONE KIND OF REVIEW
df['SINGLE_RATING_CATEGORY'] = df.groupby('USER_ID')['RATING_CATEGORY'].transform(lambda x: 1 if x.nunique() == 1 else 0)

In [16]:
df.columns

Index(['USER_ID', 'PRODUCT_ID', 'RATING', 'DATE', 'LABEL', 'REVIEW_TEXT',
       'AVERAGE_RATING', 'RATING_DEVIATION', 'TOTAL_PRODUCT_REVIEWS',
       'REVIEW_LENGTH', 'RATING_CATEGORY', 'SINGLE_RATING_CATEGORY'],
      dtype='object')

In [18]:
df.to_csv("Datasets/Labelled Yelp Dataset.csv")

In [26]:
#Posted in same day as many (3) other reviews on product
df['DATE'][0]

#CONVERT DATE TO DATETIME OBJECTS
df['DATE'] = pd.to_datetime(df['DATE'], infer_datetime_format=True)

#COUNT OF REVIEWS WITH SAME PRODUCT_ID AND SAME DATE
df['REVIEW_COUNT_DATE'] = df.groupby(['PRODUCT_ID', 'DATE'])['PRODUCT_ID'].transform('count')

#BINARY COLUMN: IF POSTED ON A PRODUCT WITH >3 REVIEWS IN THAT DATE
df['SAME_DATE_MULTIPLE_REVIEWS'] = (df['REVIEW_COUNT_DATE'] > 2).astype(int)

  df['DATE'] = pd.to_datetime(df['DATE'], infer_datetime_format=True)


In [27]:
df.head()

Unnamed: 0,USER_ID,PRODUCT_ID,RATING,DATE,LABEL,REVIEW_TEXT,AVERAGE_RATING,RATING_DEVIATION,TOTAL_PRODUCT_REVIEWS,REVIEW_LENGTH,RATING_CATEGORY,SINGLE_RATING_CATEGORY,REVIEW_COUNT_DATE,SAME_DATE_MULTIPLE_REVIEWS
0,923,0,3,2014-12-08,-1,The food at snack is a selection of popular Gr...,4.009524,1.009524,210,215,1,0,1,0
1,924,0,3,2013-05-16,-1,This little place in Soho is wonderful. I had ...,4.009524,1.009524,210,269,1,1,1,0
2,925,0,4,2013-07-01,-1,ordered lunch for 15 from Snack last Friday. Â...,4.009524,0.009524,210,180,1,1,1,0
3,926,0,4,2011-07-28,-1,This is a beautiful quaint little restaurant o...,4.009524,0.009524,210,493,1,1,1,0
4,927,0,4,2010-11-01,-1,Snack is great place for a Â casual sit down l...,4.009524,0.009524,210,601,1,0,2,0


In [29]:
df['SAME_DATE_MULTIPLE_REVIEWS'].value_counts()

SAME_DATE_MULTIPLE_REVIEWS
0    302567
1     56485
Name: count, dtype: int64

In [32]:
#BINARY: ONLY REVIEW OF PRODUCT
df['TOTAL_PRODUCT_REVIEWS']= df.groupby('PRODUCT_ID')['PRODUCT_ID'].transform('count')

In [33]:
df['TOTAL_PRODUCT_REVIEWS'].value_counts()

TOTAL_PRODUCT_REVIEWS
7378    7378
6632    6632
4716    4716
3938    3938
3143    3143
        ... 
25        25
22        22
15        15
13        13
11        11
Name: count, Length: 540, dtype: int64

In [35]:
#MAXIMUM NUMBER OF REVIEWS BY A USER IN ONE DAY
df['MAX_USER_REVIEWS_DAY'] = df.groupby(['USER_ID', df['DATE'].dt.date])['USER_ID'].transform('count')

In [36]:
df.head()

Unnamed: 0,USER_ID,PRODUCT_ID,RATING,DATE,LABEL,REVIEW_TEXT,AVERAGE_RATING,RATING_DEVIATION,TOTAL_PRODUCT_REVIEWS,REVIEW_LENGTH,RATING_CATEGORY,SINGLE_RATING_CATEGORY,REVIEW_COUNT_DATE,SAME_DATE_MULTIPLE_REVIEWS,MAX_USER_REVIEWS_DAY
0,923,0,3,2014-12-08,-1,The food at snack is a selection of popular Gr...,4.009524,1.009524,210,215,1,0,1,0,2
1,924,0,3,2013-05-16,-1,This little place in Soho is wonderful. I had ...,4.009524,1.009524,210,269,1,1,1,0,1
2,925,0,4,2013-07-01,-1,ordered lunch for 15 from Snack last Friday. Â...,4.009524,0.009524,210,180,1,1,1,0,2
3,926,0,4,2011-07-28,-1,This is a beautiful quaint little restaurant o...,4.009524,0.009524,210,493,1,1,1,0,1
4,927,0,4,2010-11-01,-1,Snack is great place for a Â casual sit down l...,4.009524,0.009524,210,601,1,0,2,0,1


In [37]:
#Timestamp difference between last and first review of reviewer
grouped = df.groupby('USER_ID')['DATE']
df['TIMESTAMP_DIFFERENCE'] = grouped.transform(lambda x: x.max() - x.min())

In [38]:
#Total number of purchased distinct products
df.to_csv("Datasets/Labelled Yelp Dataset.csv")

In [39]:
df['AVERAGE_USER_REVIEW_LENGTH'] = df.groupby('USER_ID')['REVIEW_LENGTH'].transform('mean')


In [40]:
df.head()

Unnamed: 0,USER_ID,PRODUCT_ID,RATING,DATE,LABEL,REVIEW_TEXT,AVERAGE_RATING,RATING_DEVIATION,TOTAL_PRODUCT_REVIEWS,REVIEW_LENGTH,RATING_CATEGORY,SINGLE_RATING_CATEGORY,REVIEW_COUNT_DATE,SAME_DATE_MULTIPLE_REVIEWS,MAX_USER_REVIEWS_DAY,TIMESTAMP_DIFFERENCE,AVERAGE_USER_REVIEW_LENGTH
0,923,0,3,2014-12-08,-1,The food at snack is a selection of popular Gr...,4.009524,1.009524,210,215,1,0,1,0,2,399 days,350.564103
1,924,0,3,2013-05-16,-1,This little place in Soho is wonderful. I had ...,4.009524,1.009524,210,269,1,1,1,0,1,0 days,269.0
2,925,0,4,2013-07-01,-1,ordered lunch for 15 from Snack last Friday. Â...,4.009524,0.009524,210,180,1,1,1,0,2,0 days,189.5
3,926,0,4,2011-07-28,-1,This is a beautiful quaint little restaurant o...,4.009524,0.009524,210,493,1,1,1,0,1,0 days,493.0
4,927,0,4,2010-11-01,-1,Snack is great place for a Â casual sit down l...,4.009524,0.009524,210,601,1,0,2,0,1,51 days,827.8


In [41]:
#TOTAL REVIEWS BY USER
df['TOTAL_USER_REVIEWS'] = df.groupby('USER_ID')['USER_ID'].transform('count')


In [42]:
#PERCENTAGE OF POSITIVE REVIEWS BY USER
df['PERCENTAGE_POSITIVE_REVIEWS'] = df.groupby('USER_ID')['RATING_CATEGORY'].transform(lambda x: (x == 1).mean() * 100)

In [43]:
df.columns

Index(['USER_ID', 'PRODUCT_ID', 'RATING', 'DATE', 'LABEL', 'REVIEW_TEXT',
       'AVERAGE_RATING', 'RATING_DEVIATION', 'TOTAL_PRODUCT_REVIEWS',
       'REVIEW_LENGTH', 'RATING_CATEGORY', 'SINGLE_RATING_CATEGORY',
       'REVIEW_COUNT_DATE', 'SAME_DATE_MULTIPLE_REVIEWS',
       'MAX_USER_REVIEWS_DAY', 'TIMESTAMP_DIFFERENCE',
       'AVERAGE_USER_REVIEW_LENGTH', 'TOTAL_USER_REVIEWS',
       'PERCENTAGE_POSITIVE_REVIEWS'],
      dtype='object')

In [48]:
#RATIO OF POSITIVE/NEGATIVE REVIEWS GIVEN TO PRODUCT
df['RATIO_POSITIVE_NEGATIVE'] = df.groupby('PRODUCT_ID')['RATING_CATEGORY'].transform(lambda x: (x == 1).sum() / (x == 0).sum())


  df['RATIO_POSITIVE_NEGATIVE'] = df.groupby('PRODUCT_ID')['RATING_CATEGORY'].transform(lambda x: (x == 1).sum() / (x == 0).sum())


In [56]:
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Assuming you have a feature matrix X and target vector y
model = RandomForestRegressor()
model.fit(X, y)

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure()
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), indices)
plt.xlabel('Feature Index')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()


NameError: name 'np' is not defined

In [49]:
df.to_csv("Datasets/Labelled Yelp Dataset.csv")

In [50]:
len(df)

359052

In [52]:
df.drop_duplicates()

Unnamed: 0,USER_ID,PRODUCT_ID,RATING,DATE,LABEL,REVIEW_TEXT,AVERAGE_RATING,RATING_DEVIATION,TOTAL_PRODUCT_REVIEWS,REVIEW_LENGTH,RATING_CATEGORY,SINGLE_RATING_CATEGORY,REVIEW_COUNT_DATE,SAME_DATE_MULTIPLE_REVIEWS,MAX_USER_REVIEWS_DAY,TIMESTAMP_DIFFERENCE,AVERAGE_USER_REVIEW_LENGTH,TOTAL_USER_REVIEWS,PERCENTAGE_POSITIVE_REVIEWS,RATIO_POSITIVE_NEGATIVE
0,923,0,3,2014-12-08,-1,The food at snack is a selection of popular Gr...,4.009524,1.009524,210,215,1,0,1,0,2,399 days,350.564103,39,97.435897,10.666667
1,924,0,3,2013-05-16,-1,This little place in Soho is wonderful. I had ...,4.009524,1.009524,210,269,1,1,1,0,1,0 days,269.000000,1,100.000000,10.666667
2,925,0,4,2013-07-01,-1,ordered lunch for 15 from Snack last Friday. Â...,4.009524,0.009524,210,180,1,1,1,0,2,0 days,189.500000,2,100.000000,10.666667
3,926,0,4,2011-07-28,-1,This is a beautiful quaint little restaurant o...,4.009524,0.009524,210,493,1,1,1,0,1,0 days,493.000000,1,100.000000,10.666667
4,927,0,4,2010-11-01,-1,Snack is great place for a Â casual sit down l...,4.009524,0.009524,210,601,1,0,2,0,1,51 days,827.800000,5,80.000000,10.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359047,161146,349,5,2014-02-06,1,"I'm very spoiled with Pizza. Really, I have tr...",4.185185,0.814815,108,1441,1,1,1,0,1,0 days,1441.000000,1,100.000000,5.750000
359048,116424,349,5,2014-01-31,1,Can't say enough good things about this place....,4.185185,0.814815,108,475,1,1,1,0,1,331 days,315.000000,2,100.000000,5.750000
359049,161147,349,5,2014-01-30,1,"Had a great dinner here- fantastic pizza, the ...",4.185185,0.814815,108,268,1,1,1,0,1,0 days,268.000000,1,100.000000,5.750000
359050,97930,349,5,2014-01-25,1,"Great foods and great drinks, they have even p...",4.185185,0.814815,108,150,1,1,2,0,1,136 days,254.000000,2,100.000000,5.750000


In [53]:
df.dropna()

Unnamed: 0,USER_ID,PRODUCT_ID,RATING,DATE,LABEL,REVIEW_TEXT,AVERAGE_RATING,RATING_DEVIATION,TOTAL_PRODUCT_REVIEWS,REVIEW_LENGTH,RATING_CATEGORY,SINGLE_RATING_CATEGORY,REVIEW_COUNT_DATE,SAME_DATE_MULTIPLE_REVIEWS,MAX_USER_REVIEWS_DAY,TIMESTAMP_DIFFERENCE,AVERAGE_USER_REVIEW_LENGTH,TOTAL_USER_REVIEWS,PERCENTAGE_POSITIVE_REVIEWS,RATIO_POSITIVE_NEGATIVE
0,923,0,3,2014-12-08,-1,The food at snack is a selection of popular Gr...,4.009524,1.009524,210,215,1,0,1,0,2,399 days,350.564103,39,97.435897,10.666667
1,924,0,3,2013-05-16,-1,This little place in Soho is wonderful. I had ...,4.009524,1.009524,210,269,1,1,1,0,1,0 days,269.000000,1,100.000000,10.666667
2,925,0,4,2013-07-01,-1,ordered lunch for 15 from Snack last Friday. Â...,4.009524,0.009524,210,180,1,1,1,0,2,0 days,189.500000,2,100.000000,10.666667
3,926,0,4,2011-07-28,-1,This is a beautiful quaint little restaurant o...,4.009524,0.009524,210,493,1,1,1,0,1,0 days,493.000000,1,100.000000,10.666667
4,927,0,4,2010-11-01,-1,Snack is great place for a Â casual sit down l...,4.009524,0.009524,210,601,1,0,2,0,1,51 days,827.800000,5,80.000000,10.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359047,161146,349,5,2014-02-06,1,"I'm very spoiled with Pizza. Really, I have tr...",4.185185,0.814815,108,1441,1,1,1,0,1,0 days,1441.000000,1,100.000000,5.750000
359048,116424,349,5,2014-01-31,1,Can't say enough good things about this place....,4.185185,0.814815,108,475,1,1,1,0,1,331 days,315.000000,2,100.000000,5.750000
359049,161147,349,5,2014-01-30,1,"Had a great dinner here- fantastic pizza, the ...",4.185185,0.814815,108,268,1,1,1,0,1,0 days,268.000000,1,100.000000,5.750000
359050,97930,349,5,2014-01-25,1,"Great foods and great drinks, they have even p...",4.185185,0.814815,108,150,1,1,2,0,1,136 days,254.000000,2,100.000000,5.750000


In [2]:
df_random = df.sample(n=20000, random_state=42)

NameError: name 'df' is not defined

In [55]:
df_random.to_csv("Datasets/Yelp Dataset Reduced.csv")

In [57]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [58]:
# DICTIONARY WITH NAME AND COMMAND TO INSTANTIATE DIFFERENT MODELS
classifiers = {}
classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',
                                                   objective='binary:logistic',
                                                   use_label_encoder=False
                                                   )})
classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
classifiers.update({"LinearSVC": LinearSVC(max_iter=10000)})
#classifiers.update({"MultinomialNB": MultinomialNB()})
classifiers.update({"LGBMClassifier": LGBMClassifier()})
classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
classifiers.update({"RidgeClassifier": RidgeClassifier()})
classifiers.update({"SGDClassifier": SGDClassifier()})
classifiers.update({"BaggingClassifier": BaggingClassifier()})
classifiers.update({"BernoulliNB": BernoulliNB()})
classifiers.update({"LogisticRegression": LogisticRegression()})
classifiers.update({"SVM": SVC()})



In [1]:
X = df_random[
    ['RATING',
       'AVERAGE_RATING', 'RATING_DEVIATION', 'TOTAL_PRODUCT_REVIEWS',
       'REVIEW_LENGTH', 'RATING_CATEGORY', 'SINGLE_RATING_CATEGORY',
       'REVIEW_COUNT_DATE', 'SAME_DATE_MULTIPLE_REVIEWS',
       'MAX_USER_REVIEWS_DAY',
       'AVERAGE_USER_REVIEW_LENGTH', 'TOTAL_USER_REVIEWS',
       'PERCENTAGE_POSITIVE_REVIEWS']
]
Y = df_random['LABEL']

NameError: name 'df_random' is not defined

In [69]:
le = LabelEncoder()
Y = le.fit_transform(Y)

In [70]:
# PERFORM THE TRAIN TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

# FEATURE SCALINGf
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [None]:
df_models = pd.DataFrame(
    columns=['model', 'run_time', 'accuracy', 'precision', 'f1_score'])

for key in classifiers:
    # STARTING TIME
    start_time = time.time()
    # CURRENT CLASSIFIER
    clf = classifiers[key]
    #TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_scaled, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_scaled)
    # CALCULATE ACCURACY
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)
    
    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60, 2)),
           'accuracy': accuracy,
           'precision': precision,
           'f1_score': f1score
           }

    df_models = df_models._append(row, ignore_index=True)

df_models = df_models.sort_values(by='accuracy', ascending=False)
