# TF-IDF

In [None]:
import time

# Record the start time
start_time = time.time()

In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from scipy import stats
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings("ignore")
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Data Preprocessing

In [None]:
file_name = 'labeled_data_processed.xlsx'
labels = ['food_quality', 'environment', 'service', 'convenience', 'cost_effectiveness']

df = pd.read_excel(file_name)
# keep only the rows that have cleaned text ...
df = df[df['cleaned_text'].notna()]
# ... and a value for every one of the aspect labels
df = df[df[labels].notna().all(axis=1)]
df.head()

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars_x,useful_x,funny_x,cool_x,text,date_x,...,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,cleaned_text
0,0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",43288.92292,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,decide eat aware going take 2 hours beginning ...
1,2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,41675.85417,...,3.0,1.0,1.0,63.0,96.0,86.0,86.0,49.0,27.0,family diner buffet eclectic assortment large ...
2,3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",42008.00069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,wow yummy different delicious favorite lamb cu...
3,4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,42749.87083,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,cute interior owner gave us tour upcoming pati...
4,5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,42270.96528,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,long term frequent customer establishment went...


In [None]:
def preprocess_text(text):
    """Tokenize ``text`` with NLTK and Porter-stem every token.

    Used as the ``preprocessor`` callable of ``TfidfVectorizer``.

    Args:
        text: a single document as a string.

    Returns:
        The stemmed tokens joined back into one space-separated string.
    """
    stem = PorterStemmer().stem
    return ' '.join(map(stem, word_tokenize(text)))

In [None]:
def tfidf_transform(df, column, dataset='train', voca=None):
    """Convert a text column into a TF-IDF matrix.

    Args:
        df: DataFrame holding the documents.
        column: name of the text column to vectorize.
        dataset: 'train' to learn the vocabulary from ``df``; 'test' to
            restrict the features to the vocabulary given in ``voca``.
        voca: vocabulary (iterable of terms) used only when
            ``dataset == 'test'``. When omitted, the historical placeholder
            vocabulary ``['default']`` is used so existing calls keep working.

    Returns:
        ``(tfidf, vocab)``: the sparse TF-IDF matrix and the feature names.

    Raises:
        ValueError: if ``dataset`` is neither 'train' nor 'test' (previously
            the function silently returned ``None``).

    Note:
        In 'test' mode the vectorizer is still fitted on ``df``
        (``fit_transform``), so the IDF weights come from the data passed in,
        not from the training corpus. Callers that need train-consistent
        weights should keep the fitted vectorizer and call ``transform``.
    """
    if dataset not in ('train', 'test'):
        raise ValueError(
            f"dataset must be 'train' or 'test', got {dataset!r}")

    corpus = df[column].tolist()

    # sublinear_tf replaces tf with 1 + log(tf); idf smoothing is the default.
    vectorizer_kwargs = {'preprocessor': preprocess_text, 'sublinear_tf': True}
    if dataset == 'test':
        # `is None` (not truthiness) so numpy arrays are accepted as vocabulary.
        vectorizer_kwargs['vocabulary'] = ['default'] if voca is None else voca

    vector = TfidfVectorizer(**vectorizer_kwargs)
    tfidf = vector.fit_transform(corpus)
    vocab = vector.get_feature_names_out()
    return tfidf, vocab

Split the dataset into a training set (80%) and a test set (20%)

In [None]:
# split the dataset without oversampling
def CV(df, y_label, ratio=0.8):
    """Hold-out split of ``df`` into TF-IDF train/test matrices and labels.

    Args:
        df: DataFrame with a 'cleaned_text' column and the label column.
        y_label: name of the label column to return as target.
        ratio: fraction of rows sampled (fixed seed 200) for the training set.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` where the x's are sparse TF-IDF
        matrices sharing the same feature space and the y's are the matching
        label Series.
    """
    train_indices = df.sample(frac=ratio, random_state=200).index
    train_df = df.loc[train_indices]
    test_df = df.drop(train_indices)

    # Fit the vocabulary AND the IDF weights on the training split only, then
    # reuse them unchanged for the test split. Re-fitting on the test split
    # would weight the same term differently in the two matrices.
    vectorizer = TfidfVectorizer(preprocessor=preprocess_text, sublinear_tf=True)
    x_train = vectorizer.fit_transform(train_df['cleaned_text'].tolist())
    x_test = vectorizer.transform(test_df['cleaned_text'].tolist())

    y_train = train_df[y_label]
    y_test = test_df[y_label]

    return x_train, x_test, y_train, y_test

In [None]:
x_train, x_test,y_train,y_test=CV(df,'food_quality')

In [None]:
# use smote to balance the dataset
def CV_SMOTE(df, y_label, ratio=0.8):
    """Hold-out split with SMOTE oversampling applied to the training set.

    Args:
        df: DataFrame with a 'cleaned_text' column and the label column.
        y_label: name of the label column to return as target.
        ratio: fraction of rows sampled (fixed seed 200) for the training set.

    Returns:
        ``(x_train, x_test, y_train, y_test)``. ``x_train``/``y_train`` are
        SMOTE-balanced; ``x_test``/``y_test`` are the untouched held-out rows.
    """
    train_indices = df.sample(frac=ratio, random_state=200).index
    train_df = df.loc[train_indices]
    test_df = df.drop(train_indices)

    # Vocabulary and IDF weights are learned from the training split only and
    # reused as-is for the test split.
    vectorizer = TfidfVectorizer(preprocessor=preprocess_text, sublinear_tf=True)
    x_train_pre = vectorizer.fit_transform(train_df['cleaned_text'].tolist())
    x_test = vectorizer.transform(test_df['cleaned_text'].tolist())

    y_train_pre = train_df[y_label]
    # The test set is deliberately NOT resampled: scoring on synthetic samples
    # would not measure performance on real reviews.
    y_test = test_df[y_label]

    try:
        x_train, y_train = SMOTE(random_state=42).fit_resample(x_train_pre, y_train_pre)
    except ValueError:
        # The default neighbourhood size can be too large for a very small
        # minority class; retry with the smallest possible neighbourhood.
        x_train, y_train = SMOTE(random_state=42, k_neighbors=1).fit_resample(
            x_train_pre, y_train_pre)

    return x_train, x_test, y_train, y_test

In [None]:
x_train, x_test,y_train,y_test=CV_SMOTE(df,'food_quality')

Classification

In [None]:
def train_models(x_train, y_train, label, model='decision_tree'):
    """Fit one classifier, selected by name, on the given training data.

    Args:
        x_train: training feature matrix.
        y_train: training targets.
        label: name of the target being modelled; returned unchanged so the
            caller can keep track of which aspect the model belongs to.
        model: one of 'decision_tree', 'naive_bayes', 'linear_SGD_classifier',
            'logistic_regression', 'random_forest', 'svc'.

    Returns:
        ``(clf, label)``: the fitted estimator and the label passed in.

    Raises:
        ValueError: if ``model`` is not a supported name (previously the
            function fell through and returned ``None``).
    """
    # Factories (not instances) so only the requested estimator is built.
    factories = {
        'decision_tree': DecisionTreeClassifier,
        'naive_bayes': BernoulliNB,
        'linear_SGD_classifier': lambda: SGDClassifier(
            loss='squared_error', max_iter=5000, eta0=0.001),
        'logistic_regression': lambda: LogisticRegression(
            multi_class='multinomial', solver='lbfgs'),
        'random_forest': RandomForestClassifier,
        'svc': SVC,
    }
    if model not in factories:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(factories)}")

    clf = factories[model]()
    clf.fit(x_train, y_train)
    return clf, label

In [None]:
def test_model(x_test, y_test, trained_model):
    """Score a fitted model on the test data.

    Args:
        x_test: test feature matrix.
        y_test: true test labels.
        trained_model: fitted estimator exposing ``predict``.

    Returns:
        ``(f1, accuracy)``: micro-averaged F1 and accuracy of the predictions.
    """
    y_hat = trained_model.predict(x_test)
    return (
        f1_score(y_test, y_hat, average='micro'),
        accuracy_score(y_test, y_hat),
    )

In [None]:
labels = ['food_quality', 'environment', 'service', 'convenience', 'cost_effectiveness']
models = ['decision_tree', 'naive_bayes', 'linear_SGD_classifier',
          'logistic_regression', 'random_forest', 'svc']

# One list of scores per aspect label, ordered like `models`.
results_f1 = {}
results_accuracy = {}
for label in labels:
    results_f1[label] = []
    results_accuracy[label] = []
    # The split/oversampling depends only on the label, so do it once per label.
    x_train, x_test, y_train, y_test = CV_SMOTE(df, label)
    for model_name in models:
        trained_model = train_models(x_train, y_train, label, model_name)[0]
        test_result = test_model(x_test, y_test, trained_model)
        results_f1[label].append(test_result[0])
        results_accuracy[label].append(test_result[1])

In [None]:
f1_metric=pd.DataFrame(results_f1,index=models)
f1_metric

Unnamed: 0,food_quality,environment,service,convenience,cost_effectiveness
decision_tree,0.441558,0.590062,0.679128,0.321895,0.5918
naive_bayes,0.619048,0.501035,0.694704,0.333333,0.411765
linear_SGD_classifier,0.320346,0.283644,0.35514,0.163399,0.208556
logistic_regression,0.616883,0.534161,0.838006,0.334967,0.513369
random_forest,0.616883,0.575569,0.800623,0.333333,0.465241
svc,0.545455,0.52588,0.781931,0.333333,0.486631


# Word2Vec

In [None]:
import io
import re
import string
import tqdm

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow as tf
import re
import string
import nltk
import spacy
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC


In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
# Specify the path to your Excel file and the encoding
#path_to_file = "labeled_data_processed.xlsx"

df = pd.read_excel('labeled_data_processed.xlsx')

In [None]:
df = df.dropna(subset=['cleaned_text', 'food_quality', 'environment', 'service', 'convenience', 'cost_effectiveness'])

In [None]:
nltk.download('punkt')  # Download NLTK's punkt tokenizer
nlp = spacy.load("en_core_web_sm")  # Load spaCy's English model
stemmer = PorterStemmer()  # Initialize the Porter Stemmer

def stem_and_lemmatize(text):
    """Porter-stem ``text`` with NLTK, then lemmatize the result with spaCy.

    Relies on the module-level ``stemmer`` and ``nlp`` objects.

    Args:
        text: a single document as a string.

    Returns:
        The lemmas of the stemmed text, joined by single spaces.
    """
    # Stage 1: NLTK tokenization + Porter stemming.
    stemmed_text = ' '.join(stemmer.stem(tok) for tok in nltk.word_tokenize(text))
    # Stage 2: spaCy re-tokenizes the stemmed string and yields one lemma per token.
    return ' '.join(tok.lemma_ for tok in nlp(stemmed_text))

df['cleaned_text'] = df['cleaned_text'].apply(stem_and_lemmatize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
selected_columns = ['cleaned_text', 'food_quality', 'environment', 'service', 'convenience', 'cost_effectiveness']
df = df[selected_columns]

In [None]:
X = df['cleaned_text']
y = df[['food_quality', 'environment', 'service', 'convenience', 'cost_effectiveness']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Access the column in the DataFrame that contains the text
# Only the training texts are used to build the vocabulary
text_column = X_train

# Build a tf.data.Dataset with one element per training text
text_ds = tf.data.Dataset.from_tensor_slices(text_column)

# Drop empty strings (length 0 casts to False)
text_ds = text_ds.filter(lambda x: tf.cast(tf.strings.length(x), bool))

# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
sequence_length = 800

# TextVectorization layer with its default standardization
# (the custom standardization hook below is disabled)
vectorize_layer = layers.TextVectorization(
    #standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Adapt the layer to the text data (learns the vocabulary)
vectorize_layer.adapt(text_ds.batch(1024))


In [None]:
# Save the created vocabulary for reference
inverse_vocab = vectorize_layer.get_vocabulary()

In [None]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [None]:
sequences = list(text_vector_ds.as_numpy_iterator())

In [None]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Build skip-gram training examples with negative sampling.

  Args:
    sequences: iterable of int-encoded sentences.
    window_size: skip-gram window size passed to ``skipgrams``.
    num_ns: number of negative samples drawn per positive (target, context) pair.
    vocab_size: vocabulary size; used for the sampling table, for ``skipgrams``
      and as ``range_max`` of the negative sampler.
    seed: seed forwarded to ``tf.random.log_uniform_candidate_sampler``.

  Returns:
    ``(targets, contexts, labels)``: three parallel lists with one entry per
    positive pair. ``targets`` holds the target word ids, ``contexts`` holds
    int64 tensors of length ``num_ns + 1`` (true context first, then the
    negatives) and ``labels`` holds the matching ``[1, 0, ..., 0]`` tensors.
  """
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    # negative_samples=0: negatives are drawn separately below.
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      # Shape (1, 1): the sampler expects true classes as a 2-D int64 tensor.
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word):
      # the true context id comes first, followed by the num_ns negatives.
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to the result lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [None]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|██████████| 867/867 [00:08<00:00, 104.00it/s]




targets.shape: (24530,)
contexts.shape: (24530, 5)
labels.shape: (24530, 5)


In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [None]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Set the number of negative samples per positive context.
num_ns = 4

class Word2Vec(tf.keras.Model):
  """Skip-gram model with separate target and context embedding tables.

  The target table is named "w2v_embedding" so its weights can be looked up
  by name after training. The context table's ``input_length`` reads the
  module-level ``num_ns``.
  """

  def __init__(self, vocab_size, embedding_dim):
    super().__init__()
    self.target_embedding = layers.Embedding(
        vocab_size, embedding_dim, input_length=1, name="w2v_embedding")
    self.context_embedding = layers.Embedding(
        vocab_size, embedding_dim, input_length=num_ns+1)

  def call(self, pair):
    """Return the dot product of the target with each context candidate.

    ``pair`` is ``(target, context)``; the result has shape (batch, context).
    """
    target, context = pair
    # Older TF versions add a dummy axis to the target: (batch, 1) -> (batch,)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    target_vecs = self.target_embedding(target)      # (batch, embed)
    context_vecs = self.context_embedding(context)   # (batch, context, embed)
    return tf.einsum('be,bce->bc', target_vecs, context_vecs)

In [None]:
def custom_loss(x_logit, y_true):
      """Element-wise sigmoid cross-entropy between logits and labels.

      The parameters are ordered (logits, labels).
      NOTE(review): Keras calls loss functions as fn(y_true, y_pred), so
      passing this function directly as ``loss=`` would swap the two
      arguments - confirm before using it that way.
      """
      per_element_loss = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=y_true, logits=x_logit)
      return per_element_loss

In [None]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [None]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7e5ffc63fca0>

In [None]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
w2v_dict = {key: value for key, value in zip(vocab, weights)}

In [None]:
def get_vector(data, dictionary):
  """Embed each sentence as the mean of its word vectors.

  Args:
    data: iterable of whitespace-separated sentences (strings).
    dictionary: mapping from word to its embedding vector.

  Returns:
    A list with one vector per sentence, all of the same length. Words that
    are not in ``dictionary`` contribute a zero vector to the mean; a sentence
    with no words at all is embedded as the zero vector.
  """
  # Zero vector matching the shape/dtype of the stored embeddings. A scalar 0
  # would make the per-sentence list ragged, which np.mean cannot average.
  sample = next(iter(dictionary.values()), None)
  oov = np.zeros(0) if sample is None else np.zeros_like(np.asarray(sample))

  embedding = []
  for sentence in data:
    vectors = [dictionary.get(word, oov) for word in sentence.split()]
    # np.mean of an empty list would be NaN, so fall back to the zero vector.
    embedding.append(np.mean(vectors, axis=0) if vectors else oov.copy())
  return embedding

In [None]:
vector_X_train = get_vector(X_train, w2v_dict)
vector_X_test = get_vector(X_test, w2v_dict)

# Machine learning

**Food**

In [None]:
ytrain1 = y_train.iloc[:,0]

In [None]:
from imblearn.over_sampling import SMOTE
# Seed SMOTE so the synthetic samples (and every score derived from them)
# are reproducible across runs, as in the TF-IDF pipeline.
oversample = SMOTE(random_state=SEED)
over_X_train, over_ytrain1 = oversample.fit_resample(vector_X_train, ytrain1)

In [None]:
from sklearn.metrics import f1_score
logistic_regression_model = LogisticRegression()

# Fit the model on the training set
logistic_regression_model.fit(over_X_train, over_ytrain1)

# Predict on the testing set
y_pred = logistic_regression_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:,0], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:,0], y_pred)
classification_rep = classification_report(y_test.iloc[:,0], y_pred,output_dict=True)
macro_f1_loistic_food = classification_rep['macro avg']['f1-score']
micro_f1_loistic_food = f1_score(y_test.iloc[:,0], y_pred, average = 'micro')

In [None]:
# Create an instance of the SGDClassifier
linear_sgd_model = SGDClassifier()

# Fit the model on the training set
linear_sgd_model.fit(over_X_train, over_ytrain1)

# Predict on the testing set
y_pred = linear_sgd_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 0], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 0], y_pred)
classification_rep = classification_report(y_test.iloc[:, 0], y_pred,output_dict=True)
macro_f1_sgd_food = classification_rep['macro avg']['f1-score']
micro_f1_sgd_food = f1_score(y_test.iloc[:,0], y_pred, average = 'micro')

In [None]:
# Create an instance of the RandomForestClassifier
random_forest_model = RandomForestClassifier()

# Fit the model on the training set
random_forest_model.fit(over_X_train, over_ytrain1)

# Predict on the testing set
y_pred = random_forest_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 0], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 0], y_pred)
classification_rep = classification_report(y_test.iloc[:, 0], y_pred,output_dict=True)
macro_f1_rf_food = classification_rep['macro avg']['f1-score']
micro_f1_rf_food = f1_score(y_test.iloc[:,0], y_pred, average = 'micro')

In [None]:
# Create an instance of the DecisionTreeClassifier
DT_model = tree.DecisionTreeClassifier()

# Fit the model on the SMOTE-resampled training set
DT_model.fit(over_X_train, over_ytrain1)

# Predict on the testing set
y_pred = DT_model.predict(vector_X_test)

# Evaluate the model against the food_quality labels (column 0)
accuracy = accuracy_score(y_test.iloc[:, 0], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 0], y_pred)
classification_rep = classification_report(y_test.iloc[:, 0], y_pred,output_dict=True)
macro_f1_dt_food = classification_rep['macro avg']['f1-score']
micro_f1_dt_food = f1_score(y_test.iloc[:,0], y_pred, average = 'micro')

In [None]:
# Create an instance of the Bernoulli naive Bayes classifier
# NOTE(review): BernoulliNB thresholds its inputs (default binarize=0.0), so
# only the sign of each embedding dimension would be used - confirm intended.
NB_model = BernoulliNB()

# Fit the model on the SMOTE-resampled training set
NB_model.fit(over_X_train, over_ytrain1)

# Predict on the testing set
y_pred = NB_model.predict(vector_X_test)

# Evaluate the model against the food_quality labels (column 0)
accuracy = accuracy_score(y_test.iloc[:, 0], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 0], y_pred)
classification_rep = classification_report(y_test.iloc[:, 0], y_pred,output_dict=True)
macro_f1_nb_food = classification_rep['macro avg']['f1-score']
micro_f1_nb_food = f1_score(y_test.iloc[:,0], y_pred, average = 'micro')

In [None]:
clf = SVC()
clf.fit(over_X_train, over_ytrain1)
y_pred = clf.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 0], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 0], y_pred)
classification_rep = classification_report(y_test.iloc[:, 0], y_pred,output_dict=True)
macro_f1_svc_food = classification_rep['macro avg']['f1-score']
micro_f1_svc_food = f1_score(y_test.iloc[:,0], y_pred, average = 'micro')

In [None]:
macro_f1_food=[macro_f1_nb_food,macro_f1_dt_food,macro_f1_rf_food,macro_f1_sgd_food,macro_f1_loistic_food,macro_f1_svc_food]

**Environment**

In [None]:
ytrain2 = y_train.iloc[:,1]
# Seed SMOTE so the synthetic samples are reproducible across runs.
oversample = SMOTE(random_state=SEED)
over_X_train, over_ytrain2 = oversample.fit_resample(vector_X_train, ytrain2)

logistic_regression_model = LogisticRegression()

# Fit the model on the SMOTE-resampled training set
logistic_regression_model.fit(over_X_train, over_ytrain2)

# Predict on the testing set
y_pred = logistic_regression_model.predict(vector_X_test)

# Evaluate the model against the environment labels (column 1)
accuracy = accuracy_score(y_test.iloc[:,1], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:,1], y_pred)
classification_rep = classification_report(y_test.iloc[:, 1], y_pred,output_dict=True)
macro_f1_logistic_en = classification_rep['macro avg']['f1-score']
# Score against the environment column and store under an environment name;
# the previous line compared these predictions with the food_quality labels
# and overwrote the food micro-F1.
micro_f1_logistic_en = f1_score(y_test.iloc[:,1], y_pred, average = 'micro')

In [None]:
# Create an instance of the SGDClassifier
linear_sgd_model = SGDClassifier()

# Fit the model on the training set
linear_sgd_model.fit(over_X_train, over_ytrain2)

# Predict on the testing set
y_pred = linear_sgd_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 1], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 1], y_pred)
classification_rep = classification_report(y_test.iloc[:, 1], y_pred,output_dict=True)
macro_f1_sgd_en = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the RandomForestClassifier
random_forest_model = RandomForestClassifier()

# Fit the model on the training set
random_forest_model.fit(over_X_train, over_ytrain2)

# Predict on the testing set
y_pred = random_forest_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 1], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 1], y_pred)
classification_rep = classification_report(y_test.iloc[:, 1], y_pred,output_dict=True)
macro_f1_rf_en = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the DecisionTreeClassifier
DT_model = tree.DecisionTreeClassifier()

# Fit the model on the SMOTE-resampled training set
DT_model.fit(over_X_train, over_ytrain2)

# Predict on the testing set
y_pred = DT_model.predict(vector_X_test)

# Evaluate the model against the environment labels (column 1)
accuracy = accuracy_score(y_test.iloc[:, 1], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 1], y_pred)
classification_rep = classification_report(y_test.iloc[:, 1], y_pred,output_dict=True)
macro_f1_dt_en = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the Bernoulli naive Bayes classifier
# NOTE(review): BernoulliNB thresholds its inputs (default binarize=0.0), so
# only the sign of each embedding dimension would be used - confirm intended.
NB_model = BernoulliNB()

# Fit the model on the SMOTE-resampled training set
NB_model.fit(over_X_train, over_ytrain2)

# Predict on the testing set
y_pred = NB_model.predict(vector_X_test)

# Evaluate the model against the environment labels (column 1)
accuracy = accuracy_score(y_test.iloc[:, 1], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 1], y_pred)
classification_rep = classification_report(y_test.iloc[:, 1], y_pred,output_dict=True)
macro_f1_nb_en = classification_rep['macro avg']['f1-score']

In [None]:
clf = SVC()
clf.fit(over_X_train, over_ytrain2)
y_pred = clf.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 1], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 1], y_pred)
classification_rep = classification_report(y_test.iloc[:, 1], y_pred,output_dict=True)
macro_f1_svc_en = classification_rep['macro avg']['f1-score']

In [None]:
macro_f1_en=[macro_f1_nb_en,macro_f1_dt_en,macro_f1_rf_en,macro_f1_sgd_en,macro_f1_logistic_en,macro_f1_svc_en]

**Service**

In [None]:
ytrain3 = y_train.iloc[:,2]
# Seed SMOTE so the synthetic samples are reproducible across runs.
oversample = SMOTE(random_state=SEED)
over_X_train, over_ytrain3 = oversample.fit_resample(vector_X_train, ytrain3)

# Create an instance of the SGDClassifier
linear_sgd_model = SGDClassifier()

# Fit the model on the SMOTE-resampled training set (once; the cell used to
# build and fit an identical model twice and discard the first one)
linear_sgd_model.fit(over_X_train, over_ytrain3)

# Predict on the testing set
y_pred = linear_sgd_model.predict(vector_X_test)

# Evaluate the model against the service labels (column 2)
accuracy = accuracy_score(y_test.iloc[:, 2], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 2], y_pred)
classification_rep = classification_report(y_test.iloc[:, 2], y_pred,output_dict=True)
macro_f1_sgd_ser = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the RandomForestClassifier
random_forest_model = RandomForestClassifier()

# Fit the model on the training set
random_forest_model.fit(over_X_train, over_ytrain3)

# Predict on the testing set
y_pred = random_forest_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 2], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 2], y_pred)
classification_rep = classification_report(y_test.iloc[:, 2], y_pred,output_dict=True)
macro_f1_rf_ser = classification_rep['macro avg']['f1-score']

In [None]:
logistic_regression_model = LogisticRegression()

# Fit the model on the training set
logistic_regression_model.fit(over_X_train, over_ytrain3)

# Predict on the testing set
y_pred = logistic_regression_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:,2], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:,2], y_pred)
classification_rep = classification_report(y_test.iloc[:, 2], y_pred,output_dict=True)
macro_f1_logistic_ser = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the Bernoulli naive Bayes classifier
# NOTE(review): BernoulliNB thresholds its inputs (default binarize=0.0), so
# only the sign of each embedding dimension would be used - confirm intended.
NB_model = BernoulliNB()

# Fit the model on the SMOTE-resampled training set
NB_model.fit(over_X_train, over_ytrain3)

# Predict on the testing set
y_pred = NB_model.predict(vector_X_test)

# Evaluate the model against the service labels (column 2)
accuracy = accuracy_score(y_test.iloc[:, 2], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 2], y_pred)
classification_rep = classification_report(y_test.iloc[:, 2], y_pred,output_dict=True)
macro_f1_nb_ser = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the DecisionTreeClassifier
DT_model = tree.DecisionTreeClassifier()

# Fit the model on the SMOTE-resampled training set
DT_model.fit(over_X_train, over_ytrain3)

# Predict on the testing set
y_pred = DT_model.predict(vector_X_test)

# Evaluate the model against the service labels (column 2)
accuracy = accuracy_score(y_test.iloc[:, 2], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 2], y_pred)
classification_rep = classification_report(y_test.iloc[:, 2], y_pred,output_dict=True)
macro_f1_dt_ser = classification_rep['macro avg']['f1-score']

In [None]:
clf = SVC()
clf.fit(over_X_train, over_ytrain3)
y_pred = clf.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 2], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 2], y_pred)
classification_rep = classification_report(y_test.iloc[:, 2], y_pred,output_dict=True)
macro_f1_svc_ser = classification_rep['macro avg']['f1-score']

In [None]:
macro_f1_ser=[macro_f1_nb_ser,macro_f1_dt_ser,macro_f1_rf_ser,macro_f1_sgd_ser,macro_f1_logistic_ser,macro_f1_svc_ser]

**Convenience**

In [None]:
ytrain4 = y_train.iloc[:,3]
# Seed SMOTE so the synthetic samples are reproducible across runs.
oversample = SMOTE(random_state=SEED)
over_X_train, over_ytrain4 = oversample.fit_resample(vector_X_train, ytrain4)

logistic_regression_model = LogisticRegression()

# Fit the model on the SMOTE-resampled training set
logistic_regression_model.fit(over_X_train, over_ytrain4)

# Predict on the testing set
y_pred = logistic_regression_model.predict(vector_X_test)

# Evaluate the model against the convenience labels (column 3)
accuracy = accuracy_score(y_test.iloc[:,3], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:,3], y_pred)
classification_rep = classification_report(y_test.iloc[:, 3], y_pred,output_dict=True)
macro_f1_logistic_con = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the SGDClassifier
linear_sgd_model = SGDClassifier()

# Fit the model on the training set
linear_sgd_model.fit(over_X_train, over_ytrain4)

# Predict on the testing set
y_pred = linear_sgd_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 3], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 3], y_pred)
classification_rep = classification_report(y_test.iloc[:, 3], y_pred,output_dict=True)
macro_f1_sgd_con = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the RandomForestClassifier
random_forest_model = RandomForestClassifier()

# Fit the model on the training set
random_forest_model.fit(over_X_train, over_ytrain4)

# Predict on the testing set
y_pred = random_forest_model.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 3], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 3], y_pred)
classification_rep = classification_report(y_test.iloc[:, 3], y_pred,output_dict=True)
macro_f1_rf_con = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the DecisionTreeClassifier
DT_model = tree.DecisionTreeClassifier()

# Fit the model on the SMOTE-resampled training set
DT_model.fit(over_X_train, over_ytrain4)

# Predict on the testing set
y_pred = DT_model.predict(vector_X_test)

# Evaluate the model against the convenience labels (column 3)
accuracy = accuracy_score(y_test.iloc[:, 3], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 3], y_pred)
classification_rep = classification_report(y_test.iloc[:, 3], y_pred,output_dict=True)
macro_f1_dt_con = classification_rep['macro avg']['f1-score']

In [None]:
# Create an instance of the Bernoulli naive Bayes classifier
# NOTE(review): BernoulliNB thresholds its inputs (default binarize=0.0), so
# only the sign of each embedding dimension would be used - confirm intended.
NB_model = BernoulliNB()

# Fit the model on the SMOTE-resampled training set
NB_model.fit(over_X_train, over_ytrain4)

# Predict on the testing set
y_pred = NB_model.predict(vector_X_test)

# Evaluate the model against the convenience labels (column 3)
accuracy = accuracy_score(y_test.iloc[:, 3], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 3], y_pred)
classification_rep = classification_report(y_test.iloc[:, 3], y_pred,output_dict=True)
macro_f1_nb_con = classification_rep['macro avg']['f1-score']

In [None]:
clf = SVC()
clf.fit(over_X_train, over_ytrain4)
y_pred = clf.predict(vector_X_test)

# Evaluate the model
accuracy = accuracy_score(y_test.iloc[:, 3], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 3], y_pred)
classification_rep = classification_report(y_test.iloc[:, 3], y_pred,output_dict=True)
macro_f1_svc_con = classification_rep['macro avg']['f1-score']

In [None]:
macro_f1_con=[macro_f1_nb_con,macro_f1_dt_con,macro_f1_rf_con,macro_f1_sgd_con,macro_f1_logistic_con,macro_f1_svc_con]

**Cost effectiveness**

In [None]:
ytrain5 = y_train.iloc[:,4]
# Seed SMOTE so the synthetic samples are reproducible across runs.
oversample = SMOTE(random_state=SEED)
over_X_train, over_ytrain5 = oversample.fit_resample(vector_X_train, ytrain5)

logistic_regression_model = LogisticRegression()

# Fit the model on the SMOTE-resampled training set
logistic_regression_model.fit(over_X_train, over_ytrain5)

# Predict on the testing set
y_pred = logistic_regression_model.predict(vector_X_test)

# Evaluate the model against the cost_effectiveness labels (column 4)
accuracy = accuracy_score(y_test.iloc[:,4], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:,4], y_pred)
classification_rep = classification_report(y_test.iloc[:, 4], y_pred,output_dict=True)
macro_f1_logistic_ce = classification_rep['macro avg']['f1-score']

In [None]:
# Linear model trained with stochastic gradient descent (scikit-learn defaults),
# fitted on the resampled over_X_train / over_ytrain5 pair.
linear_sgd_model = SGDClassifier()
linear_sgd_model.fit(X=over_X_train, y=over_ytrain5)

# Predict labels for the test feature matrix.
y_pred = linear_sgd_model.predict(vector_X_test)

# Score the predictions against column index 4 of y_test and keep the macro F1.
accuracy = accuracy_score(y_true=y_test.iloc[:, 4], y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test.iloc[:, 4], y_pred=y_pred)
classification_rep = classification_report(
    y_true=y_test.iloc[:, 4],
    y_pred=y_pred,
    output_dict=True,
)
macro_f1_sgd_ce = classification_rep['macro avg']['f1-score']

In [None]:
# Random forest with scikit-learn defaults, fitted on the resampled
# over_X_train / over_ytrain5 pair.
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X=over_X_train, y=over_ytrain5)

# Predict labels for the test feature matrix.
y_pred = random_forest_model.predict(vector_X_test)

# Score the predictions against column index 4 of y_test and keep the macro F1.
accuracy = accuracy_score(y_true=y_test.iloc[:, 4], y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test.iloc[:, 4], y_pred=y_pred)
classification_rep = classification_report(
    y_true=y_test.iloc[:, 4],
    y_pred=y_pred,
    output_dict=True,
)
macro_f1_rf_ce = classification_rep['macro avg']['f1-score']

In [None]:
# Decision tree classifier with scikit-learn defaults. The class is used
# through the name imported in the notebook's import cell
# (`from sklearn.tree import DecisionTreeClassifier`); that cell does not
# import a `tree` module, so `tree.DecisionTreeClassifier` is avoided here.
DT_model = DecisionTreeClassifier()

# Fit on the resampled over_X_train / over_ytrain5 pair.
DT_model.fit(over_X_train, over_ytrain5)

# Predict labels for the test feature matrix.
y_pred = DT_model.predict(vector_X_test)

# Score the predictions against column index 4 of y_test; only the
# macro-averaged F1 is kept under a model-specific name.
accuracy = accuracy_score(y_test.iloc[:, 4], y_pred)
conf_matrix = confusion_matrix(y_test.iloc[:, 4], y_pred)
classification_rep = classification_report(y_test.iloc[:, 4], y_pred, output_dict=True)
macro_f1_dt_ce = classification_rep['macro avg']['f1-score']

In [None]:
# Bernoulli Naive Bayes classifier, fitted on the resampled
# over_X_train / over_ytrain5 pair.
NB_model = BernoulliNB()
NB_model.fit(X=over_X_train, y=over_ytrain5)

# Predict labels for the test feature matrix.
y_pred = NB_model.predict(vector_X_test)

# Score the predictions against column index 4 of y_test and keep the macro F1.
accuracy = accuracy_score(y_true=y_test.iloc[:, 4], y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test.iloc[:, 4], y_pred=y_pred)
classification_rep = classification_report(
    y_true=y_test.iloc[:, 4],
    y_pred=y_pred,
    output_dict=True,
)
macro_f1_nb_ce = classification_rep['macro avg']['f1-score']

In [None]:
# Support vector classifier with scikit-learn defaults, fitted on the
# resampled over_X_train / over_ytrain5 pair.
clf = SVC()
clf.fit(X=over_X_train, y=over_ytrain5)
y_pred = clf.predict(vector_X_test)

# Score the predictions against column index 4 of y_test and keep the macro F1.
accuracy = accuracy_score(y_true=y_test.iloc[:, 4], y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test.iloc[:, 4], y_pred=y_pred)
classification_rep = classification_report(
    y_true=y_test.iloc[:, 4],
    y_pred=y_pred,
    output_dict=True,
)
macro_f1_svc_ce = classification_rep['macro avg']['f1-score']

In [None]:
# Macro-F1 scores for this label, ordered: Naive Bayes, decision tree,
# random forest, SGD, logistic regression, SVC.
macro_f1_ce = [
    macro_f1_nb_ce,
    macro_f1_dt_ce,
    macro_f1_rf_ce,
    macro_f1_sgd_ce,
    macro_f1_logistic_ce,
    macro_f1_svc_ce,
]

In [None]:
# Display names for the six classifiers. macro_f1_con and macro_f1_ce list their
# scores in this same order; the other three lists are built in earlier cells
# (presumably in the same order -- TODO confirm).
model = [
    "Naive Bayes",
    "Decision Tree",
    "Random Forest",
    "SGD",
    "Logistic",
    "SVM",
]

# One entry per label, plus the model names under 'Model'.
data = {
    'Model': model,
    'Food': macro_f1_food,
    'Environment': macro_f1_en,
    "Service": macro_f1_ser,
    "Convinience": macro_f1_con,
    "Cost Effectiveness": macro_f1_ce,
}

In [None]:
# Build the score table and transpose it so each row is one key of `data` and
# each column is one model. NOTE: this rebinds `df`, the name the first data
# cell used for the loaded reviews.
df = pd.DataFrame(data).T

# 'Model' is the first key of `data`, so after the transpose the first row holds
# the model names; drop it so only the score rows remain.
df = df.drop(index=df.index[0])

In [None]:
# Swap the rows and columns of f1_metric (defined outside this cell) and show it.
# NOTE: the rebinding is not idempotent -- running this cell a second time
# transposes the table back.
f1_metric = f1_metric.transpose()
f1_metric

Unnamed: 0,decision_tree,naive_bayes,linear_SGD_classifier,logistic_regression,random_forest,svc
food_quality,0.441558,0.619048,0.320346,0.616883,0.616883,0.545455
environment,0.590062,0.501035,0.283644,0.534161,0.575569,0.52588
service,0.679128,0.694704,0.35514,0.838006,0.800623,0.781931
convenience,0.321895,0.333333,0.163399,0.334967,0.333333,0.333333
cost_effectiveness,0.5918,0.411765,0.208556,0.513369,0.465241,0.486631


In [None]:
# Append one column per model from `df`: the k-th new column takes the values of
# the k-th column of `df`, converted to a plain list so they are assigned by
# position rather than by index label.
word2vec_columns = [
    'Naive Bayes_word2vec',
    'Decision Tree_word2vec',
    'Random Forest_word2vec',
    'SGD_word2vec',
    'Logistic_word2vec',
    'SVM_word2vec',
]
for position, column_name in enumerate(word2vec_columns):
    f1_metric[column_name] = list(df.iloc[:, position])

# BERT scores are hard-coded literals here, one value per row of f1_metric.
f1_metric['BERT'] = [0.751224, 0.553662, 0.696538, 0.324600, 0.534528]
f1_metric

Unnamed: 0,decision_tree,naive_bayes,linear_SGD_classifier,logistic_regression,random_forest,svc,Naive Bayes_word2vec,Decision Tree_word2vec,Random Forest_word2vec,SGD_word2vec,Logistic_word2vec,SVM_word2vec,BERT
food_quality,0.441558,0.619048,0.320346,0.616883,0.616883,0.545455,0.405818,0.394208,0.457199,0.482056,0.547482,0.504666,0.751224
environment,0.590062,0.501035,0.283644,0.534161,0.575569,0.52588,0.391085,0.336394,0.486113,0.405318,0.346575,0.477971,0.553662
service,0.679128,0.694704,0.35514,0.838006,0.800623,0.781931,0.457679,0.440166,0.561728,0.575872,0.560867,0.582082,0.696538
convenience,0.321895,0.333333,0.163399,0.334967,0.333333,0.333333,0.332294,0.327851,0.32304,0.301727,0.285366,0.339765,0.3246
cost_effectiveness,0.5918,0.411765,0.208556,0.513369,0.465241,0.486631,0.385016,0.31893,0.370216,0.374258,0.348135,0.452208,0.534528


In [None]:
from scipy import stats

# One-sided paired t-tests: for every other column of f1_metric, test whether
# BERT's scores (the last column) are greater, pairing the values row by row.
# NOTE(review): alpha is applied to each test separately; no correction for
# multiple comparisons is made.
alpha = 0.05

# BERT is the last column, so every column before it is a comparison model.
# The count is derived from the table instead of being hard-coded.
bert_column = f1_metric.iloc[:, -1]
num_comparisons = f1_metric.shape[1] - 1

for i in range(num_comparisons):
    model_i_column = f1_metric.iloc[:, i]
    # Perform the paired t-test with the alternative "BERT mean > model i mean"
    t_statistic, p_value = stats.ttest_rel(bert_column, model_i_column, alternative='greater')
    # Report whether the one-sided p-value falls below alpha
    if p_value < alpha:
        print(f"Model BERT and Model {i+1}: The mean of Model BERT is significantly greater than the mean of Model {i+1}.")
    else:
        print(f"Model BERT and Model {i+1}: There is no significant difference in means, or the mean of Model {i+1} is greater.")

Model BERT and Model 1: There is no significant difference in means, or the mean of Model 1 is greater.
Model BERT and Model 2: There is no significant difference in means, or the mean of Model 2 is greater.
Model BERT and Model 3: The mean of Model BERT is significantly greater than the mean of Model 3.
Model BERT and Model 4: There is no significant difference in means, or the mean of Model 4 is greater.
Model BERT and Model 5: There is no significant difference in means, or the mean of Model 5 is greater.
Model BERT and Model 6: There is no significant difference in means, or the mean of Model 6 is greater.
Model BERT and Model 7: The mean of Model BERT is significantly greater than the mean of Model 7.
Model BERT and Model 8: The mean of Model BERT is significantly greater than the mean of Model 8.
Model BERT and Model 9: The mean of Model BERT is significantly greater than the mean of Model 9.
Model BERT and Model 10: The mean of Model BERT is significantly greater than the mean o

# RNN

In [None]:
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Trains one bidirectional-LSTM classifier per label column of y_train and
# collects the Keras training histories in history_list.
# vector_X_train, vector_X_test, y_train and y_test are defined in earlier cells.

# Number of loops (adjust based on the number of columns in y_train)
num_loops = 5
timesteps = 128  # Set the value based on your data

# Original label values -> consecutive class ids for the sparse loss.
class_mapping_loop = {-1: 0, 0: 1, 1: 2}
num_classes = len(class_mapping_loop)

# The features are the same for every label, so reshape them once:
# (samples, timesteps, 1) is the input shape the LSTM below is declared with.
# assumes each row of vector_X_* holds exactly `timesteps` values -- TODO confirm
X_train_np_loop = np.array(vector_X_train).reshape(-1, timesteps, 1)
X_test_np_loop = np.array(vector_X_test).reshape(-1, timesteps, 1)

# Store history for each model
history_list = []

for i in range(num_loops):
    print(i)
    # Extract y_train and y_test for the current loop
    y_train_loop = y_train.iloc[:, i]
    y_test_loop = y_test.iloc[:, i]

    # Map label values to class ids using the dictionary
    y_train_map_loop = y_train_loop.map(class_mapping_loop)
    y_test_map_loop = y_test_loop.map(class_mapping_loop)

    # Convert to numpy arrays
    y_train_np_loop = np.array(y_train_map_loop)
    y_test_np_loop = np.array(y_test_map_loop)

    # Compute 'balanced' class weights from the training labels
    class_labels_loop = np.unique(y_train_map_loop)
    class_weights_loop = compute_class_weight(class_weight='balanced', classes=class_labels_loop, y=y_train_map_loop)
    class_weights_dict_loop = dict(zip(class_labels_loop, class_weights_loop))

    # Build and compile the model. The labels take three class ids (0, 1, 2),
    # so the output layer emits one logit per class and the loss is sparse
    # categorical cross-entropy; a single-logit binary cross-entropy head
    # cannot represent three classes.
    model_loop = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(timesteps, 1)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(num_classes)
    ])

    model_loop.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy']
    )

    # Train the model; the test split is passed as validation data
    history_loop = model_loop.fit(
        X_train_np_loop, y_train_np_loop,
        epochs=10,
        validation_data=(X_test_np_loop, y_test_np_loop),
        class_weight=class_weights_dict_loop
    )

    # Store history for later analysis
    history_list.append(history_loop)


0
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Stop the notebook-wide timer: start_time was taken with time.time() in the
# first code cell, so the difference is the wall-clock duration in seconds.
end_time = time.time()
elapsed_time = end_time - start_time

# Report the duration.
print(f"Elapsed Time: {elapsed_time} seconds")

Elapsed Time: 411.41179156303406 seconds
