In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv("Datasets/amazon_reviews_labelled.csv")

In [5]:
text_values = df['PREPROCESSED_REVIEW_TEXT']

In [6]:
def load_glove_embeddings(file_path):
    embeddings_index = {} 
    word_to_index = {}
    index = 0

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
            word_to_index[word] = index
            index += 1

    return embeddings_index, word_to_index

glove_file_path = 'glove.6B/glove.6B.100d.txt'
glove_embeddings, word_to_index = load_glove_embeddings(glove_file_path)

In [7]:
#SIMPLE: CONVERT TEXT TO NUMERICAL REPRESENTATIONS
vectorised_text = []
for text in text_values:
    tokens = text.split()
    vectors = [glove_embeddings.get(word, np.random.rand(100)) for word in tokens]
    text_vector = np.mean(vectors, axis=0)
    vectorised_text.append(text_vector)
vectorised_text = np.array(vectorised_text)

In [11]:
#TRAIN-TEST SPLIT
X = vectorised_text
Y = df['LABEL_ENCODED']

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.2, random_state = 42
)

In [12]:
#TEST WITH LOGISTIC REGRESSION 
clf = LogisticRegression()
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)

In [16]:
#PRINT REPORT
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.59      0.60      0.60      2115
           1       0.59      0.59      0.59      2085

    accuracy                           0.59      4200
   macro avg       0.59      0.59      0.59      4200
weighted avg       0.59      0.59      0.59      4200

