In [17]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv("Datasets/amazon_reviews_3.csv")

In [5]:
texts = df['REVIEW_TEXT']
labels = df['LABEL_ENCODED']

In [6]:
# Tokenize the text data
tokenized_texts = [text.split() for text in texts]

In [8]:
# Train the Word2Vec model
model = gensim.models.Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

In [9]:
# Function to convert a document to its vector representation
def document_vector(model, doc):
    doc_vector = np.zeros(model.vector_size)
    num_words = 0
    for word in doc:
        if word in model.wv:
            doc_vector += model.wv[word]
            num_words += 1
    if num_words > 0:
        doc_vector /= num_words
    return doc_vector


In [10]:
# Convert each document to its vector representation
vectorized_texts = [document_vector(model, doc) for doc in tokenized_texts]

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(vectorized_texts, labels, test_size=0.2, random_state=42)

In [12]:
# Train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Make predictions on the testing set
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

In [15]:
print(f"Accuracy = {accuracy*100}%")

Accuracy = 61.21428571428571%


In [31]:
#COMPARE WITH USING TF-IDF VECTORISATION
X = df['PREPROCESSED_REVIEW_TEXT']
Y = df['LABEL_ENCODED']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)

In [33]:
# Instantiate the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

In [34]:
train_vectors = vectorizer.fit_transform(X_train)

In [35]:
test_vectors = vectorizer.transform(X_test)

In [36]:
classifier = LogisticRegression()
classifier.fit(train_vectors, y_train)

In [37]:
predictions = classifier.predict(test_vectors)

In [38]:
print(f"Accuracy = {accuracy_score(y_test, predictions)*100}%")

Accuracy = 63.857142857142854%


In [27]:
#TRAIN WORRD2VEC ON PREPROCESSED DATA
texts = df['PREPROCESSED_REVIEW_TEXT']
labels = df['LABEL_ENCODED']

In [29]:
# Tokenize the text data
tokenized_texts = [text.split() for text in texts]

# Train the Word2Vec model
model = gensim.models.Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

# Convert each document to its vector representation
vectorized_texts = [document_vector(model, doc) for doc in tokenized_texts]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(vectorized_texts, labels, test_size=0.2, random_state=42)

# Train a logistic regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

In [30]:
print(f"Accuracy = {accuracy*100}%")

Accuracy = 61.26190476190476%
