In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Load the pre-processed Kindle review CSV into a DataFrame.
# NOTE(review): the space before ".csv" is part of the path exactly as written,
# and the location is an absolute Colab path - the file must exist there,
# otherwise pandas raises FileNotFoundError.
DATA_PATH = '/content/preprocessed_kindle_review .csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output
df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/content/preprocessed_kindle_review .csv'

In [None]:
# Print a structural summary of the loaded frame (columns, non-null counts, dtypes)
df.info()


In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Remove rows that lack either a rating or review text; rebinding the name
# (rather than mutating in place) keeps the cell's effect explicit
df = df.dropna(subset=['rating', 'reviewText'])

In [None]:
# Re-count missing values per column to confirm the effect of the dropna step
df.isnull().sum()

In [None]:
# Keep only the review text and its star rating. The explicit .copy() makes the
# result an independent frame, so the column assignments performed on `df`
# further down do not raise pandas' SettingWithCopyWarning.
df = df[['reviewText', 'rating']].copy()

In [None]:
# Preview the frame after narrowing it to the reviewText and rating columns
df.head()

In [None]:
# Fetch the NLTK stop-word corpus that preprocess_text reads via stopwords.words('english')
nltk.download('stopwords')

# Function to preprocess text

# Per-language cache so the NLTK stop-word list is read once instead of once per review.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop words for ``language`` as a set, loading them on first use."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = set(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Clean a raw review and split it into lower-case, stop-word-free tokens.

    Steps, in order: strip URLs, strip ``[bracketed]`` spans, delete every
    character that is neither an ASCII letter nor whitespace, lower-case,
    split on whitespace, and drop English stop words.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as "no text".

    Returns:
        list[str]: The remaining tokens; an empty list for non-string input
        or when nothing survives the cleaning.
    """
    if not isinstance(text, str):
        return []

    # Remove URLs. Any "https..." run already starts with "http", so a separate
    # alternative for it is unnecessary; the pattern has no ^/$ anchors, so
    # re.MULTILINE had no effect either.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove bracketed fragments such as "[...]"
    text = re.sub(r'\[.*?\]', '', text)
    # Remove special characters and digits.
    # The fourth positional parameter of re.sub is `count`, not `flags`; the
    # previous call passed re.I|re.A (== 258) there, which silently capped the
    # number of removed characters at 258 per review and applied no flags.
    # No flags are needed: the class lists both letter cases explicitly, and
    # leaving \s in its default mode keeps non-ASCII whitespace as a separator.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()

    # Tokenize on whitespace and drop stop words
    stop_words = _get_stop_words('english')
    return [token for token in text.split() if token not in stop_words]

In [None]:
# Tokenise every review, then derive a binary label from the star rating:
# ratings of 3 and above become 1, anything lower becomes 0
df['cleaned_reviewText'] = df['reviewText'].map(preprocess_text)
df['sentiment'] = (df['rating'] >= 3).astype('int64')

In [None]:
# Discard reviews whose token list came out empty after cleaning
has_tokens = df['cleaned_reviewText'].map(len) > 0
df = df[has_tokens]

In [None]:
# Keep only the token lists and the binary label. The explicit .copy() yields an
# independent frame, so the later `df['feature_vector'] = ...` assignment does
# not raise pandas' SettingWithCopyWarning.
df = df[['cleaned_reviewText', 'sentiment']].copy()

In [None]:
# Preview the frame now that it holds only cleaned_reviewText and sentiment
df.head()

In [None]:
# Fit Word2Vec embeddings on the tokenised reviews.
# NOTE(review): gensim's documentation states that training with more than one
# worker thread is not bit-for-bit repeatable; switch to workers=1 if exact
# reproducibility of the vectors matters - confirm before relying on it.
w2v_model = Word2Vec(
    sentences=df['cleaned_reviewText'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Keep a handle on the trained keyed vectors for token -> vector lookups
word_vectors = w2v_model.wv

In [None]:
# Function to compute the average word vectors for a review
def compute_average_word_vectors(tokens, model, num_features):
    """Average the embedding vectors of the tokens that the model knows.

    Args:
        tokens: Iterable of token strings for one review.
        model: Keyed-vector object exposing a ``key_to_index`` mapping for
            membership checks and ``model[token]`` for vector lookup (for
            example the ``.wv`` attribute of a trained gensim Word2Vec model).
        num_features: Length of each embedding vector; it sizes the zero
            vector used as the accumulator and as the fallback result.

    Returns:
        numpy.ndarray: 1-D array of length ``num_features`` holding the mean of
        the known tokens' vectors, or all zeros when no token is in the
        vocabulary (including when ``tokens`` is empty).
    """
    # Membership is tested against the model's existing key -> index mapping.
    # Building a fresh set from index_to_key on every call copied the whole
    # vocabulary once per review.
    vocabulary = model.key_to_index

    feature_vector = np.zeros((num_features,), dtype="float32")
    num_words = 0

    for token in tokens:
        if token in vocabulary:
            feature_vector = np.add(feature_vector, model[token])
            num_words += 1

    # Leave the zero vector untouched when nothing matched, avoiding a 0/0 division
    if num_words:
        feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

# Compute average word vectors for each review.
# The dimensionality is read from the trained vectors instead of repeating the
# literal 100, so it cannot drift out of sync with the Word2Vec vector_size.
num_features = word_vectors.vector_size
df['feature_vector'] = df['cleaned_reviewText'].apply(
    lambda tokens: compute_average_word_vectors(tokens, word_vectors, num_features)
)

# Display the first few rows of the processed dataset with feature vectors
df.head()


In [None]:
# Stack the per-review vectors into one feature matrix and pull out the labels
X = np.array(list(df['feature_vector']))
y = df['sentiment'].values

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows and features ended up in each partition
print(f'Training set shape: {X_train.shape}')
print(f'Validation set shape: {X_val.shape}')


In [None]:
# Build a logistic-regression classifier with default settings and fit it on
# the training split (fit returns the estimator itself, so the call is chained)
model = LogisticRegression().fit(X_train, y_train)

# Label the held-out validation rows
y_pred = model.predict(X_val)


In [None]:
# Overall validation accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

# Per-class precision / recall / F1 on the validation split
report = classification_report(y_val, y_pred)
print('Classification Report:')
print(report)


In [None]:
# Extra check: put training and validation accuracy side by side
# (a large gap between the two would point to overfitting)

# Predictions on both partitions
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

# Accuracy on the data the model was fitted on, then on the held-out data
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Per-class breakdown for the validation split
report = classification_report(y_val, y_val_pred)
print('Classification Report:')
print(report)


In [None]:
# Function to predict sentiment of new reviews
def predict_sentiment(review, model, word_vectors, num_features):
    """Classify one raw review string as 'Positive' or 'Negative'.

    The review is cleaned with ``preprocess_text``, turned into a single
    averaged vector with ``compute_average_word_vectors``, and passed to
    ``model.predict``.

    Args:
        review: Raw review text.
        model: Fitted classifier exposing ``predict``.
        word_vectors: Keyed vectors handed to ``compute_average_word_vectors``.
        num_features: Embedding dimensionality handed to the same helper.

    Returns:
        str: 'Positive' when the predicted label equals 1, otherwise 'Negative'.
    """
    averaged = compute_average_word_vectors(
        preprocess_text(review), word_vectors, num_features
    )
    # predict is given a single-row batch, hence the reshape to (1, -1)
    label = model.predict(averaged.reshape(1, -1))[0]
    if label == 1:
        return 'Positive'
    return 'Negative'

# Example predictions
# Two hand-written reviews, one favourable and one unfavourable, used as a
# quick end-to-end check of preprocessing, vectorisation and classification
new_reviews = [
    "I absolutely loved this book! The story was fantastic and the characters were well developed.",
    "This book was terrible. I couldn't even finish it. The plot was boring and predictable."
]

# Classify each example with the fitted model and print the text next to its label
for review in new_reviews:
    sentiment = predict_sentiment(review, model, word_vectors, num_features)
    print(f'Review: {review}\nPredicted Sentiment: {sentiment}\n')
