In [1]:
import pandas as pd
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk import pos_tag,ne_chunk
from nltk.stem import  PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

In [2]:
df=pd.read_csv("Womens Clothing E-Commerce Reviews.csv")


df=df[["Review Text","Rating"]]
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df["Sentiment"]=df["Rating"].apply(lambda x : "Positive" if x>=4 else ("Neutral" if x==3 else "Negative") )
df.drop(columns="Rating",inplace=True)


lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def pre_procesing(text):
    text = text.lower()
    tokens = word_tokenize(text)
    #tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df["Review Text"]=df["Review Text"].apply(pre_procesing)

y = df["Sentiment"]
vector = TfidfVectorizer()
X= vector.fit_transform(df["Review Text"])

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

x_train, x_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, random_state=42, test_size=0.2, shuffle=True, stratify=y_resampled
)

In [3]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# Label encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Padding sequences
X_dense = X.toarray()  # Assuming X is already vectorized
max_length = 100
X_padded = pad_sequences(X_dense, maxlen=max_length, padding='post')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Model definition
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_length))  # Adjust input_dim based on vocabulary size
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(len(np.unique(y_encoded)), activation='softmax'))  # Output layer for the number of classes

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model training with early stopping
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Model evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')



Epoch 1/10




[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 296ms/step - accuracy: 0.7649 - loss: 0.7216 - val_accuracy: 0.7566 - val_loss: 0.7247
Epoch 2/10
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 296ms/step - accuracy: 0.7739 - loss: 0.7034 - val_accuracy: 0.7566 - val_loss: 0.7263
Epoch 3/10
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 294ms/step - accuracy: 0.7749 - loss: 0.6976 - val_accuracy: 0.7566 - val_loss: 0.7252
Epoch 4/10
[1m453/453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 295ms/step - accuracy: 0.7753 - loss: 0.6938 - val_accuracy: 0.7566 - val_loss: 0.7262
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 91ms/step - accuracy: 0.7677 - loss: 0.7025
Test Accuracy: 0.7693
