In [2]:
import pandas as pd

# Load the review dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='balanced_reviews_with_enhanced_subjectivity.csv')


In [3]:
# Preview the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

Unnamed: 0,reviewText,subjectivity,enhanced_subjectivity
0,The material is also easy to clean and won't b...,1.0,1.0
1,There are a couple of ways to fasten it to the...,0.0,0.0
2,Very happy with purchase.,1.0,1.0
3,I should have purchase a larger one but it loo...,1.0,1.0
4,Very happy with the purchase.,1.0,1.0


In [4]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# English stop-word set, filled on first use so that merely defining the
# function does not touch the NLTK corpus.
_STOP_WORDS = None


def clean_text(text):
    """Normalize a review string into a list of non-stop-word tokens.

    The text is lowercased, stripped of every ``string.punctuation``
    character, tokenized with ``word_tokenize`` and filtered against NLTK's
    English stop-word list.

    Args:
        text: Raw review text. Any non-string value (for example the float
            NaN that pandas stores for a missing cell) is treated as empty
            instead of raising ``AttributeError``.

    Returns:
        list[str]: The surviving tokens in their original order; ``[]`` when
        ``text`` is not a string.
    """
    global _STOP_WORDS
    if not isinstance(text, str):
        return []
    if _STOP_WORDS is None:
        # Loading the corpus is comparatively slow; do it once, not per row.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    # Punctuation is removed *before* tokenizing, so apostrophes disappear and
    # contractions collapse into a single token ("won't" -> "wont").
    text = text.lower().translate(_PUNCT_TABLE)
    return [word for word in word_tokenize(text) if word not in _STOP_WORDS]

from nltk.stem import PorterStemmer
# Single shared stemmer instance reused for every call to stem_words.
ps = PorterStemmer()


def stem_words(words):
    """Return the Porter stem of each token in ``words``, keeping their order."""
    return list(map(ps.stem, words))

# Derive the token-list columns element by element: first the cleaned tokens
# from the raw review text, then their stems from the cleaned tokens.
df['cleaned_text'] = df['reviewText'].map(clean_text)
df['stemmed_text'] = df['cleaned_text'].map(stem_words)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np




# Parameters
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
seed = 42  # one fixed seed for the split, weight initialisation and batch shuffling

# Seeds Python's `random`, NumPy and TensorFlow together so a fresh
# Restart & Run All reproduces the same split and the same training run.
tf.keras.utils.set_random_seed(seed)

# Inputs and labels
# A missing review (NaN) would make the Keras tokenizer fail on `.lower()`,
# so missing cells are mapped to the empty string.
texts = df['reviewText'].fillna('').astype(str)
labels = np.array(df['enhanced_subjectivity'])

# Splitting the dataset
# Row positions are split first so the tokenizer vocabulary can be learned from
# the training rows only; stratifying keeps the class ratio equal in both parts.
train_idx, test_idx = train_test_split(
    np.arange(len(texts)), test_size=0.2, random_state=seed, stratify=labels
)

# Preprocessing
# NOTE(review): the model is fed the raw `reviewText`; the `cleaned_text` /
# `stemmed_text` columns computed earlier are not used here - confirm intended.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(texts.iloc[train_idx])  # vocabulary from training rows only
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

X_train, X_test = padded[train_idx], padded[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# Model Definition
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data. Nothing here selects
# a model from the validation curve, but tuning epochs or architecture against
# it would make the metrics below optimistic.
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Evaluate the model
y_prob = model.predict(X_test)            # sigmoid scores, one column
y_pred = y_prob.round().astype(int)       # hard 0/1 labels for the report
print(classification_report(y_test, y_pred))
# ROC AUC is a ranking metric: it must be given the continuous scores, not the
# thresholded labels, otherwise the curve degenerates to a single point.
print(f"AUC: {roc_auc_score(y_test, y_prob.ravel()) * 100:.2f}%")
