In [7]:
#GOAL - USE CNN FOR TEXT CLASSIFICATION WITH KERAS' TOKENIZER FOR PREPROCESSING
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [3]:
#READ THE FILE
df = pd.read_csv("Datasets/amazon_reviews_3.csv")

In [6]:
#USE THE COLUMN WITH ALREADY LEMMATIZED & CLEANED TEXT
text_features = df['PREPROCESSED_REVIEW_TEXT']
labels = df['LABEL_ENCODED']

In [8]:
#SPLIT INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(
    text_features, labels,
    test_size=0.2,
    random_state = 42
)

In [9]:
#CONVERT THE TEXTS TO SEQUENCES
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

In [10]:
max_seq_length = 100 #CAN BE ADJUSTED

In [11]:
train_sequences = pad_sequences(train_sequences, maxlen=max_seq_length)
test_sequences = pad_sequences(test_sequences, maxlen=max_seq_length)

In [13]:
vocab_size = len(tokenizer.word_index) + 1

In [14]:
#CREATE THE MODEL
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_seq_length))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [15]:
#COMPILE THE MODEL
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
#TRAIN THE MODEL
model.fit(train_sequences, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff3ccb74cd0>

In [17]:
loss, accuracy = model.evaluate(test_sequences, y_test)



In [18]:
print(loss, accuracy)

1.7193689346313477 0.607619047164917
