In [None]:
#GOAL - USE CNN FOR TEXT CLASSIFICATION WITH KERAS' TOKENIZER FOR PREPROCESSING
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [2]:
#READ THE FILE
df = pd.read_csv("Datasets/amazon_reviews_3.csv")
df = df[:5000]

In [3]:
text_features = df['PREPROCESSED_REVIEW_TEXT']
numerical_features = df['VERIFIED_PURCHASE']
labels = df['LABEL_ENCODED']

In [4]:
#TRAIN-TEST SPLIT
X_train_num, X_test_num, X_train_text, X_test_text, y_train, y_test = train_test_split(
    numerical_features,
    text_features,
    labels,
    test_size = 0.2,
    random_state = 42
)

In [5]:
#CONVERT THE TEXT TO SEQUENCES
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_text)

train_sequences = tokenizer.texts_to_sequences(X_train_text)
test_sequences = tokenizer.texts_to_sequences(X_test_text)

max_seq_length = 100 #CAN BE ADJUSTED
vocab_size = len(tokenizer.word_index) + 1

train_sequences = pad_sequences(train_sequences, maxlen=max_seq_length)
test_sequences = pad_sequences(test_sequences, maxlen=max_seq_length)

In [6]:
#CONVERT NUMERICAL FEATURES TO NUMPY ARRAYS [MATCH INPUT SHAPE]
X_train_num = np.array(X_train_num).reshape(-1, 1)
X_test_num = np.array(X_test_num).reshape(-1, 1)

In [7]:
#CONCATENATE THE NUMERICAL  FEATURES WITH PREPROCESSED TEXT SEQUENCES
train_sequences = np.concatenate((train_sequences, X_train_num), axis=1)
test_sequences = np.concatenate((test_sequences, X_test_num), axis=1)

In [9]:
#CREATE THE MODEL: MAX_SEQ_LENGTH + NUMBER OF NUMERICAL FEATURES AS INPUT_LENGTH
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_seq_length + 1))

In [10]:
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(units=64, activation='relu'))

In [11]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_sequences, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb9b74e1330>

In [None]:
model.fit(train_sequences, y_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
 21/420 [>.............................] - ETA: 16s - loss: 1.0943 - accuracy: 0.4762