In [1]:
import pandas as pd
import pickle
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
import numpy as np

In [None]:
tf.lookup.

In [None]:
tfds.load

In [24]:
TRAIN_SIZE = 0.8
SEQUENCE_LENGTH = 300
EPOCHS = 10
BATCH_SIZE = 1024
W2V_SIZE = 300

TRAIN_DS_CSV_PATH = 'cleaned_train_ds.csv'
TEST_DS_CSV_PATH = 'cleaned_test_ds.csv'
W2V_PKL_PATH = 'w2v_dict.pkl'

In [3]:
train_df = pd.read_csv(TRAIN_DS_CSV_PATH).fillna("")
test_df = pd.read_csv(TEST_DS_CSV_PATH).fillna("")
w2v_dict = pickle.load(open(W2V_PKL_PATH,'rb'))

In [4]:
%%time
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_df.text)
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 288473
CPU times: user 13.7 s, sys: 96.5 ms, total: 13.8 s
Wall time: 13.9 s


In [8]:
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in w2v_dict.keys():
        embedding_matrix[i] = w2v_dict[word]
print(embedding_matrix.shape)

(288473, 300)


In [5]:
%%time
x_train = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train_df.text), maxlen=SEQUENCE_LENGTH)
x_test = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test_df.text), maxlen=SEQUENCE_LENGTH)

CPU times: user 12.7 s, sys: 283 ms, total: 12.9 s
Wall time: 13 s


In [6]:
y_train = train_df.target.to_numpy().reshape(-1,1) / 4
y_test = test_df.target.to_numpy().reshape(-1,1) / 4

In [19]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (1600000, 300)
y_train (1600000, 1)

x_test (498, 300)
y_test (498, 1)


In [18]:
train_data = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(x_train), tf.data.Dataset.from_tensor_slices(y_train)))
test_data = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(x_test), tf.data.Dataset.from_tensor_slices(y_test)))

train_data = train_data.shuffle(buffer_size=len(x_train)).batch(BATCH_SIZE).repeat()
test_data = test_data.batch(BATCH_SIZE)

In [20]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(SEQUENCE_LENGTH,)),
    tf.keras.layers.Embedding(vocab_size,W2V_SIZE,weights=[embedding_matrix],trainable=False),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [21]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 300)          86541900  
                                                                 
 conv1d_1 (Conv1D)           (None, 300, 128)          115328    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 150, 128)         0         
 1D)                                                             
                                                                 
 lstm_1 (LSTM)               (None, 100)               91600     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                      

In [22]:
try:
    import tensorflow_addons as tfa
    model.compile(loss='binary_crossentropy',
                  optimizer=tfa.optimizers.AdamW(weight_decay=1e-4),
                  # optimizer="adam",
                  metrics=['accuracy'])
except ImportError:
    model.compile(loss='binary_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])

In [30]:
callbacks = [
    # tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
    # tf.keras.callbacks.EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5),
    # tf.keras.callbacks.ModelCheckpoint(filepath='weights.best.hdf5',save_weights_only=True,monitor='val_accuracy',mode='max',save_best_only=True)
]

In [26]:
%%time
history = model.fit(train_data,
                    steps_per_epoch=500,
                    epochs=EPOCHS,
                    # batch_size=BATCH_SIZE,
                    # epochs=EPOCHS,
                    # validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

Epoch 1/10


2022-03-17 11:49:00.470107: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-17 11:49:02.450694: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-17 11:49:04.014103: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 7min 37s, sys: 4min 51s, total: 12min 29s
Wall time: 50min 14s


In [27]:
%%time
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])

2022-03-17 16:04:50.015779: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-17 16:04:50.102620: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.



ACCURACY: 0.6004016399383545
LOSS: 0.6152105331420898
CPU times: user 421 ms, sys: 346 ms, total: 767 ms
Wall time: 1.01 s
