In [1]:
# What version of Python do you have?
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf

# Environment report: framework versions first, then (after a blank line)
# the interpreter and the data-science libraries, one entry per line.
print(
    f"Tensor Flow Version: {tf.__version__}",
    f"Keras Version: {tensorflow.keras.__version__}",
    "",
    f"Python {sys.version}",
    f"Pandas {pd.__version__}",
    f"Scikit-Learn {sk.__version__}",
    sep="\n",
)

# A non-empty list of physical GPU devices means TensorFlow can see a GPU.
gpu = bool(tf.config.list_physical_devices('GPU'))
print("GPU is", "available" if gpu else "NOT AVAILABLE")

Init Plugin
Init Graph Optimizer
Init Kernel
Tensor Flow Version: 2.5.0
Keras Version: 2.5.0

Python 3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33) 
[Clang 13.0.1 ]
Pandas 1.4.4
Scikit-Learn 1.1.2
GPU is available


In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Word Embeddings and Classification using Deep Learning (CNN)

### Data preparation

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

In [3]:
# Load the cleaned train and test CSV files (paths are relative to the
# notebook's working directory).
train_vec_df, test_vec_df = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/clean_train_data.csv", "Data/clean_test_data.csv")
)

In [9]:
# Raw documents to tokenize: the `text` column of the training frame.
text = train_vec_df["text"]

In [26]:
# Tokenizing: fit a Keras Tokenizer on the training text so that it builds
# its word -> integer index vocabulary (exposed as `token.word_index`).
token = Tokenizer()
token.fit_on_texts(texts=text)

# Text encoding: turn each document into a list of integer word indices.
encoded_text = token.texts_to_sequences(texts=text)

In [27]:
# Sanity check: show the first ten encoded documents (lists of integer word indices).
print(encoded_text[:10])

[[119, 4633, 24, 4, 868, 8, 21, 263, 138, 1619, 4634, 89, 40], [189, 45, 229, 799, 6954, 6955, 1404], [40, 1751, 1620, 7, 6956, 6, 6957, 24, 136, 6958, 20, 1752, 39, 441, 256, 57, 2158, 6, 714, 1405, 24, 1106], [835, 2921, 59, 4635, 1500, 256, 1405, 6, 96], [34, 100, 1221, 21, 320, 22, 6959, 2159, 30, 271, 22, 1500, 6960, 69, 5, 187], [2922, 378, 96, 1501, 800, 869, 6, 665, 6961, 563, 7, 1159, 399, 45, 4636, 1500], [218, 76, 870, 295, 1222, 836, 264, 8, 1753, 6, 6962, 1047, 2453, 1502], [46, 13, 230, 8, 4, 1933, 9, 10, 74, 110, 5, 45, 6, 4, 4637], [422, 43, 75, 256, 1304, 51, 6, 4, 594, 871, 4, 769], [46, 2454, 17, 4, 469, 11, 250, 7, 119, 278]]


In [28]:
# Fixed number of tokens per document fed to the model.
max_length = 40
# The Keras Tokenizer never assigns index 0 to a word (0 is what the padded
# positions hold), so the embedding needs one more row than there are
# distinct words.
vocab_size = 1 + len(token.word_index)

In [29]:
# Bring every encoded document (a list of word indices) to exactly
# `max_length` = 40 entries: shorter ones get trailing zeros (padding='post'),
# longer ones are cut down to `max_length` by `pad_sequences`.
# The value 40 is the maximum word count (see the word_count plot in the
# other notebook).
X = pad_sequences(sequences=encoded_text, maxlen=max_length, padding='post')
print(X.shape, X, sep="\n")

(7613, 40)
[[ 119 4633   24 ...    0    0    0]
 [ 189   45  229 ...    0    0    0]
 [  40 1751 1620 ...    0    0    0]
 ...
 [2824 2401  709 ...    0    0    0]
 [  78 1145   41 ...    0    0    0]
 [   4  209   54 ...    0    0    0]]


In [33]:
# Labels: the `target` column of the training frame, aligned row-for-row with X.
y = train_vec_df["target"]

### Model training

In [30]:
# Dimensionality of each learned word vector.
vec_size = 100

# 1-D CNN text classifier, declared as one ordered list of layers.
model = Sequential([
    # Look up a trainable `vec_size`-dimensional vector for every word index.
    Embedding(input_dim=vocab_size, output_dim=vec_size, input_length=max_length),

    # Convolve over pairs of neighbouring tokens, then halve the sequence length.
    Conv1D(filters=32, kernel_size=2, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(rate=0.5),

    # These Dense layers come before the global pooling, so they receive a
    # 3-D (batch, steps, features) input and transform only the last axis.
    Dense(units=32, activation='relu'),
    Dropout(rate=0.5),
    Dense(units=16, activation='relu'),

    # Collapse the remaining sequence axis by taking the max of each feature.
    GlobalMaxPooling1D(),

    # Single sigmoid unit for the binary target.
    Dense(units=1, activation='sigmoid'),
])

Metal device set to: Apple M1


2022-11-06 21:41:57.549928: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-06 21:41:57.550693: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [32]:
# Print the layer-by-layer output shapes and parameter counts of the network.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 100)           2270100   
_________________________________________________________________
conv1d (Conv1D)              (None, 39, 32)            6432      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 19, 32)            0         
_________________________________________________________________
dropout (Dropout)            (None, 19, 32)            0         
_________________________________________________________________
dense (Dense)                (None, 19, 32)            1056      
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 32)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 19, 16)            5

In [37]:
# Split the padded sequences and their labels 80/20 into a training part and
# a held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,      # keep the class proportions of `y` in both parts
    random_state=0,  # fixed seed so the same split is produced on every run
)

In [41]:
%%time

model.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
  1/191 [..............................] - ETA: 1:14 - loss: 0.1425 - accuracy: 0.9688

2022-11-06 21:53:14.159455: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-11-06 21:53:18.280226: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1min 5s, sys: 31.1 s, total: 1min 36s
Wall time: 45.3 s


<tensorflow.python.keras.callbacks.History at 0x29262fdc0>