In [1]:
# What version of Python do you have?
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf

# Environment report: framework versions first, then interpreter and data stack.
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")

# `gpu` is True when TensorFlow lists at least one physical GPU device.
gpu_devices = tf.config.list_physical_devices('GPU')
gpu = len(gpu_devices) > 0
gpu_status = "available" if gpu else "NOT AVAILABLE"
print("GPU is", gpu_status)

Init Plugin
Init Graph Optimizer
Init Kernel
Tensor Flow Version: 2.5.0
Keras Version: 2.5.0

Python 3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33) 
[Clang 13.0.1 ]
Pandas 1.4.4
Scikit-Learn 1.1.2
GPU is available


## Word Embeddings and Classification using Deep Learning (CNN)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

In [3]:
# Load the cleaned train and test CSV files into DataFrames (train first, then test).
train_vec_df, test_vec_df = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/clean_train_data.csv", "Data/clean_test_data.csv")
)

In [9]:
# Select the `text` column of the training frame as the corpus to tokenize.
text = train_vec_df["text"]

In [11]:
# Build the vocabulary: fit a default Keras Tokenizer on the training text column.
# NOTE(review): the word index printed further down is headed by 't', 'co' and
# 'http', which look like URL fragments — confirm the cleaning step strips links.
token = Tokenizer()
token.fit_on_texts(texts=text)

In [12]:
# Vocabulary size = number of indexed words + 1 (the extra slot follows the Keras
# convention; word ids in `word_index` start at 1).
vocab_size = 1 + len(token.word_index)
print(vocab_size)

22701


In [19]:
import itertools

# Peek at the first 20 (word, id) entries of the fitted word index.
first_entries = itertools.islice(token.word_index.items(), 20)
dict(first_entries)

{'t': 1,
 'co': 2,
 'http': 3,
 'the': 4,
 'a': 5,
 'in': 6,
 'to': 7,
 'of': 8,
 'and': 9,
 'i': 10,
 'is': 11,
 'for': 12,
 'on': 13,
 'you': 14,
 'my': 15,
 'with': 16,
 'that': 17,
 'it': 18,
 'at': 19,
 'by': 20}

In [20]:
# Convert each text into its list of integer word ids, then show the first ten.
encoded_text = token.texts_to_sequences(texts=text)
encoded_preview = encoded_text[:10]
print(encoded_preview)

[[119, 4633, 24, 4, 868, 8, 21, 263, 138, 1619, 4634, 89, 40], [189, 45, 229, 799, 6954, 6955, 1404], [40, 1751, 1620, 7, 6956, 6, 6957, 24, 136, 6958, 20, 1752, 39, 441, 256, 57, 2158, 6, 714, 1405, 24, 1106], [835, 2921, 59, 4635, 1500, 256, 1405, 6, 96], [34, 100, 1221, 21, 320, 22, 6959, 2159, 30, 271, 22, 1500, 6960, 69, 5, 187], [2922, 378, 96, 1501, 800, 869, 6, 665, 6961, 563, 7, 1159, 399, 45, 4636, 1500], [218, 76, 870, 295, 1222, 836, 264, 8, 1753, 6, 6962, 1047, 2453, 1502], [46, 13, 230, 8, 4, 1933, 9, 10, 74, 110, 5, 45, 6, 4, 4637], [422, 43, 75, 256, 1304, 51, 6, 4, 594, 871, 4, 769], [46, 2454, 17, 4, 469, 11, 250, 7, 119, 278]]


In [21]:
# Show the raw text stored under index label 0, for comparison with its encoding.
text.loc[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [23]:
# Pad every encoded sequence with trailing zeros to a fixed length of 40, the
# maximum word count (refer to the word_count plot in the other notebook).
# NOTE(review): sequences longer than max_length are cut from the front by
# default (truncating='pre') — confirm that is intended alongside padding='post'.
max_length = 40
X = pad_sequences(sequences=encoded_text, maxlen=max_length, padding='post')
print(X.shape, X, sep="\n")

(7613, 40)
[[ 119 4633   24 ...    0    0    0]
 [ 189   45  229 ...    0    0    0]
 [  40 1751 1620 ...    0    0    0]
 ...
 [2824 2401  709 ...    0    0    0]
 [  78 1145   41 ...    0    0    0]
 [   4  209   54 ...    0    0    0]]
