<a href="https://colab.research.google.com/github/tauqueerdanish/Working_With_Text_Data/blob/main/Working_with_text_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **One Hot Encoding**

In [1]:
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential

In [2]:
import os
import shutil
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Word level one hot encoding

samples = ["The cat sat on the mat.", "The dog ate my homework."]
token_index = {}
for sample in samples:
  for word in sample.split():
    if word not in token_index:
      token_index[word] = len(token_index) + 1

max_length = 10
results = np.zeros(shape = (len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length]:
    index = token_index.get(word)
    results[i, j, index] = 1.
print(results)

[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


In [4]:
# Character level one hot encoding
import string

samples = ["The cat sat on the mat.", "The dog ate my homework."]
characters = string.printable
token_index = dict(zip(range(1, len(characters)+1), characters))
i2 = []
j2 = []
sample2 = []
character2 = []
index2 = []
max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.keys()) + 1))
for i, sample in enumerate(samples):
  for j, character in enumerate(sample):
    i2.append(i)
    sample2.append(sample)
    j2.append(j)
    character2.append(character)
    index = token_index.get(character)
    index2.append(index)
    results[i, j, index] = 1.

print(i2)
print("-------------------------------------------------")
print(j2)
print("-------------------------------------------------")
print(sample2)
print("-------------------------------------------------")
print(character2)
print("-------------------------------------------------")
print(index2)
print("-------------------------------------------------")
print(results)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
-------------------------------------------------
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
-------------------------------------------------
['The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the mat.', 'The cat sat on the m

In [5]:
# Use kears for word level one hot encoding
from tensorflow.keras.preprocessing.text import Tokenizer

samples = ["The cat sat on the mat.", "The dog ate my homework."]
tokenizer = Tokenizer(num_words = 1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples, mode="binary")
word_index = tokenizer.word_index
print("Found %s uninque tokens." %len(word_index))

Found 9 uninque tokens.


In [6]:
# Word level one hot encoding with hashing trick
samples = ["The cat sat on the mat.", "The dog ate my homework."]
dimensionality = 1000
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length]:
    index = abs(hash(word)) % dimensionality
    results[i, j, index] = 1.

In [7]:
results

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [8]:
#Word embedding (no of possible tokens, dimensionality of embeddings)
embedding_layer = Embedding(1000, 64)

In [9]:
#Loading imdb dataset for working with word embedding

max_features = 10000
maxlen = 20
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = max_features)

#Turns the list of integers into 2d integer tensor of shape(samples, maxlen)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [10]:
#Using and embedding layer and classifier on the IMDB dataset
model = Sequential()
model.add(Embedding(max_features, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation="sigmoid"))

model.compile(
    optimizer ="rmsprop",
    loss = "binary_crossentropy",
    metrics = ["acc"]
)

model.summary()


history = model.fit(
    x_train, y_train,
    epochs = 10,
    batch_size = 32,
    validation_split = 0.2
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 8)             80000     
                                                                 
 flatten (Flatten)           (None, 160)               0         
                                                                 
 dense (Dense)               (None, 1)                 161       
                                                                 
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
#Using prerained word embeddings

In [19]:
# First we will extract the folder
#shutil.unpack_archive("/content/drive/MyDrive/aclImdb.zip", "/content/drive/MyDrive/aclImdb_1")

In [20]:
imdb_dir = "/content/drive/MyDrive/aclImdb_1/aclImdb"
train_dir = os.path.join(imdb_dir,"train")

labels = []
texts = []

for label_type in ["neg","pos"]:
  dir_name = os.path.join(train_dir, label_type)
  for fname in os.listdir(dir_name):
    if fname[-4:] == ".txt":
        f = open(os.path.join(dir_name, fname))
        texts.append(f.read())
        f.close()
        if label_type == "neg":
            labels.append(0)
        else:
            labels.append(1)


In [21]:
#We use only first 200 samples because we are using the pretrained embedding

#cutoff the reviews after 100 words
maxlen = 100
training_samples = 200   #take only 200 samples
validation_samples = 10000   #validate on 10,000 samples
max_words = 10000       #considers only the 10,000 top words in the dataset

tokenizer = Tokenizer(num_words= max_words)
tokenizer.fit_on_texts(texts)
sequence = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Fount %s unique tokens."%len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print("Shape of data tensor:", data.shape)
print("shape of label tensor:", labels.shape)

#Now, we should have tosplit the data into training and validation set. so, first we will shuffle the data because 
# data is ordered and negative are on first and positive are on second number.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = data[: training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = data[training_samples: training_samples + validation_samples]

Fount 88582 unique tokens.
Shape of data tensor: (2, 100)
shape of label tensor: (25000,)


In [None]:
# shutil.unpack_archive("/content/drive/MyDrive/glove.6B.zip", "/content/drive/MyDrive/glove.6B")

In [22]:
glove_dir = "/content/drive/MyDrive/glove.6B"

embeddings_index = {}

f=open(os.path.join(glove_dir, "glove.6B.100d.txt"))
for line in f:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype="float32")
  embeddings_index[word] = coefs
f.close()
print("Found %s word vectors." %len(embeddings_index))

Found 400000 word vectors.


In [23]:
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
  if i<max_words:
    embedding_vector= embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector