In [1]:
import numpy as np

In [2]:
# Toy corpus: nine short phrases that every vectorization example in this
# notebook is fitted on.
text_data = [
    "Well done",
    "What are u doing",
    "Good Night",
    "Looking like a woaw",
    "Arabian Nights",
    "Gadar 2",
    "India India",
    "Bharat Mata ki Jai",
    "Jai Shri Ram",
]

## Two Techniques of Vectorization

### Count Vectorization

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Count vectorizer built with no arguments, i.e. every parameter keeps its
# default value (the effective settings are printed by get_params() below).
vector = CountVectorizer()

In [5]:
# Learn the vocabulary from text_data and encode each phrase as one row of
# per-term counts.
# NOTE: with the default token_pattern (two or more word characters) the
# single-character tokens 'u', 'a' and '2' never enter the vocabulary.
X = vector.fit_transform(text_data)

In [6]:
# Vocabulary learned during fitting; the position of a term in this array is
# its column index in X.
vector.get_feature_names_out()

array(['arabian', 'are', 'bharat', 'doing', 'done', 'gadar', 'good',
       'india', 'jai', 'ki', 'like', 'looking', 'mata', 'night', 'nights',
       'ram', 'shri', 'well', 'what', 'woaw'], dtype=object)

In [7]:
# Inspect the vectorizer's configuration. Nothing was passed to the
# constructor, so these are the default settings (note lowercase=True and the
# token_pattern that requires at least two characters per token).
vector.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [8]:
# Show the vocabulary, then the count matrix as a dense array: one row per
# phrase in text_data, one column per vocabulary term. A repeated word yields
# a count above 1 (e.g. 'India India' -> 2 in the 'india' column).
print(vector.get_feature_names_out())
X.toarray()

['arabian' 'are' 'bharat' 'doing' 'done' 'gadar' 'good' 'india' 'jai' 'ki'
 'like' 'looking' 'mata' 'night' 'nights' 'ram' 'shri' 'well' 'what'
 'woaw']


array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]])

### Integer Encoding - Used in Deep Learning

In [9]:
from keras.preprocessing.text import Tokenizer

2023-11-09 23:16:30.380026: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-09 23:16:30.463406: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-09 23:16:30.464545: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Keras tokenizer created with default settings; it is fitted on the corpus
# in the next cell.
tokenizer = Tokenizer()

In [11]:
# Fit the tokenizer on the corpus; this populates tokenizer.word_index and
# tokenizer.word_counts, which are inspected in the following cells.
tokenizer.fit_on_texts(text_data)

In [13]:
# Word -> integer id mapping. Ids start at 1 and the words that occur twice
# ('india', 'jai') receive the lowest ids. Unlike the CountVectorizer
# vocabulary above, single-character tokens ('u', 'a', '2') are kept.
tokenizer.word_index

{'india': 1,
 'jai': 2,
 'well': 3,
 'done': 4,
 'what': 5,
 'are': 6,
 'u': 7,
 'doing': 8,
 'good': 9,
 'night': 10,
 'looking': 11,
 'like': 12,
 'a': 13,
 'woaw': 14,
 'arabian': 15,
 'nights': 16,
 'gadar': 17,
 '2': 18,
 'bharat': 19,
 'mata': 20,
 'ki': 21,
 'shri': 22,
 'ram': 23}

In [15]:
# Occurrence count of each word across the whole corpus, listed in order of
# first appearance.
tokenizer.word_counts

OrderedDict([('well', 1),
             ('done', 1),
             ('what', 1),
             ('are', 1),
             ('u', 1),
             ('doing', 1),
             ('good', 1),
             ('night', 1),
             ('looking', 1),
             ('like', 1),
             ('a', 1),
             ('woaw', 1),
             ('arabian', 1),
             ('nights', 1),
             ('gadar', 1),
             ('2', 1),
             ('india', 2),
             ('bharat', 1),
             ('mata', 1),
             ('ki', 1),
             ('jai', 2),
             ('shri', 1),
             ('ram', 1)])

In [20]:
# Replace every word of each phrase with its id from tokenizer.word_index,
# producing one list of ids per phrase.
text_sequence = tokenizer.texts_to_sequences(text_data)

In [21]:
# One list of word ids per phrase; the lists have different lengths
# (2 to 4 ids), which is why they are padded afterwards.
text_sequence

[[3, 4],
 [5, 6, 7, 8],
 [9, 10],
 [11, 12, 13, 14],
 [15, 16],
 [17, 18],
 [1, 1],
 [19, 20, 21, 2],
 [2, 22, 23]]

In [22]:
# Padding the sequence
from keras.utils import pad_sequences

In [23]:
# Zero-pad at the end ('post') so that all id sequences share one length.
sequences = pad_sequences(
    sequences=text_sequence,
    padding='post',
)

In [24]:
# Padded result: a 9 x 4 integer array in which phrases shorter than four
# words are filled with trailing zeros.
sequences

array([[ 3,  4,  0,  0],
       [ 5,  6,  7,  8],
       [ 9, 10,  0,  0],
       [11, 12, 13, 14],
       [15, 16,  0,  0],
       [17, 18,  0,  0],
       [ 1,  1,  0,  0],
       [19, 20, 21,  2],
       [ 2, 22, 23,  0]], dtype=int32)