In [1]:
# Construct a pandas DataFrame using `read()`
import pandas as pd

df = pd.read_csv("./datasets/bbc_text_cls.csv")
df

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business
...,...,...
2220,BT program to beat dialler scams\n\nBT is intr...,tech
2221,Spam e-mails tempt net shoppers\n\nComputer us...,tech
2222,Be careful how you code\n\nA new European dire...,tech
2223,US cyber security chief resigns\n\nThe man mak...,tech


# 1. Vocabulary
Create **Vocabulary** dictionaries:
- **word2idx:** All the unique words as keys with a corresponding unique ID as values.
- **idx2word:** The reverse of word2idx. It has the unique IDs as keys and their corresponding words as values.
## 1-1. word2idx

In [24]:
# Create a set of unique words in the corpus
import random

unique_words = set()
for doc in df["text"]:
    for word in doc.split():
        unique_words.add(word.lower())

unique_words.add('<unk>')
unique_words.add('<pad>')

# Sample 100 words
print(random.sample(tuple(unique_words), 100))

['(morrison', 'airline.', 'flamini,', 'sized', 'hungry,', 'macintosh', 'sing', 'thoroughness,', 'breakpoint', 'journals', 'unit."', 'michels,', 'pr&#233;cis', 'attackers', 'contributing', 'ever', 'art-form', 'semi-final,', 'future,"', '1888', 'tobacco', 'misfit', 'permits.', 'now,', 'emerges', 'ubisense', 'reuniting', 'telcos,', 'synonymous', 'really,', 'reimbursement', 'book,"', 'acclaim,', 'carla', 'non-nato', '2.9%', 'leeds,', 'discounted', 'premiership."', 'reckons', 'cory', 'mid-march,"', 'projects,"', 'hassan,', '$0.99', 'shield".', "contractors'", 'unknowns', 'treasury', 'barred', 'baht', 'communicating,"', '&#163;1.8m', 'paion,', 'scooted', 'emmanuelle', 'labelling', 'growth', "isinbayeva's", 'rods', 'consciously', '7:41.42.', 'exhibitor', 'queried', '40-30', 'random', 'bemoaning', 'vision,', 'marrying', 'periods.', 'centring', 'birkenhead', 'copenhagen', 'invested', 'trial,', 'samsung', 'pleasantly', 'rovers,', '(siac).', 'beckinsale', 'defined,', 'undelivered.', 'fit', 'skipp

In [25]:
# Create a dictionary to map each unique word to an index
import itertools

word2idx = {}
for idx, word in enumerate(unique_words):
    word2idx[word] = idx

# Return first 100 items
print(list(itertools.islice(word2idx.items(), 100)))

[('hinckley,', 0), ('bucking', 1), ('stores."', 2), ('8.6bn', 3), ('12.37', 4), ('osborne', 5), ('egelton,', 6), ('path,', 7), ('pay-out', 8), ('miller.', 9), ('most,', 10), ('struggling', 11), ("sender's", 12), ('activities,', 13), ('satirist', 14), ('3,217', 15), ('(team', 16), ('original.', 17), ('mac,"', 18), ('hughes.', 19), ('a$846m', 20), ('advanced,', 21), ('receivers', 22), ('intervention', 23), ('attack."', 24), ('"audioblogs"', 25), ('esson,', 26), ('assault,', 27), ('layout', 28), ('campaiging', 29), ('agency.', 30), ('kezman', 31), ('vein', 32), ('filmmaker', 33), ("employee's", 34), ('fatboy', 35), ('6,000-strong', 36), ('printers', 37), ('shield', 38), ('ingots', 39), ('leave.', 40), ('aid', 41), ('management."', 42), ('1920s.', 43), ('robber', 44), ('difference"', 45), ('puzzlement', 46), ('"musicians', 47), ('developed,', 48), ('aids,', 49), ('weh', 50), ('75p', 51), ('frisk,', 52), ('villagers', 53), ('lange', 54), ('serena)', 55), ('masayuki', 56), ('uk-controlled', 

In [38]:
len(list(unique_words))

60618

# 2. One-Hot Encoding
The process of one-hot encoding involves **Vocabulary Creation** & **Vector Representation** two steps.

1. Manual Implementation.
2. `pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)`
3. `sklearn.preprocessing.OneHotEncoder(*, categories='auto', drop=None, sparse_output=True, dtype=<class 'numpy.float64'>, handle_unknown='error', min_frequency=None, max_categories=None, feature_name_combiner='concat')`
4. `tf.keras.utils.to_categorical(x, num_classes)`

## 2-1. Manual One-Hot Encoding
Each word is represented as a vector of `0`s and `1`s. The length of the vector is equal to the size of the vocabulary. Position in the vector corresponds to a specific word in the vocabulary. If the word is present in a particular text sample, its corresponding position in the vector is marked as `1`, and all other positions are `0`. This implies that each word is uniquely represented by a binary vector, with only one element being `1`, indicating its presence, and all others being `0`.

In [34]:
# Create one-hot encoded vectors for each word in the corpus
import numpy as np

one_hot_vec = []
for doc in df['text']:
    doc_vec = []
    for word in doc.split():
        vec = np.zeros(len(unique_words))
        vec[word2idx[word.lower()]] = 1
        doc_vec.append(vec)
    one_hot_vec.append(doc_vec)

# One-hot encoded vectors of the first 10 words from the first document
for i in range(10):
    print(one_hot_vec[0][i])

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


In [39]:
print(len(one_hot_vec[0]))

421


In [61]:
print(len(one_hot_vec[0][0]))

60618


## 2-2. One-Hot Encoding with pandas

In [123]:
d = {'text': ["Dog of War", "Cat and Dog Man"], 'labels': ["business", "tech"]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,text,labels
0,Dog of War,business
1,Cat and Dog Man,tech


In [120]:
import pandas as pd

unique_word_series = pd.Series(list(unique_words))
unique_word_series

0      cat
1       of
2      man
3    <unk>
4      dog
5      war
6    <pad>
7      and
dtype: object

In [121]:
unique_word_dummies = pd.get_dummies(unique_word_series)
unique_word_dummies 

Unnamed: 0,<pad>,<unk>,and,cat,dog,man,of,war
0,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,True,False
2,False,False,False,False,False,True,False,False
3,False,True,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False
5,False,False,False,False,False,False,False,True
6,True,False,False,False,False,False,False,False
7,False,False,True,False,False,False,False,False


In [122]:
one_hot_vec = []
for doc in df['text']:
    doc_vec = []
    for word in doc.split():
        doc_vec.append(unique_word_dummies[word.lower()].values)
    one_hot_vec.append(doc_vec)

one_hot_vec

[[array([False, False, False, False,  True, False, False, False]),
  array([False,  True, False, False, False, False, False, False]),
  array([False, False, False, False, False,  True, False, False])],
 [array([ True, False, False, False, False, False, False, False]),
  array([False, False, False, False, False, False, False,  True]),
  array([False, False, False, False,  True, False, False, False]),
  array([False, False,  True, False, False, False, False, False])]]

## 2-3. One-Hot Encoding with scikit-learn

In [111]:
# Pad each document manually
doc_max_length = max(len(t) for t in token_ids)
for t in token_ids:
    t += [word2idx['<pad>']] * (doc_max_length - len(t))

# Check paddings
it = iter(token_ids)
the_len = len(next(it))
if not all(len(l) == the_len for l in it):
    raise ValueError('not all lists have same length!')
else:
    print("All lists have the same length.")

All lists have the same length.


In [130]:
unique_word_df = unique_word_series.to_frame()
unique_word_df 

Unnamed: 0,0
0,cat
1,of
2,man
3,<unk>
4,dog
5,war
6,<pad>
7,and


In [133]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)
# unique_word_encoded = ohe.fit_transform(unique_word_series.values.reshape(-1, 1))
unique_word_encoded = ohe.fit(unique_word_df)
unique_word_encoded

In [134]:
unique_word_encoded.categories_

[array(['<pad>', '<unk>', 'and', 'cat', 'dog', 'man', 'of', 'war'],
       dtype=object)]

In [141]:
one_hot_vec = []
for doc in df['text']:
    doc_vec = []
    for word in doc.lower().split():
        doc_vec.append(ohe.transform(np.array(word).reshape(1, -1)))
    one_hot_vec.append(doc_vec)

one_hot_vec

[[array([[0., 0., 0., 0., 1., 0., 0., 0.]]),
  array([[0., 0., 0., 0., 0., 0., 1., 0.]]),
  array([[0., 0., 0., 0., 0., 0., 0., 1.]])],
 [array([[0., 0., 0., 1., 0., 0., 0., 0.]]),
  array([[0., 0., 1., 0., 0., 0., 0., 0.]]),
  array([[0., 0., 0., 0., 1., 0., 0., 0.]]),
  array([[0., 0., 0., 0., 0., 1., 0., 0.]])]]

## 2-4. One-Hot Encoding with Keras

In [148]:
word2idx

{'cat': 0,
 'of': 1,
 'man': 2,
 '<unk>': 3,
 'dog': 4,
 'war': 5,
 '<pad>': 6,
 'and': 7}

In [145]:
# Split documents to tokens
tokens_docs = [doc.lower().split() for doc in df['text']]

# Convert token lists to token-id lists
token_ids = [[word2idx[token] for token in tokens_doc] for tokens_doc in tokens_docs]
print(token_ids)

[[4, 1, 5], [0, 7, 4, 2]]


In [147]:
import keras

a = keras.utils.to_categorical(list(range(len(unique_words))), num_classes=len(unique_words))
a 

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

In [151]:
one_hot_vec = []
for doc in df['text']:
    doc_vec = []
    for word in doc.lower().split():
        doc_vec.append(a[word2idx[word]])
    one_hot_vec.append(doc_vec)

one_hot_vec

[[array([0., 0., 0., 0., 1., 0., 0., 0.]),
  array([0., 1., 0., 0., 0., 0., 0., 0.]),
  array([0., 0., 0., 0., 0., 1., 0., 0.])],
 [array([1., 0., 0., 0., 0., 0., 0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 1.]),
  array([0., 0., 0., 0., 1., 0., 0., 0.]),
  array([0., 0., 1., 0., 0., 0., 0., 0.])]]

# 2. Bag-of-Words (BoW)
## 2-1. Manual Implementation

## 2-2. CountVectorizer

# 3. TF-IDF
## 3-1. Manual Implementation
### 3-1-1. word2idx

In [None]:
import nltk
nltk.download('punkt')

In [None]:
from nltk import word_tokenize

words = word_tokenize(df["text"][0])
print(words)

In [None]:
# Populate word2idx
# Convert documents into sequences of ints / ids / indices
idx = 0
word2idx = {}
tokenized_docs = []

for doc in df["text"]:
    words = word_tokenize(doc.lower())
    doc_as_int = []
    for word in words:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1
        # Save for later
        doc_as_int.append(word2idx[word])
    tokenized_docs.append(doc_as_int)

tokenized_docs

### 3-1-2. idx2word

In [None]:
# Reverse mapping
idx2word = {v:k for k, v in word2idx.items()}
idx2word

In [None]:
# Number of documents
N = len(df['text'])

# Number of words
V = len(word2idx)

N, V

### 3-1-3. Term Frequency (TF)
**Term frequency (TF)** means how often a term occurs in a document.

In [None]:
import numpy as np

# Instantiate term-frequency matrix
tf = np.zeros((N, V))

# Populate term-frequency counts
for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int:
        tf[i, j] += 1

tf

### 3-1-4. Inverse Document Frequency (IDF)
- **Document frequency (DF)** is the number of documents containing a particular term.
- **Inverse Document Frequency (IDF)** is a weight indicating how commonly a word is used.

In [None]:
# Compute IDF
# `axis=0` is the direction running downward the rows
doc_freq = np.sum(tf > 0, axis=0)
idf = np.log(N / doc_freq)
idf

In [None]:
### TF-IDF

In [None]:
# Compute TF-IDF
tf_idf = tf * idf
tf_idf

In [None]:
# Pick a random document, show the top 5 terms (in terms of `tf_idf` score)
np.random.seed(36)
i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['label'])
print("Text:", row['text'].split("\n", 1)[0])
print("Top 5 terms:")

scores = tf_idf[i]
# Add minus for descending
indices = (-scores).argsort()
for j in indices[:5]:
    print(idx2word[j])

## 3-2. CountVectorizer
Derived term frequencies from `CountVectorizer`.

In [None]:
inputs = df["text"]
labels = df["label"]

labels.hist(figsize=(10, 5));

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
tf = vectorizer.fit_transform(inputs)
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
tf

In [None]:
print(tf.toarray())

In [None]:
# Fewer words than `nltk.word_tokenize()`
tf.shape

In [None]:
# By default `lowercase=True`
# np.where(words == "India")
# (array([], dtype=int64),)

# By default `token_pattern=r”(?u)\b\w\w+\b`
# RegExp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator)
# np.where(words == "$")
# (array([], dtype=int64),)

np.where(words == "india")

In [None]:
# Same as `tf[0][0]` in the above section
tf.toarray()[0][13907]

In [None]:
# Compute IDF
doc_freq = np.sum(tf.toarray() > 0, axis=0)
idf = np.log(N / doc_freq)
idf

In [None]:
# Compute TF-IDF
tf_idf = tf.toarray() * idf
tf_idf

In [None]:
tf_idf.shape

In [None]:
# Same as `tf_idf[0][0]` in the above section
tf_idf[0][13907]

## 3-3. TfidfTransformer

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
tf_idf = transformer.fit_transform(tf)
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
tf_idf

In [None]:
# The `sklearn` implementation of TF-IDF is different from our manual implementation 
print(tf_idf.toarray())

## 3-4. TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(inputs)
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
tf_idf

In [None]:
# Same as `CountVectorizer()` followed by `TfidfTransformer()`
print(tf_idf.toarray())

Perform classification.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

inputs_train, inputs_test, y_train, y_test = train_test_split(inputs, labels, random_state=36)

vectorizer = CountVectorizer()

x_train = vectorizer.fit_transform(inputs_train)
x_test = vectorizer.transform(inputs_test)

model = MultinomialNB()
model.fit(x_train, y_train)

print("Train Score:", model.score(x_train, y_train))
print("Test Score:", model.score(x_test, y_test))

# 4. word2vec
## 4-1. CBOW (Continuous Bag of Words)
## 4-2. Skip-Gram
# 5. GloVe
# 6. FastText
# 7. Gaussian Embedding
# 8. Pointcare Embedding