In [1]:
# Construct a pandas DataFrame using `read()`
import pandas as pd

df = pd.read_csv("./datasets/bbc_text_cls.csv")
df

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business
...,...,...
2220,BT program to beat dialler scams\n\nBT is intr...,tech
2221,Spam e-mails tempt net shoppers\n\nComputer us...,tech
2222,Be careful how you code\n\nA new European dire...,tech
2223,US cyber security chief resigns\n\nThe man mak...,tech


# 1. One-Hot Encoding
2. `pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)`
3. `sklearn.preprocessing.OneHotEncoder(*, categories='auto', drop=None, sparse_output=True, dtype=<class 'numpy.float64'>, handle_unknown='error', min_frequency=None, max_categories=None, feature_name_combiner='concat')`

## 1-1. Manual Implementation
Create **Vocabulary** dictionaries:
- **word2idx:** All the unique words as keys with a corresponding unique ID as values.
- **idx2word:** The reverse of word2idx. It has the unique IDs as keys and their corresponding words as values. 

### 1-1-1. word2idx

In [3]:
unique_words = set()
for doc in df["text"]:
    for word in doc.split():
        unique_words.add(word.lower())
print(unique_words)



In [5]:
word2idx = {}
for idx, word in enumerate(unique_words):
    word2idx[word] = idx

word2idx

{'diseases,': 0,
 'treatment."': 1,
 'persecution.': 2,
 'inadvertently': 3,
 'arrival': 4,
 'bridge,': 5,
 'broadcaster".': 6,
 'cautioned.': 7,
 'sorry.': 8,
 "inc's": 9,
 'stocks.': 10,
 'infringes': 11,
 'tougher': 12,
 "visitors'": 13,
 't-online,': 14,
 'crime-fighting': 15,
 '"morally': 16,
 'torvalds': 17,
 '$39.20': 18,
 'fight,': 19,
 'hurts."': 20,
 'privacy': 21,
 '50p': 22,
 'micha': 23,
 'spectrum.': 24,
 "greenspan's": 25,
 'parent,': 26,
 'proposals",': 27,
 'well': 28,
 '47-year-old': 29,
 'laxton': 30,
 '6-2.': 31,
 'garden': 32,
 'pan-european': 33,
 '"directing"': 34,
 'optical-disc': 35,
 'csi,"': 36,
 'pumps': 37,
 'telcos,': 38,
 '"ragtime",': 39,
 '247p,': 40,
 '1,200': 41,
 'brit': 42,
 'should."': 43,
 'mature.': 44,
 'crude': 45,
 'sisters,': 46,
 'sequel.': 47,
 'notification': 48,
 'projections.': 49,
 'tb,': 50,
 'big-ticket': 51,
 'winterbottom.': 52,
 'collect': 53,
 'old-timers': 54,
 'internationally,': 55,
 'medium-sized': 56,
 'quick-fire,': 57,
 'wa

In [8]:
import numpy as np

one_hot_vec = []
for doc in df['text']:
    doc_vec = []
    for word in doc.split():
        vec = np.zeros(len(unique_words))
        vec[word2idx[word.lower()]] = 1
        doc_vec.append(vec)
    one_hot_vec.append(doc_vec)

print("One-hot encoded vectors for the first document:")
for vec in one_hot_vec[0]:
    print(vec)

One-hot encoded vectors for the first sentence:
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0.

# 2. Bag-of-Words (BoW)
## 2-1. Manual Implementation
## 2-2. CountVectorizer

# 3. TF-IDF
## 3-1. Manual Implementation
### 3-1-1. word2idx

In [None]:
import nltk
nltk.download('punkt')

In [None]:
from nltk import word_tokenize

words = word_tokenize(df["text"][0])
print(words)

In [None]:
# Populate word2idx
# Convert documents into sequences of ints / ids / indices
idx = 0
word2idx = {}
tokenized_docs = []

for doc in df["text"]:
    words = word_tokenize(doc.lower())
    doc_as_int = []
    for word in words:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1
        # Save for later
        doc_as_int.append(word2idx[word])
    tokenized_docs.append(doc_as_int)

tokenized_docs

### 3-1-2. idx2word

In [None]:
# Reverse mapping
idx2word = {v:k for k, v in word2idx.items()}
idx2word

In [None]:
# Number of documents
N = len(df['text'])

# Number of words
V = len(word2idx)

N, V

### 3-1-3. Term Frequency (TF)
**Term frequency (TF)** means how often a term occurs in a document.

In [None]:
import numpy as np

# Instantiate term-frequency matrix
tf = np.zeros((N, V))

# Populate term-frequency counts
for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int:
        tf[i, j] += 1

tf

### 3-1-4. Inverse Document Frequency (IDF)
- **Document frequency (DF)** is the number of documents containing a particular term.
- **Inverse Document Frequency (IDF)** is a weight indicating how commonly a word is used.

In [None]:
# Compute IDF
# `axis=0` is the direction running downward the rows
doc_freq = np.sum(tf > 0, axis=0)
idf = np.log(N / doc_freq)
idf

In [None]:
### TF-IDF

In [None]:
# Compute TF-IDF
tf_idf = tf * idf
tf_idf

In [None]:
# Pick a random document, show the top 5 terms (in terms of `tf_idf` score)
np.random.seed(36)
i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['label'])
print("Text:", row['text'].split("\n", 1)[0])
print("Top 5 terms:")

scores = tf_idf[i]
# Add minus for descending
indices = (-scores).argsort()
for j in indices[:5]:
    print(idx2word[j])

## 3-2. CountVectorizer
Derived term frequencies from `CountVectorizer`.

In [None]:
inputs = df["text"]
labels = df["label"]

labels.hist(figsize=(10, 5));

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
tf = vectorizer.fit_transform(inputs)
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
tf

In [None]:
print(tf.toarray())

In [None]:
# Fewer words than `nltk.word_tokenize()`
tf.shape

In [None]:
# By default `lowercase=True`
# np.where(words == "India")
# (array([], dtype=int64),)

# By default `token_pattern=r”(?u)\b\w\w+\b`
# RegExp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator)
# np.where(words == "$")
# (array([], dtype=int64),)

np.where(words == "india")

In [None]:
# Same as `tf[0][0]` in the above section
tf.toarray()[0][13907]

In [None]:
# Compute IDF
doc_freq = np.sum(tf.toarray() > 0, axis=0)
idf = np.log(N / doc_freq)
idf

In [None]:
# Compute TF-IDF
tf_idf = tf.toarray() * idf
tf_idf

In [None]:
tf_idf.shape

In [None]:
# Same as `tf_idf[0][0]` in the above section
tf_idf[0][13907]

## 3-3. TfidfTransformer

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
tf_idf = transformer.fit_transform(tf)
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
tf_idf

In [None]:
# The `sklearn` implementation of TF-IDF is different from our manual implementation 
print(tf_idf.toarray())

## 3-4. TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(inputs)
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
tf_idf

In [None]:
# Same as `CountVectorizer()` followed by `TfidfTransformer()`
print(tf_idf.toarray())

Perform classification.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

inputs_train, inputs_test, y_train, y_test = train_test_split(inputs, labels, random_state=36)

vectorizer = CountVectorizer()

x_train = vectorizer.fit_transform(inputs_train)
x_test = vectorizer.transform(inputs_test)

model = MultinomialNB()
model.fit(x_train, y_train)

print("Train Score:", model.score(x_train, y_train))
print("Test Score:", model.score(x_test, y_test))

# 4. word2vec
## 4-1. CBOW (Continuous Bag of Words)
## 4-2. Skip-Gram
# 5. GloVe
# 6. FastText
# 7. Gaussian Embedding
# 8. Pointcare Embedding