# 1. One-Hot Encoding

# 2. Bag-of-Words (BoW)

# 3. TF-IDF
## 3-1. word2idx

In [None]:
import nltk

# `word_tokenize` needs the Punkt tokenizer models. Older NLTK releases load
# them from the `punkt` package, while NLTK >= 3.8.2 looks for `punkt_tab`
# instead, so fetch both to keep the notebook runnable on either version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
from nltk import word_tokenize

# Sanity check: tokenize the `text` entry with index label 0 and inspect it
first_doc = df["text"][0]
words = word_tokenize(first_doc)
print(words)

In [None]:
# Build the vocabulary (token -> integer id) and, in the same pass,
# encode every document as the list of ids of its lowercased tokens
word2idx = {}
tokenized_docs = []

for doc in df["text"]:
    # `setdefault` returns the existing id, or registers the token with the
    # next free id (the current vocabulary size) the first time it is seen
    doc_as_int = [
        word2idx.setdefault(token, len(word2idx))
        for token in word_tokenize(doc.lower())
    ]
    tokenized_docs.append(doc_as_int)

tokenized_docs

### idx2word

In [None]:
# Invert `word2idx` so an integer id can be mapped back to its token
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
idx2word

In [None]:
# N: number of documents, V: vocabulary size (number of distinct tokens)
N, V = len(df['text']), len(word2idx)

N, V

### Term Frequency (TF)
**Term frequency (TF)** means how often a term occurs in a document.

In [None]:
import numpy as np

# Term-frequency matrix: one row per document, one column per vocabulary id
tf = np.zeros((N, V))

# Each `counts` is a view onto one row of `tf`, so incrementing it in place
# fills the matrix; an id that occurs k times in a document ends up as k
for counts, term_ids in zip(tf, tokenized_docs):
    for term_id in term_ids:
        counts[term_id] += 1

tf

### Inverse Document Frequency (IDF)
- **Document frequency (DF)** is the number of documents containing a particular term.
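- **Inverse Document Frequency (IDF)** is a weight that measures how rare a term is across the corpus: $\mathrm{idf}(t) = \log\frac{N}{\mathrm{df}(t)}$, where $N$ is the number of documents. Terms that occur in many documents get a low weight; terms that occur in few documents get a high weight.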

In [None]:
# Document frequency: for every term, the number of documents containing it.
# `axis=0` collapses the rows (documents), leaving one value per column (term).
doc_freq = (tf > 0).sum(axis=0)

# Inverse document frequency: idf = log(N / df)
idf = np.log(N / doc_freq)
idf

In [None]:
### TF-IDF

In [None]:
# TF-IDF: scale every column of term counts by that term's IDF weight
# (`idf` has one entry per column and is broadcast across the rows)
tf_idf = np.multiply(tf, idf)
tf_idf

In [None]:
# Show the 5 highest-scoring TF-IDF terms of one randomly chosen document
np.random.seed(36)  # fixed seed so the same document is picked on every run
i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['label'])
# Only the first line of the text is shown
first_line = row['text'].split("\n", 1)[0]
print("Text:", first_line)
print("Top 5 terms:")

# `argsort` is ascending, so sort the negated scores to get descending order
scores = tf_idf[i]
top_ids = np.argsort(-scores)[:5]
for term_id in top_ids:
    print(idx2word[term_id])

## CountVectorizer

In [None]:
# Raw texts are the model inputs, the `label` column holds the targets
inputs, labels = df["text"], df["label"]

# Plot the label distribution (the trailing `;` hides the returned object in the notebook)
labels.hist(figsize=(10, 5));

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the raw texts and build the document-term count
# matrix in one step (this rebinds `tf` from the manual section above)
vectorizer = CountVectorizer()
tf = vectorizer.fit_transform(inputs)
# Vocabulary terms; position k is the term counted in column k of `tf`
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
# Display the count matrix returned by `CountVectorizer.fit_transform`
tf

In [None]:
# Convert to a dense NumPy array to see the actual counts
print(tf.toarray())

In [None]:
# (number of documents, vocabulary size) - the vocabulary has fewer words
# than the one built with `nltk.word_tokenize()` above
tf.shape

In [None]:
# By default `lowercase=True`, so the capitalized form is not in the vocabulary:
# np.where(words == "India")
# (array([], dtype=int64),)

# By default `token_pattern=r"(?u)\b\w\w+\b"`
# This regex selects tokens of 2 or more alphanumeric characters (punctuation is
# completely ignored and always treated as a token separator), so "$" is absent:
# np.where(words == "$")
# (array([], dtype=int64),)

# Look up the column index of the lowercased token
np.where(words == "india")

In [None]:
# Count stored for vocabulary column 13907 in the first document
# (same as `tf[0][0]` in the above section)
tf.toarray()[0, 13907]

In [None]:
# Recompute IDF from the CountVectorizer counts: idf = log(N / df),
# where df is the number of documents with a non-zero count for the term
dense_tf = tf.toarray()
doc_freq = (dense_tf > 0).sum(axis=0)
idf = np.log(N / doc_freq)
idf

In [None]:
# TF-IDF from the CountVectorizer counts: densify, then scale every
# term column by its IDF weight
tf_idf = np.multiply(tf.toarray(), idf)
tf_idf

In [None]:
# Same shape as the dense count matrix: (documents, vocabulary size)
tf_idf.shape

In [None]:
# TF-IDF score of vocabulary column 13907 in the first document
# (same as `tf_idf[0][0]` in the above section)
tf_idf[0, 13907]

## TfidfTransformer

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Turn the CountVectorizer count matrix `tf` into TF-IDF weights
transformer = TfidfTransformer()
tf_idf = transformer.fit_transform(tf)
# The vocabulary still comes from the `vectorizer` fitted above
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
# Display the matrix returned by `TfidfTransformer.fit_transform`
tf_idf

In [None]:
# The `sklearn` implementation of TF-IDF is different from our manual implementation:
# with the default settings (`smooth_idf=True`, `norm="l2"`) scikit-learn uses
# idf = ln((1 + N) / (1 + df)) + 1 and L2-normalizes each row, whereas the
# manual version uses idf = ln(N / df) with no normalization
print(tf_idf.toarray())

## TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Go from raw texts to TF-IDF weights in a single step
# (this rebinds `vectorizer`, `tf_idf` and `words` from the cells above)
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(inputs)
words = vectorizer.get_feature_names_out()
print(words)

In [None]:
# Display the matrix returned by `TfidfVectorizer.fit_transform`
tf_idf

In [None]:
# Same values as `CountVectorizer()` followed by `TfidfTransformer()`;
# densify to print the actual weights
print(tf_idf.toarray())

## Classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out a test split; the fixed seed keeps the split reproducible
inputs_train, inputs_test, y_train, y_test = train_test_split(
    inputs, labels, random_state=36
)

# Learn the vocabulary from the training texts only, then reuse it
# unchanged to encode the test texts
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(inputs_train)
x_test = vectorizer.transform(inputs_test)

# Multinomial Naive Bayes on the raw term counts (`fit` returns the estimator)
model = MultinomialNB().fit(x_train, y_train)

# Mean accuracy on each split
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)