<a href="https://colab.research.google.com/github/udupa-varun/pyimagesearch_uni/blob/main/nlp/101/bag_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com/bag-of-word/bag-of-word.zip
!unzip -qq bag-of-word.zip
%cd bag-of-word

--2024-01-12 22:27:22--  https://pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com/bag-of-word/bag-of-word.zip
Resolving pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com (pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com)... 52.218.229.105, 3.5.81.129, 52.92.248.122, ...
Connecting to pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com (pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com)|52.218.229.105|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88412 (86K) [binary/octet-stream]
Saving to: ‘bag-of-word.zip’


2024-01-12 22:27:23 (1005 KB/s) - ‘bag-of-word.zip’ saved [88412/88412]

/content/bag-of-word


In [2]:
import re

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd

### Configuration of the architecture

In [3]:
class Config:
    # data to be used
    data_dict = {
        "sentence": [
            "Avengers is a great movie.",
            "I love Avengers it is great.",
            "Avengers is a bad movie.",
            "I hate Avengers.",
            "I didnt like the Avengers movie.",
            "I think Avengers is a bad movie.",
            "I love the movie.",
            "I think it is great."
        ],
        "sentiment": [
            "good",
            "good",
            "bad",
            "bad",
            "bad",
            "bad",
            "good",
            "good"
        ]
    }

    # list of stopwords
    stop_words = ["is", "a", "i", "it"]

    # define model training parameters
    epochs = 30
    batch_size = 10

    # define number of dense units
    dense_units = 50

config = Config()

### Preprocess data

In [4]:
def preprocess(sent_df, stop_words, key="sentence"):
    # loop over sentences
    for num in range(len(sent_df[key])):
        sentence = sent_df[key][num]
        sentence = re.sub(
            r"[^a-zA-z0-9]", " ", sentence.lower()
        ).split()

        # define a list for processed words
        new_words = list()

        # loop over words in each sentence
        # filter out the stop words
        for word in sentence:
            if word not in stop_words:
                new_words.append(word)

        # replace sentence with list of new words
        sent_df[key][num] = new_words

    return sent_df


def prepare_tokenizer(df, sent_key="sentence", output_key="sentiment"):
    # counters for tokenizer indices
    word_counter = 0
    label_counter = 0

    # placeholder for tokenizer
    text_dict = dict()
    label_dict = dict()

    # loop over the sentences
    for entry in df[sent_key]:
        # loop over each word and check if encountered before
        for word in entry:
            if word not in text_dict.keys():
                text_dict[word] = word_counter
                word_counter += 1

    # repeat for labels
    for label in df[output_key]:
        if label not in label_dict.keys():
            label_dict[label] = label_counter
            label_counter += 1

    return (text_dict, label_dict)

### Function to calculate bag of words

In [5]:
def calculate_bag_of_words(text, sentence):
    # create a dict for frequency check
    freq_dict = dict.fromkeys(text, 0)

    # loop over words in sentences
    for word in sentence:
        freq_dict[word] = sentence.count(word)

    return freq_dict

### Build bag of words model

In [6]:
def build_shallow_net():
    # define model
    model = Sequential()
    model.add(Dense(config.dense_units, input_dim=10, activation="relu"))
    model.add(Dense(config.dense_units, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))

    # compile model
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    return model

### Build tensorflow wrapper

In [7]:
def tensorflow_wrap(df):
    # create tokenizerr for sentences
    tokenizer_sentence = Tokenizer()

    # create tokenizer for labels
    tokenizer_labels = Tokenizer()

    # fit tokenizer on documents
    tokenizer_sentence.fit_on_texts(df["sentence"])

    # fit tokenizer on labels
    tokenizer_labels.fit_on_texts(df["sentiment"])

    # create vectors using tensorflow
    encoded_data = tokenizer_sentence.texts_to_matrix(
        texts=df["sentence"], mode="count"
    )

    # add label column
    labels = df["sentiment"]

    # correct label vectors
    for i in range(len(labels)):
        labels[i] = tokenizer_labels.word_index[labels[i]] - 1

    # return data and labels
    return (encoded_data[:, 1:], labels.astype("float32"))

### Train the models

In [8]:
# convert input data dict to a pandas df
df = pd.DataFrame.from_dict(config.data_dict)

# preprocess data frame and create data dicts
preprocessed_df = preprocess(sent_df=df, stop_words=config.stop_words)
(text_dict, label_dict) = prepare_tokenizer(df)

# init vector list
freq_list = list()

# build vectors from sentences
for sentence in df["sentence"]:
    # create entries for each sentence and update vector list
    entry_freq = calculate_bag_of_words(text=text_dict, sentence=sentence)
    freq_list.append(entry_freq)

# create empty df for vectors
final_df = pd.DataFrame()

# loop over vectors and concat them
for vector in freq_list:
    vector = pd.DataFrame([vector])
    final_df = pd.concat([final_df, vector], ignore_index=True)

# add label column to final data frame
final_df["label"] = df["sentiment"]

# convert label to corresponding vector
for i in range(len(final_df["label"])):
    final_df["label"][i] = label_dict[final_df["label"][i]]

# init vanilla model
print("[INFO] Compiling model...")
shallow_model = build_shallow_net()

# fit keras model on dataset
shallow_model.fit(
    final_df.iloc[:, 0:10],
    final_df.iloc[:, 10].astype("float32"),
    epochs=config.epochs,
    batch_size=config.batch_size
)

# create dataset using TF
train_x, train_y = tensorflow_wrap(df)

# init new model
print("[INFO] Compiling model with TF wrapped data...")
tf_model = build_shallow_net()

# fit keras model on TF dataset
tf_model.fit(
    train_x,
    train_y,
    epochs=config.epochs,
    batch_size=config.batch_size
)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["label"][i] = label_dict[final_df["label"][i]]


[INFO] Compiling model...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[INFO] Compiling model with TF wrapped data...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7e25c1db2a10>