# Bag of Words

## Text cleaning techniques:

- Ignoring case
- Ignoring punctuation
- Ignoring frequent words that don’t contain much information, called stop words, like “a,” “of,” etc.
- Fixing misspelled words.
- Reducing words to their stem (e.g. “play” from “playing”) using stemming algorithms.

In [None]:
# Toy corpus: six short English sentences, each treated as one document.
corpus = [
    "Natural Language Processing is fascinating!",
    "Machine learning provides tools for NLP.",
    "Text data is unstructured and messy!",
    "Deep learning models are powerful for understanding language.",
    "I love building projects related to AI, ML, and NLP.",
    "Clean data is key to successful machine learning models.",
]

In [6]:
import spacy

# Load the English spaCy pipeline `en_core_web_sm`.
nlp = spacy.load("en_core_web_sm")

# Sample sentence used to demonstrate stop-word removal.
text = "This is a simple example to demonstrate removing stopwords."

# Run the pipeline; printing the resulting Doc shows the original text.
doc = nlp(text)
print(f"doc: {doc}")

# Keep only the tokens spaCy does not flag as stop words. The final period is
# a token of its own, so joining with spaces leaves a space in front of it.
kept_tokens = [token.text for token in doc if not token.is_stop]
filtered_text = ' '.join(kept_tokens)
print(filtered_text)  # Output: "simple example demonstrate removing stopwords ."


doc: This is a simple example to demonstrate removing stopwords.
simple example demonstrate removing stopwords .


In [None]:
import re
import spacy

# Load spaCy's English model; clean_text below uses this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")


def clean_text(documents: list[str]) -> list[list[str]]:
    """Lowercase, strip punctuation, tokenize, and drop stop words.

    Each document is lowercased, every character that is neither a word
    character nor whitespace is removed, and the result is tokenized with the
    module-level spaCy pipeline ``nlp``.

    Args:
        documents: Raw text documents.

    Returns:
        One list of token strings per input document, in input order, with
        stop-word tokens and whitespace-only tokens removed.
    """
    cleaned_docs = []
    for text in documents:
        parsed = nlp(re.sub(r"[^\w\s]", "", text.lower()))
        # Deleting punctuation can leave runs of spaces (e.g. "a - b" becomes
        # "a  b"); spaCy emits those as whitespace tokens, which are not stop
        # words, so they must be filtered explicitly.
        tokens = [
            token.text
            for token in parsed
            if not token.is_stop and not token.is_space
        ]
        cleaned_docs.append(tokens)

    return cleaned_docs

# Clean every document in the corpus; the bare name on the last line makes the
# notebook display the resulting token lists.
cleaned_corpus = clean_text(corpus)
cleaned_corpus

[['natural', 'language', 'processing', 'fascinating'],
 ['machine', 'learning', 'provides', 'tools', 'nlp'],
 ['text', 'data', 'unstructured', 'messy'],
 ['deep', 'learning', 'models', 'powerful', 'understanding', 'language'],
 ['love', 'building', 'projects', 'related', 'artificial', 'intelligence'],
 ['clean', 'data', 'key', 'successful', 'machine', 'learning', 'models']]

In [17]:
def bag_of_words(documents: list[list[str]]) -> tuple[list[str], list[list[int]]]:
    """Build bag-of-words count vectors for tokenized documents.

    Args:
        documents: One sequence of terms (tokens) per document.

    Returns:
        A ``(vocabulary, vectors)`` pair. ``vocabulary`` is the sorted list of
        distinct terms across all documents. ``vectors`` has one list per
        document, aligned with ``vocabulary``, where entry ``i`` is the number
        of times ``vocabulary[i]`` occurs in that document.
    """
    vocabulary = sorted(set().union(*documents))
    # Map each term to its column once, so the counting loop does a constant
    # time lookup instead of a linear ``list.index`` scan per token.
    column_of = {term: column for column, term in enumerate(vocabulary)}

    vectors = []
    for doc in documents:
        vector = [0] * len(vocabulary)
        for term in doc:
            vector[column_of[term]] += 1

        vectors.append(vector)

    return vocabulary, vectors

In [18]:
import pandas as pd

# Vectorize the cleaned corpus and show it as a table: one row per document,
# one column per vocabulary term.
vocab, vectors = bag_of_words(cleaned_corpus)
df = pd.DataFrame(data=vectors, columns=vocab)
df

Unnamed: 0,artificial,building,clean,data,deep,fascinating,intelligence,key,language,learning,...,powerful,processing,projects,provides,related,successful,text,tools,understanding,unstructured
0,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0,0,0,0,1,0,0,0,1,1,...,1,0,0,0,0,0,0,0,1,0
4,1,1,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
5,0,0,1,1,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [21]:
# Rebuild one space-separated string per document from its count vector.
# Terms come out in vocabulary order, so the original word order is not
# recovered. A term is listed once whenever its count is positive; testing
# for exactly 1 would drop any term that occurs more than once in a document.
sentences = [
    " ".join(term for term, count in zip(vocab, vector) if count > 0)
    for vector in vectors
]

sentences

['fascinating language natural processing',
 'learning machine nlp provides tools',
 'data messy text unstructured',
 'deep language learning models powerful understanding',
 'artificial building intelligence love projects related',
 'clean data key learning machine models successful']