In [5]:
corpus = [
    "I can't wait for the new season of my favorite show!",
    "The COVID-19 pandemic has affected millions of people worldwide.",
    "U.S. stocks fell on Friday after news of rising inflation.",
    "<html><body>Welcome to the website!</body></html>",
    "Python is a great programming language!!! ??"
]

## convert to lowercase, remove punctuation, numbers, special characters, and HTML tags

In [2]:
import re
import string
from bs4 import BeautifulSoup

In [3]:
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    return text

In [4]:
cleaned_corpus = [clean_text(doc) for doc in corpus]
print(cleaned_corpus)

['i cant wait for the new season of my favorite show', 'the covid pandemic has affected millions of people worldwide', 'us stocks fell on friday after news of rising inflation', 'htmlbodywelcome to the websitebodyhtml', 'python is a great programming language ']


## Tokenization

In [5]:
tokenized_corpus = [doc.split() for doc in cleaned_corpus]
print(tokenized_corpus)

[['i', 'cant', 'wait', 'for', 'the', 'new', 'season', 'of', 'my', 'favorite', 'show'], ['the', 'covid', 'pandemic', 'has', 'affected', 'millions', 'of', 'people', 'worldwide'], ['us', 'stocks', 'fell', 'on', 'friday', 'after', 'news', 'of', 'rising', 'inflation'], ['htmlbodywelcome', 'to', 'the', 'websitebodyhtml'], ['python', 'is', 'a', 'great', 'programming', 'language']]


# Another way

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus)

vectorizer.get_feature_names_out()

array(['19', 'affected', 'after', 'body', 'can', 'covid', 'favorite',
       'fell', 'for', 'friday', 'great', 'has', 'html', 'inflation', 'is',
       'language', 'millions', 'my', 'new', 'news', 'of', 'on',
       'pandemic', 'people', 'programming', 'python', 'rising', 'season',
       'show', 'stocks', 'the', 'to', 'wait', 'website', 'welcome',
       'worldwide'], dtype=object)

In [7]:
print(X.toarray())

[[0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0]
 [1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1]
 [0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0]
 [0 0 0 2 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0]]


In [8]:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))

X = vectorizer2.fit_transform(corpus)

vectorizer2.get_feature_names_out()

array(['19 pandemic', 'affected millions', 'after news', 'body html',
       'body welcome', 'can wait', 'covid 19', 'favorite show', 'fell on',
       'for the', 'friday after', 'great programming', 'has affected',
       'html body', 'is great', 'millions of', 'my favorite',
       'new season', 'news of', 'of my', 'of people', 'of rising',
       'on friday', 'pandemic has', 'people worldwide',
       'programming language', 'python is', 'rising inflation',
       'season of', 'stocks fell', 'the covid', 'the new', 'the website',
       'to the', 'wait for', 'website body', 'welcome to'], dtype=object)

# NLTK For Stopwords

In [13]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def clean_text(text):
    STOPWORDS = stopwords.words('english')
    
    nopunc = [char for char in text if char not in string.punctuation] 

    nopunc = ''.join(nopunc)

    return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wilsonbeima/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
