In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

# Toy corpus shared by every vectorizer demonstrated in this script.
corpus = [
    "I love dogs",
    "I love cats",
    "Dogs and cats are great",
]

# Bag of Words (BoW): learn a vocabulary from the corpus and count how often
# each vocabulary term occurs in each document (one row per document).
# NOTE(review): no options are passed, so scikit-learn's default tokenisation
# applies (lowercasing, single-character tokens such as "I" are expected to be
# dropped) -- confirm against the printed vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Show the learned vocabulary first, then the dense count matrix whose columns
# follow that vocabulary order.
for label, value in (
    ("Vocabulary:", vectorizer.get_feature_names_out()),
    ("BoW Vectors:\n", X.toarray()),
):
    print(label, value)

# Visual separator between demo sections.
print("\n---\n")

# N-Grams (Bigrams): ngram_range=(2, 2) sets both the minimum and maximum
# n-gram length to 2, so features are sequences of two consecutive tokens
# rather than single words.
vectorizer_ngram = CountVectorizer(ngram_range=(2, 2))
X_ngrams = vectorizer_ngram.fit_transform(corpus)

# Materialise what gets displayed: the bigram vocabulary and the dense
# per-document bigram counts.
bigram_vocabulary = vectorizer_ngram.get_feature_names_out()
bigram_counts = X_ngrams.toarray()
print("Vocabulary with Bigrams:", bigram_vocabulary)
print("Bigram Vectors:\n", bigram_counts)

# Visual separator between demo sections.
print("\n---\n")

# Term Frequency (TF): reuse the Bag-of-Words counts held in `X` and rescale
# every document (row) by its own total count.
count_matrix = X.toarray()
vocab = vectorizer.get_feature_names_out()
term_counts = pd.DataFrame(count_matrix, columns=vocab)

# Row-wise divisor: the number of counted terms in each document.
# A document with no counted terms would divide 0 by 0 and show NaN; the
# sample corpus has no such document.
terms_per_document = term_counts.sum(axis=1)
df_tf = term_counts.div(terms_per_document, axis="index")

# Heading and table on consecutive lines.
print("Term Frequency Table:", df_tf, sep="\n")

# Visual separator between demo sections.
print("\n---\n")

# TF-IDF (Term Frequency-Inverse Document Frequency): fit the vocabulary and
# the document-frequency weights on the corpus, then transform the same
# corpus into weighted document vectors.
# NOTE(review): no options are passed, so the library's default weighting and
# normalisation settings apply -- see the TfidfVectorizer documentation.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Vocabulary first, then the dense weight matrix in the same column order.
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
tfidf_weights = X_tfidf.toarray()
print("TF-IDF Vocabulary:", tfidf_vocabulary)
print("TF-IDF Vectors:\n", tfidf_weights)

# Visual separator between demo sections.
print("\n---\n")

# Hashing Vectorizer: terms are mapped to one of n_features columns by a hash
# function, so `transform` is called directly without a prior fit and there
# is no vocabulary to display.
# NOTE(review): with only 10 columns, different terms may share a column;
# values also depend on the library's default sign/normalisation options.
hash_vectorizer = HashingVectorizer(n_features=10)
X_hash = hash_vectorizer.transform(corpus)

# Dense view of the hashed document vectors (one row per document).
hashed_rows = X_hash.toarray()
print("Hashed Vectors:\n", hashed_rows)