In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short documents, each paired with a binary label.
corpus = [
    "Jim and Pam traveled by bus",
    "The Train was not on time.",
    "The flight was full. Traveling by flight is expensive.",
]
labels = [1, 1, 0]

# Column order matters for the printed table: text first, then the label.
df = pd.DataFrame({'text': corpus, 'output': labels})

print(df)
print('-' * 150)

# Function to print n-grams
def print_ngrams(
    df,
    ngram_range,
    test_sentence="Jim traveled by flight because flight is best for travelling.",
):
    """Fit a bag-of-n-grams model on ``df['text']`` and print a walkthrough.

    Prints, in order: the learned vocabulary terms, the term-to-column-index
    dictionary, the count vector of every fitted row, and the count vector of
    ``test_sentence`` encoded with the fitted vocabulary.

    Args:
        df: DataFrame with a ``'text'`` column holding the documents to fit on.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
        test_sentence: Sentence to encode with the already-fitted vocabulary.
            Defaults to the sentence used by the original demo, so existing
            two-argument calls print exactly what they printed before.

    Returns:
        None. All results are written to stdout.
    """
    section_rule = '-' * 50

    print(f"Generating {ngram_range[0]}-gram to {ngram_range[1]}-gram representation:")
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    counts = vectorizer.fit_transform(df['text'])

    # Maps each learned term to its column index in the count vectors.
    vocabulary = vectorizer.vocabulary_

    print(section_rule)
    print(f"Vocabulary (ngram_range={ngram_range}):")
    # NOTE: the dict iterates in first-seen order, which is NOT the column
    # order of the vectors below; columns follow the index values printed in
    # the dictionary that comes next.
    for term in vocabulary:
        print(term)

    print(section_rule)
    print("Vocabulary Dictionary:")
    print(vocabulary)

    print(section_rule)
    print("Vectorized Representation (Row-wise):")
    # Iterating the sparse matrix yields one 1 x n_features row at a time;
    # each row is densified only for display.
    for position, row in enumerate(counts, start=1):
        print(f"Text {position}: {row.toarray()}")

    print(section_rule)
    # transform() reuses the fitted vocabulary: terms never seen during
    # fitting are simply not counted.
    test_vectorized = vectorizer.transform([test_sentence]).toarray()
    print(f"Test Sentence: {test_sentence}")
    print("Test Sentence Vectorized Representation:")
    print(test_vectorized)
    print('-' * 150)

# Print the unigram, bigram and trigram representations in turn.
for n in (1, 2, 3):
    print_ngrams(df, ngram_range=(n, n))


                                                text  output
0                        Jim and Pam traveled by bus       1
1                         The Train was not on time.       1
2  The flight was full. Traveling by flight is ex...       0
------------------------------------------------------------------------------------------------------------------------------------------------------
Generating 1-gram to 1-gram representation:
--------------------------------------------------
Vocabulary (ngram_range=(1, 1)):
jim
and
pam
traveled
by
bus
the
train
was
not
on
time
flight
full
traveling
is
expensive
--------------------------------------------------
Vocabulary Dictionary:
{'jim': 7, 'and': 0, 'pam': 10, 'traveled': 14, 'by': 2, 'bus': 1, 'the': 11, 'train': 13, 'was': 16, 'not': 8, 'on': 9, 'time': 12, 'flight': 4, 'full': 5, 'traveling': 15, 'is': 6, 'expensive': 3}
--------------------------------------------------
Vectorized Representation (Row-wise):
Text 1: [[1 1 1 0 0 0 0 1 