In [22]:
import numpy as np
import pandas as pd


In [31]:
corpus = ['Hi I am Ram Diwedi',
          'I live in Jaipur!',
          'I, work as a shopkeeper']

In [28]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def custom_tokenizer(text):
  return word_tokenize(text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
import re
# Custom preprocessor function to remove punctuation
def remove_punctuation(text):
    # Using regular expression to remove punctuation
    return re.sub(r'[^\w\s]', '', text)


In [36]:
#Implementing Bag of Words

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(preprocessor=remove_punctuation,lowercase = True, stop_words ='english',tokenizer=custom_tokenizer,binary = False)
bow = cv.fit_transform(corpus)

print(cv.vocabulary_)
print('__________________________________________')
print(corpus)
print('__________________________________________')
print(bow.toarray())

{'Hi': 1, 'I': 2, 'Ram': 4, 'Diwedi': 0, 'live': 5, 'Jaipur': 3, 'work': 7, 'shopkeeper': 6}
__________________________________________
['Hi I am Ram Diwedi', 'I live in Jaipur!', 'I, work as a shopkeeper']
__________________________________________
[[1 1 1 0 1 0 0 0]
 [0 0 1 1 0 1 0 0]
 [0 0 1 0 0 0 1 1]]


In [41]:
#Implementing n-gram

from sklearn.feature_extraction.text import CountVectorizer

# Specify n-gram range (here, we consider unigrams and bigrams)
ngram_range = (2, 3)

cv = CountVectorizer(preprocessor=remove_punctuation,lowercase = True, stop_words ='english',ngram_range=ngram_range, tokenizer=custom_tokenizer,binary = False)

ngram = cv.fit_transform(corpus)

print(cv.vocabulary_)
print('__________________________________________')
print(corpus)
print('__________________________________________')
print(ngram.toarray())


{'Hi I': 0, 'I Ram': 2, 'Ram Diwedi': 8, 'Hi I Ram': 1, 'I Ram Diwedi': 3, 'I live': 4, 'live Jaipur': 9, 'I live Jaipur': 5, 'I work': 6, 'work shopkeeper': 10, 'I work shopkeeper': 7}
__________________________________________
['Hi I am Ram Diwedi', 'I live in Jaipur!', 'I, work as a shopkeeper']
__________________________________________
[[1 1 1 1 0 0 0 0 1 0 0]
 [0 0 0 0 1 1 0 0 0 1 0]
 [0 0 0 0 0 0 1 1 0 0 1]]


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(preprocessor=remove_punctuation,lowercase = True, stop_words ='english', tokenizer=custom_tokenizer,binary = False)

# Fit and transform the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

print(tfidf_vectorizer.get_feature_names_out())
print('__________________________________________')
print(corpus)
print('__________________________________________')
print(tfidf_matrix.toarray())


['Diwedi' 'Hi' 'I' 'Jaipur' 'Ram' 'live' 'shopkeeper' 'work']
__________________________________________
['Hi I am Ram Diwedi', 'I live in Jaipur!', 'I, work as a shopkeeper']
__________________________________________
[[0.54645401 0.54645401 0.32274454 0.         0.54645401 0.
  0.         0.        ]
 [0.         0.         0.38537163 0.65249088 0.         0.65249088
  0.         0.        ]
 [0.         0.         0.38537163 0.         0.         0.
  0.65249088 0.65249088]]
