In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words demo: three short documents, English stop words removed.
corpus=['UNC played Duke in Basketball','Duke lost the Basketball game','I ate a sandwich']
vectorizer=CountVectorizer(stop_words='english')
# fit_transform learns the vocabulary and returns a sparse document-term matrix
# (one row per document, one column per vocabulary term). toarray() gives a plain
# ndarray; todense() would give a numpy.matrix, which NumPy no longer recommends.
# NOTE(review): the recorded output for this cell has a single row although the
# corpus holds three documents - it looks stale; re-run the cell to refresh it.
print(vectorizer.fit_transform(corpus).toarray())
# vocabulary_ maps each retained term to its column index in the matrix.
print(vectorizer.vocabulary_)

[[1 2 2 1 1 1 1 1]]
{'unc': 7, 'played': 5, 'duke': 2, 'basketball': 1, 'lost': 4, 'game': 3, 'ate': 0, 'sandwich': 6}


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Two documents that say the same thing with different word forms.
corpus=['He ate the sandwiches','Every sandwich was eaten by him']
# binary=True records only presence/absence (1/0) of a term instead of its count.
vectorizer=CountVectorizer(binary=True,stop_words='english')
# toarray() gives a plain ndarray; todense() would give a numpy.matrix, which
# NumPy no longer recommends. The printed text is the same either way.
# NOTE(review): the recorded output for this cell has a single row although the
# corpus holds two documents - it looks stale; re-run the cell to refresh it.
print(vectorizer.fit_transform(corpus).toarray())
# Inflected forms ('ate'/'eaten', 'sandwich'/'sandwiches') each get their own column.
print(vectorizer.vocabulary_)



[[1 1 1 1]]
{'ate': 0, 'sandwiches': 3, 'sandwich': 2, 'eaten': 1}


In [None]:
import nltk
# Download NLTK's "popular" data collection.
# NOTE(review): presumably this supplies the tokenizer, POS-tagger and WordNet data
# that the NLTK cells in this notebook rely on - confirm for the installed NLTK version.
nltk.download("popular")

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
# Example sentences using "gathering" first as a verb, then as a noun.
# They are illustrative only; nothing below passes them to the lemmatizer.
corpo=['I am gathering ingredients for the sandwich','There were many wizards at the gathering']
lematizer=WordNetLemmatizer()
# Lemmatize the same surface form once per part of speech ('v' first, then 'n'),
# printing one result per line.
print(*(lematizer.lemmatize('gathering',pos) for pos in ('v','n')),sep='\n')

In [None]:
#tokenizing

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
wordnet_tags=['n','v']
stemmer=PorterStemmer()
corpus=['He ate the sandwiches','Every sandwich was eaten by him']
# Stemming pass: tokenize each document, then run every token through the Porter stemmer.
print("Stemmed:",[list(map(stemmer.stem,word_tokenize(document))) for document in corpus])
# Tagging pass: tokenize each document and pair every token with its part-of-speech tag.
tagged_corpus=list(map(pos_tag,map(word_tokenize,corpus)))
def lemmat(token,tag):
  """Lemmatize a POS-tagged token when its tag marks it as a noun or a verb.

  Args:
    token: The word to lemmatize.
    tag: Part-of-speech tag for ``token``. Only its first character is
      inspected, case-insensitively ('n...' -> noun, 'v...' -> verb).

  Returns:
    ``lemmatizer.lemmatize(token, pos)`` for noun/verb tags; otherwise
    ``token`` unchanged (also when ``tag`` is empty or None).
  """
  # Slice instead of indexing so an empty or missing tag falls through to the
  # pass-through branch rather than raising IndexError/TypeError.
  pos=(tag or '')[:1].lower()
  if pos in ('n','v'):
    # Uses the module-level `lemmatizer`, which must be defined before the first call.
    return lemmatizer.lemmatize(token,pos)
  return token
# Build the WordNet lemmatizer that lemmat() looks up at call time.
lemmatizer=WordNetLemmatizer()
# Show the (token, tag) pairs first, then the same documents with each pair lemmatized.
print("Pos-tagged words: ", tagged_corpus)
print("Lemmatize: ",[[lemmat(*tagged_token) for tagged_token in tagged_document] for tagged_document in tagged_corpus])

Stemmed: [['He', 'ate', 'the', 'sandwich'], ['everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]
Pos-tagged words:  [[('He', 'PRP'), ('ate', 'VBD'), ('the', 'DT'), ('sandwiches', 'NNS')], [('Every', 'DT'), ('sandwich', 'NN'), ('was', 'VBD'), ('eaten', 'VBN'), ('by', 'IN'), ('him', 'PRP')]]
Lemmatize:  [['He', 'eat', 'the', 'sandwich'], ['Every', 'sandwich', 'be', 'eat', 'by', 'him']]


In [None]:
#instead of using a binary value for each element in feature vector, we will use an integer that denotes the number of times the word
#has appeared in the document
from sklearn.feature_extraction.text import CountVectorizer
# A single document in which several terms repeat.
corpus=['The dog ate a sandwich, the wizard transfigured a sandwich,and I ate a sandwich']
vectorizer=CountVectorizer(stop_words='english')
# Each entry is the number of times the term occurs in the document.
# toarray() gives a plain ndarray; todense() would give a numpy.matrix, which
# NumPy no longer recommends. The printed text is the same either way.
print(vectorizer.fit_transform(corpus).toarray())
# vocabulary_ maps each retained term to its column index in the matrix.
print(vectorizer.vocabulary_)
# Uncomment to look up the column index of a single term:
#print(vectorizer.vocabulary_['dog'])

[[2 1 3 1 1]]
{'dog': 1, 'ate': 0, 'sandwich': 2, 'wizard': 4, 'transfigured': 3}
1


In [None]:
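#Normalization, logarithmically scaled term frequencies and augmented term frequencies can represent the frequencies of terms in a document while mitigating
#the effects of different document sizes. However, another problem remains with these representations: the feature vectors contain large weights for terms
#that occur frequently in a document even if those terms also occur frequently in most documents of the corpus, so they do little to distinguish that document.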

#A term's TF-IDF value is the product of its term frequency and its inverse document frequency. It can be computed with the TfidfVectorizer class, which wraps CountVectorizer
#and TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# 'sandwich' appears in both documents; the other retained terms appear in only one.
corpus=['The dog ate a sandwich and I ate a sandwich', 'the wizard transfigured a sandwich']
vectorizer=TfidfVectorizer(stop_words='english')
# Rows are documents, columns are vocabulary terms, entries are TF-IDF weights.
# toarray() gives a plain ndarray; todense() would give a numpy.matrix, which
# NumPy no longer recommends. The printed text is the same either way.
print(vectorizer.fit_transform(corpus).toarray())
# vocabulary_ maps each retained term to its column index in the matrix.
print(vectorizer.vocabulary_)

[[0.75458397 0.37729199 0.53689271 0.         0.        ]
 [0.         0.         0.44943642 0.6316672  0.6316672 ]]
{'dog': 1, 'ate': 0, 'sandwich': 2, 'wizard': 4, 'transfigured': 3}


In [None]:
#Use HashingVectorizer when the tokens themselves are not needed later in the ML task: it hashes each token (word in a document) directly to a column number of the sparse matrix,
#so it is impossible to retrieve the token back from the column
from sklearn.feature_extraction.text import HashingVectorizer
# Four one-word documents, so each row of the result has exactly one non-zero entry.
corpus=['the','ate','bacon','cat']
# n_features is the number of columns of the output matrix; choose it to suit the task.
# If it is small relative to the number of distinct tokens in the corpus, different
# tokens can hash to the same column (hash collisions). The default value is 2**20.
vectorizer=HashingVectorizer(n_features=10)
# The sign of each entry also comes from the hash (alternate_sign defaults to True),
# which is why some values are -1.
# toarray() gives a plain ndarray; todense() would give a numpy.matrix, which
# NumPy no longer recommends. The printed text is the same either way.
print(vectorizer.fit_transform(corpus).toarray())

[[ 0.  0.  0.  0.  0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]]
