### **Count Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Toy corpus used throughout the notebook: one short English sentence per document.
sample_text = [
    "My name is Yash Kelkar",
    "I am in my final year of graduation",
    "I like to play video games",
    "My favourite outdoor sport is badminton",
]

In [None]:
# Fit a bag-of-words vectorizer on the corpus and convert the resulting
# sparse matrix to a dense array so it can be printed in full.
cv = CountVectorizer()
X = cv.fit_transform(sample_text).toarray()

# Shape first, then the matrix itself (one row per document).
print(X.shape, X, sep="\n")

(4, 20)
[[0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0]
 [1 0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0]
 [0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0]]


In [None]:
# All unique words found in our docs (the vocabulary learned by `cv`).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
# `.tolist()` keeps `cols` a plain Python list, so the printed output is unchanged.
if hasattr(cv, "get_feature_names_out"):
    cols = cv.get_feature_names_out().tolist()
else:
    cols = cv.get_feature_names()
print(cols)

['am', 'badminton', 'favourite', 'final', 'games', 'graduation', 'in', 'is', 'kelkar', 'like', 'my', 'name', 'of', 'outdoor', 'play', 'sport', 'to', 'video', 'yash', 'year']


### **N-gram Vectorization**

In [None]:
# Same count-based vectorizer, but the vocabulary now contains every
# unigram, bigram and trigram (ngram_range=(1, 3)) found in the corpus.
ngram = CountVectorizer(ngram_range=(1, 3))
X = ngram.fit_transform(sample_text).toarray()

# Shape first, then the dense count matrix (one row per document).
print(X.shape, X, sep="\n")

(4, 54)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1 1 1 1 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0]
 [1 1 1 0 0 0 0 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0 0]
 [0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
  1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0]]


In [None]:
# Vocabulary of the n-gram vectorizer (all 1- to 3-word sequences it learned).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
# `.tolist()` keeps `ngrams` a plain Python list, so the printed output is unchanged.
if hasattr(ngram, "get_feature_names_out"):
    ngrams = ngram.get_feature_names_out().tolist()
else:
    ngrams = ngram.get_feature_names()
print(ngrams)

['am', 'am in', 'am in my', 'badminton', 'favourite', 'favourite outdoor', 'favourite outdoor sport', 'final', 'final year', 'final year of', 'games', 'graduation', 'in', 'in my', 'in my final', 'is', 'is badminton', 'is yash', 'is yash kelkar', 'kelkar', 'like', 'like to', 'like to play', 'my', 'my favourite', 'my favourite outdoor', 'my final', 'my final year', 'my name', 'my name is', 'name', 'name is', 'name is yash', 'of', 'of graduation', 'outdoor', 'outdoor sport', 'outdoor sport is', 'play', 'play video', 'play video games', 'sport', 'sport is', 'sport is badminton', 'to', 'to play', 'to play video', 'video', 'video games', 'yash', 'yash kelkar', 'year', 'year of', 'year of graduation']


### **TF-IDF Vectorization**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit a TF-IDF vectorizer with default settings on the corpus and densify
# the weighted document-term matrix so it can be printed in full.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(sample_text).toarray()

# Shape first, then the matrix itself (one row per document).
print(X.shape, X, sep="\n")

(4, 20)
[[0.         0.         0.         0.         0.         0.
  0.         0.39278432 0.49819711 0.         0.31799276 0.49819711
  0.         0.         0.         0.         0.         0.
  0.49819711 0.        ]
 [0.39505606 0.         0.         0.39505606 0.         0.39505606
  0.39505606 0.         0.         0.         0.25215917 0.
  0.39505606 0.         0.         0.         0.         0.
  0.         0.39505606]
 [0.         0.         0.         0.         0.4472136  0.
  0.         0.         0.         0.4472136  0.         0.
  0.         0.         0.4472136  0.         0.4472136  0.4472136
  0.         0.        ]
 [0.         0.44592216 0.44592216 0.         0.         0.
  0.         0.35157015 0.         0.         0.28462634 0.
  0.         0.44592216 0.         0.44592216 0.         0.
  0.         0.        ]]


In [None]:
# Vocabulary learned by the TF-IDF vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
# `.tolist()` keeps the result a plain Python list, so the printed output is unchanged.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_unique_words = tfidf.get_feature_names_out().tolist()
else:
    tfidf_unique_words = tfidf.get_feature_names()
# The original (misspelled) name is kept as an alias for any cell that still uses it.
tfidf_unqiue_words = tfidf_unique_words
print(tfidf_unqiue_words)

['am', 'badminton', 'favourite', 'final', 'games', 'graduation', 'in', 'is', 'kelkar', 'like', 'my', 'name', 'of', 'outdoor', 'play', 'sport', 'to', 'video', 'yash', 'year']


### **Word2Vec Vectorization**

In [None]:
from gensim.models import word2vec

In [None]:
def _tokenize(sentence):
    """Split a sentence on single spaces into lower-cased word tokens.

    For each space-separated piece, only the text before the first '.' is
    kept (so a trailing full stop is dropped), then it is lower-cased.
    """
    return [piece.split('.')[0].lower() for piece in sentence.split(' ')]


# Replace every raw sentence with its token list, updating `sample_text` in place
# because the Word2Vec cell below reads the tokenized corpus from this same name.
# Entries that are already token lists are left untouched, so re-running this cell
# is a no-op instead of failing with AttributeError ('list' has no 'split').
# NOTE: after this cell `sample_text` no longer holds raw strings, so the
# vectorizer cells above must be run before it (or after a kernel restart).
sample_text[:] = [
    _tokenize(entry) if isinstance(entry, str) else entry
    for entry in sample_text
]

In [None]:
# Train a tiny CBOW (sg=0) Word2Vec model on the tokenized corpus with
# 2-dimensional vectors, a context window of 3, and min_count=1 so that
# words occurring only once are still kept in the vocabulary.
w2v_params = dict(workers=1, min_count=1, window=3, sg=0)
try:
    # gensim >= 4.0 names the embedding dimensionality `vector_size`.
    model = word2vec.Word2Vec(sample_text, vector_size=2, **w2v_params)
except TypeError:
    # gensim < 4.0 only accepts the older `size` keyword.
    model = word2vec.Word2Vec(sample_text, size=2, **w2v_params)

# most_similar returns (word, similarity) pairs; take the top-ranked pair.
similar_word = model.wv.most_similar('sport')[0]
# This is a similarity query, not a frequency count, so report it as such.
print("Most similar word to 'sport' is: {}".format(similar_word[0]))

Most common word to 'sport' is: play


In [None]:
# Wrap the trained word vectors in a Keras Embedding layer. Despite its name,
# `w2v_matrix` is a layer object rather than a raw matrix (see the cell output).
# train_embeddings=False presumably freezes the weights — TODO confirm.
# NOTE(review): get_keras_embedding looks to be a gensim 3.x-only API — confirm
# the pinned gensim version before upgrading.
w2v_matrix = model.wv.get_keras_embedding(train_embeddings=False)
# A bare assignment displays nothing; end the cell with the value so the
# notebook actually shows the layer, matching the recorded output.
w2v_matrix

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7feac2e5e510>