Word embedding (or word vectorization) is a methodology in NLP that maps words or phrases from a vocabulary to corresponding vectors of real numbers, which are then used for word prediction and for measuring word similarity/semantics. The process of converting words into numbers is called vectorization.

In [1]:
!pip install -U scikit-learn



In [2]:
pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter



import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer






In [4]:
# Load the example sentences; later cells read them from the `test` column.
# The path is relative, so data.csv must sit in the notebook's working directory.
df = pd.read_csv('data.csv')

In [5]:
df.head()

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


# Count Vectorizer

Count Vectorizer is a great tool provided by the scikit-learn library in Python. It is used to transform a
given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.

In [6]:
# Default settings: as the vocabulary printed further down shows, text is
# lowercased and punctuation and one-character tokens such as "I" are dropped.
cv = CountVectorizer()

In [7]:
# Learn the vocabulary from the `test` column and build the sparse
# document-term count matrix in one step.
# NOTE: the TF-IDF section later rebinds `x`, so cells that read `x` depend
# on execution order.
x = cv.fit_transform(df['test'])

In [8]:
x

<4x14 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [9]:
x.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]], dtype=int64)

In [10]:
df.head()

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


In [11]:
# Independent (deep) copy of the loaded frame; the TF-IDF section fits on this copy.
df3 = df.copy(deep=True)


In [12]:
df3

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


In [13]:

# Dense view of the count matrix: one row per sentence, one column per
# vocabulary term. get_feature_names_out() is the current scikit-learn
# accessor; the older get_feature_names() was removed in scikit-learn 1.2.
df2 = pd.DataFrame(
    x.toarray(),
    index=df['test'],
    columns=cv.get_feature_names_out(),
)

In [14]:
df2.head()

Unnamed: 0_level_0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
I love Bangladesh,0,0,1,0,0,0,0,0,1,0,0,0,0,0
Could you give me an iphone?,1,0,0,1,1,0,0,1,0,1,0,0,0,1
Hello how are you?,0,1,0,0,0,1,1,0,0,0,0,0,0,1
I want to talk you.,0,0,0,0,0,0,0,0,0,0,1,1,1,1


In [15]:
# Vocabulary learned by the count vectorizer, in matrix column order.
columns = cv.get_feature_names_out()

In [16]:
columns

array(['an', 'are', 'bangladesh', 'could', 'give', 'hello', 'how',
       'iphone', 'love', 'me', 'talk', 'to', 'want', 'you'], dtype=object)

# TF-IDF

In [17]:
# Despite the variable name, this is the complete TF-IDF vectorizer
# (term frequency weighted by inverse document frequency), not only the IDF part.
idf = TfidfVectorizer()

In [18]:
# Rebinds `x`: from here on it holds TF-IDF weights and replaces the count
# matrix built in the Count Vectorizer section.
x = idf.fit_transform(df3['test'])

In [19]:
x.toarray()

array([[0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.43003652, 0.        , 0.        , 0.43003652, 0.43003652,
        0.        , 0.        , 0.43003652, 0.        , 0.43003652,
        0.        , 0.        , 0.        , 0.27448674],
       [0.        , 0.5417361 , 0.        , 0.        , 0.        ,
        0.5417361 , 0.5417361 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.34578314],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5417361 , 0.5417361 , 0.5417361 , 0.34578314]])

In [20]:
# Label the rows with the same Series that was vectorized (df3['test']), so
# the index cannot drift from the matrix rows if df and df3 ever diverge.
df4 = pd.DataFrame(
    x.toarray(),
    index=df3['test'],
    columns=idf.get_feature_names_out(),
)
df4

Unnamed: 0_level_0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
I love Bangladesh,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0
Could you give me an iphone?,0.430037,0.0,0.0,0.430037,0.430037,0.0,0.0,0.430037,0.0,0.430037,0.0,0.0,0.0,0.274487
Hello how are you?,0.0,0.541736,0.0,0.0,0.0,0.541736,0.541736,0.0,0.0,0.0,0.0,0.0,0.0,0.345783
I want to talk you.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541736,0.541736,0.541736,0.345783


# Word2Vec

In [21]:
!pip install gensim



In [22]:
from gensim.models import Word2Vec, KeyedVectors

In [23]:
import nltk

In [24]:
# word_tokenize loads the `punkt_tab` resource on NLTK >= 3.8.2 and the older
# `punkt` pickle on earlier releases, so fetch both to keep the tokenizer
# cell working on a fresh environment.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# NOTE(review): no cell visible in this part of the notebook uses WordNet;
# confirm a later section needs it, otherwise this download can be dropped.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# Reloads the same file as the first load cell; `df` is only read, never
# modified, in the cells between the two loads.
df = pd.read_csv('data.csv')

In [27]:
df.head()

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


In [28]:
from nltk.tokenize import word_tokenize

In [29]:
# Tokenize each sentence into a list of tokens. Case and punctuation are
# kept, so "?" and "." become tokens of their own.
text_vsc = list(map(nltk.word_tokenize, df['test']))

In [30]:
text_vsc

[['I', 'love', 'Bangladesh'],
 ['Could', 'you', 'give', 'me', 'an', 'iphone', '?'],
 ['Hello', 'how', 'are', 'you', '?'],
 ['I', 'want', 'to', 'talk', 'you', '.']]

In [31]:
# min_count=1 keeps every token: on a corpus this small the default
# frequency cut-off would discard almost the whole vocabulary.
# The seed is pinned and training is limited to a single worker thread,
# because multi-threaded training makes the learned vectors (and the
# similarity scores below) vary from run to run. Exact reproducibility across
# interpreter launches additionally requires PYTHONHASHSEED to be set.
model = Word2Vec(sentences=text_vsc, min_count=1, seed=1, workers=1)

In [32]:
model

<gensim.models.word2vec.Word2Vec at 0x22e0c8925b0>

In [33]:
# Ten nearest tokens to "Hello" by cosine similarity. The key is
# case-sensitive because the corpus was not lowercased before training.
model.wv.most_similar('Hello', topn=10)

[('?', 0.17272651195526123),
 ('Bangladesh', 0.16695065796375275),
 ('give', 0.11118056625127792),
 ('talk', 0.10947787016630173),
 ('you', 0.07967709749937057),
 ('an', 0.04130829498171806),
 ('me', 0.03771401196718216),
 ('to', 0.013243556953966618),
 ('I', 0.008316051214933395),
 ('love', -0.005900920368731022)]