# Importing libraries

In [1]:
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action = 'ignore')
import gensim
from gensim.models import Word2Vec
import re
import bs4 as bs
import urllib.request
import nltk

In [2]:
# Punkt: sentence-boundary model (built with an unsupervised algorithm that learns
# abbreviations, collocations and sentence-starting words); needed by the NLTK tokenizers.
nltk.download('punkt')
# English stop-word list, read later through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\YASH kAILAS
[nltk_data]     DHADGE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\YASH kAILAS
[nltk_data]     DHADGE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text 

In [3]:
# Toy corpus: one short paragraph that is the only training text for the
# Word2Vec model built further down in this notebook.
sentences = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells"""

# Preparation Of Dataset

01) Cleaning Of data

In [4]:
#Remove special characters: every run of non-alphanumeric characters
#(punctuation, newlines, ...) becomes a single space
sentences = re.sub('[^A-Za-z0-9]+', ' ', sentences)

#Remove one letter words. \b\w\b matches each one-character token on its own,
#so runs of adjacent single-letter words ("a b c") are all removed; a
#space-anchored pattern consumes the separating space and skips every second one.
sentences = re.sub(r'\b\w\b', '', sentences)

#Collapse the gaps left by the removed tokens and trim the ends
sentences = re.sub(r'\s+', ' ', sentences).strip()
print(sentences)

We are about to study the idea of computational process Computational processes are abstract beings that inhabit computers As they evolve processes manipulate other abstract things called data The evolution of process is directed by pattern of rules called program People create programs to direct processes In effect we conjure the spirits of the computer with our spells


02) Converting to lowercase

In [5]:
#Convert all letters to lowercase so that differently-cased forms of a word
#(e.g. "Computational" / "computational") become a single vocabulary entry
sentences = sentences.lower()

03) Tokenize sentences and words

In [6]:
# Split the cleaned text into sentences.
# NOTE: the cleaning step already stripped all punctuation, so there are no
# sentence boundaries left to find and the result is a one-element list.
all_sent = sent_tokenize(sentences)
print(all_sent)

['we are about to study the idea of computational process computational processes are abstract beings that inhabit computers as they evolve processes manipulate other abstract things called data the evolution of process is directed by pattern of rules called program people create programs to direct processes in effect we conjure the spirits of the computer with our spells']


In [7]:
# Tokenize each sentence into its words: one list of tokens per sentence
all_words = list(map(word_tokenize, all_sent))

04) Removal of Stopwords

In [8]:
#Remove stopwords
from nltk.corpus import stopwords

# Load the English stop-word list once and keep it as a set: the list does not
# change between tokens, and set membership is a constant-time lookup.
english_stopwords = set(stopwords.words('english'))

# Filter every sentence's token list in place (all_words keeps its identity).
for i in range(len(all_words)):
  all_words[i] = [w for w in all_words[i] if w not in english_stopwords]

# `data` is the training corpus: a list of token lists, one per sentence.
data = all_words

# Train the model

In [9]:
#Training model using gensim
# gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`; pick whichever the installed version accepts so this cell
# runs on both old and new releases instead of raising TypeError.
dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# A fixed seed together with a single worker thread removes thread-scheduling
# jitter between runs (gensim also recommends fixing PYTHONHASHSEED for
# reproducibility across interpreter launches).
# min_count=1 keeps every word of this tiny corpus in the vocabulary.
model1 = gensim.models.Word2Vec(
    data,
    min_count = 1,
    window = 6,
    seed = 42,
    workers = 1,
    **{dim_kwarg: 52}
)

In [10]:
# Look up the embedding of a query word and list its nearest neighbours
# (each entry is a (word, cosine similarity) pair).
word = 'study'
v1 = model1.wv[word]
similar_words = model1.wv.most_similar(word)
for neighbour in similar_words:
  print(neighbour)

('inhabit', 0.28260788321495056)
('spirits', 0.2628842890262604)
('idea', 0.17500905692577362)
('rules', 0.16853827238082886)
('spells', 0.152943953871727)
('create', 0.14936180412769318)
('process', 0.14154373109340668)
('directed', 0.11710277199745178)
('people', 0.1048392802476883)
('conjure', 0.09803693741559982)


In [11]:
# `data` holds one token list per sentence; take the first one. Because the
# text was tokenized as a single sentence, this entry is the whole corpus.
split_data = data[0]
print(split_data)

['study', 'idea', 'computational', 'process', 'computational', 'processes', 'abstract', 'beings', 'inhabit', 'computers', 'evolve', 'processes', 'manipulate', 'abstract', 'things', 'called', 'data', 'evolution', 'process', 'directed', 'pattern', 'rules', 'called', 'program', 'people', 'create', 'programs', 'direct', 'processes', 'effect', 'conjure', 'spirits', 'computer', 'spells']


In [13]:
# Build (context, target) pairs: for every token that has two neighbours on
# each side, the context is those four neighbours (two before, two after) and
# the target is the token itself.
Context_Target_Data = [
    (split_data[pos - 2:pos] + split_data[pos + 1:pos + 3], split_data[pos])
    for pos in range(2, len(split_data) - 2)
]
print(Context_Target_Data)

[(['study', 'idea', 'process', 'computational'], 'computational'), (['idea', 'computational', 'computational', 'processes'], 'process'), (['computational', 'process', 'processes', 'abstract'], 'computational'), (['process', 'computational', 'abstract', 'beings'], 'processes'), (['computational', 'processes', 'beings', 'inhabit'], 'abstract'), (['processes', 'abstract', 'inhabit', 'computers'], 'beings'), (['abstract', 'beings', 'computers', 'evolve'], 'inhabit'), (['beings', 'inhabit', 'evolve', 'processes'], 'computers'), (['inhabit', 'computers', 'processes', 'manipulate'], 'evolve'), (['computers', 'evolve', 'manipulate', 'abstract'], 'processes'), (['evolve', 'processes', 'abstract', 'things'], 'manipulate'), (['processes', 'manipulate', 'things', 'called'], 'abstract'), (['manipulate', 'abstract', 'called', 'data'], 'things'), (['abstract', 'things', 'data', 'evolution'], 'called'), (['things', 'called', 'evolution', 'process'], 'data'), (['called', 'data', 'process', 'directed'],

# Predicting Word

In [14]:
# Pick one (context, target) pair and ask the model which centre words it
# considers most probable for that context.
i = 3
context_words, target_word = Context_Target_Data[i]
print(context_words, target_word)
print(model1.predict_output_word(context_words))

['process', 'computational', 'abstract', 'beings'] processes
[('computational', 0.035714526), ('pattern', 0.035714474), ('rules', 0.0357144), ('evolution', 0.035714377), ('spells', 0.035714366), ('beings', 0.035714354), ('create', 0.03571433), ('called', 0.035714325), ('direct', 0.035714317), ('directed', 0.035714317)]
