#### Creating Corpus

In [1]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article, 'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ''

for p in paragraphs:
    article_text += p.text

#### Preprocessing

In [2]:
# cleaning the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)
processed_article = re.sub('\s+', ' ', processed_article)

# preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# removing stop words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

#### Creating Word2Vec Model

In [3]:
from gensim.models import Word2Vec
word2vec = Word2Vec(all_words, min_count=2)



### Model Analysis

#### Finding Vectors for a Word

In [4]:
v1 = word2vec.wv['artificial'] 
v1

array([-1.26915693e-03,  4.91977530e-03, -6.46667322e-03,  4.90196544e-05,
        8.55389796e-03,  2.72151735e-03, -2.41666497e-03,  9.05078836e-03,
       -1.11851944e-02,  5.59051055e-03, -5.79311606e-03, -6.88811438e-03,
        8.58534314e-03,  5.25715761e-04,  7.92549923e-03, -7.92449247e-03,
        5.95095148e-03,  6.46475237e-03, -8.98914784e-03, -1.06855361e-02,
       -6.01975387e-03, -3.01111978e-03, -2.95679690e-03, -1.03495680e-02,
        6.35865564e-03, -4.09139181e-03,  5.94742550e-03,  2.86687049e-03,
       -8.90225079e-03,  5.23004448e-03,  8.73540435e-03, -6.78104861e-03,
       -6.38524070e-03, -2.89040967e-03, -1.07049178e-02,  1.61441334e-03,
       -1.56674650e-06, -9.87156178e-04, -2.67625583e-04, -6.23120880e-03,
       -5.60139120e-03, -9.13153752e-04, -2.20395788e-03,  6.27979590e-03,
        6.40170369e-03,  2.09620059e-03, -1.05487509e-03, -2.41118018e-03,
       -2.52927095e-03,  9.41846884e-05,  2.92055961e-03, -4.48991451e-03,
       -6.31928351e-03, -

#### Finding Similar Words

In [5]:
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('domains', 0.3927774727344513),
 ('better', 0.3206420838832855),
 ('work', 0.30348891019821167),
 ('field', 0.2972853183746338),
 ('intelligent', 0.2891329526901245),
 ('known', 0.2856188118457794),
 ('ai', 0.2846320867538452),
 ('robotics', 0.2799379825592041),
 ('future', 0.27848365902900696),
 ('may', 0.27276137471199036)]