#### Creating Corpus

In [1]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article, 'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ''

for p in paragraphs:
    article_text += p.text

#### Preprocessing

In [2]:
# cleaning the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)
processed_article = re.sub('\s+', ' ', processed_article)

# preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# removing stop words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

#### Creating Word2Vec Model

In [3]:
from gensim.models import Word2Vec
word2vec = Word2Vec(all_words, min_count=2)



In [4]:
vocabulary = word2vec.wv.key_to_index
print(vocabulary)

{'ai': 0, 'intelligence': 1, 'artificial': 2, 'human': 3, 'research': 4, 'machine': 5, 'also': 6, 'learning': 7, 'machines': 8, 'use': 9, 'humans': 10, 'many': 11, 'knowledge': 12, 'would': 13, 'could': 14, 'problems': 15, 'problem': 16, 'ethics': 17, 'field': 18, 'researchers': 19, 'mind': 20, 'computer': 21, 'may': 22, 'systems': 23, 'approaches': 24, 'jobs': 25, 'data': 26, 'intelligent': 27, 'technology': 28, 'used': 29, 'general': 30, 'people': 31, 'one': 32, 'agent': 33, 'example': 34, 'risk': 35, 'symbolic': 36, 'mathematical': 37, 'goal': 38, 'ability': 39, 'statistical': 40, 'processing': 41, 'language': 42, 'particular': 43, 'even': 44, 'term': 45, 'theory': 46, 'computers': 47, 'understanding': 48, 'u': 49, 'world': 50, 'future': 51, 'different': 52, 'ethical': 53, 'behavior': 54, 'like': 55, 'form': 56, 'beings': 57, 'goals': 58, 'reasoning': 59, 'based': 60, 'networks': 61, 'system': 62, 'neural': 63, 'robotics': 64, 'natural': 65, 'often': 66, 'computing': 67, 'level': 68

### Model Analysis

#### Finding Vectors for a Word

In [5]:
v1 = word2vec.wv['artificial'] 
v1

array([-2.01807288e-03,  5.19455690e-03, -6.01227069e-03,  2.79025699e-04,
        8.17465037e-03,  1.22153980e-03, -2.31136871e-03,  9.91391018e-03,
       -1.17080137e-02,  5.79966931e-03, -5.70119685e-03, -7.25316815e-03,
        8.39317031e-03,  6.08909293e-04,  7.75027787e-03, -8.04186799e-03,
        5.67906909e-03,  6.05684286e-03, -9.20512620e-03, -1.22809596e-02,
       -5.42160450e-03, -3.03369667e-03, -2.64378032e-03, -1.04579916e-02,
        6.58931024e-03, -4.50932514e-03,  5.53296693e-03,  3.19430954e-03,
       -9.94195137e-03,  5.32524707e-03,  9.79914702e-03, -5.90901356e-03,
       -6.61846809e-03, -3.13766114e-03, -1.10205058e-02,  2.17727059e-03,
       -2.80537759e-04, -1.98995019e-03, -1.56764488e-03, -7.59794982e-03,
       -6.38032472e-03, -1.09685806e-03, -2.54757004e-03,  6.17608242e-03,
        7.39534851e-03,  1.81494234e-03, -2.04296596e-03, -2.54912861e-03,
       -2.43983581e-03,  1.08093908e-03,  2.56498647e-03, -4.24506236e-03,
       -6.75639091e-03, -

#### Finding Similar Words

In [6]:
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('domains', 0.3882658779621124),
 ('ai', 0.3679261803627014),
 ('better', 0.3447827100753784),
 ('work', 0.3444591164588928),
 ('may', 0.31940072774887085),
 ('field', 0.314007967710495),
 ('intelligent', 0.30112627148628235),
 ('ethics', 0.29427751898765564),
 ('would', 0.29421427845954895),
 ('robotics', 0.29368504881858826)]