<a href="https://colab.research.google.com/github/yahyasungur/nlp_dl_ml_projects/blob/master/Train_the_model_for_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing the libraries

In [1]:
 import tensorflow as tf
 print(tf.__version__)

2.6.0


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import pandas as pd
import gensim
from gensim.models import Word2Vec, KeyedVectors

#Data preprocessing

In [4]:
# Google pretrained model
# https://www.kaggle.com/rootuser/worldnews-on-reddit

# install kaggle API
! pip install -q kaggle

In [5]:
# create a directory as kaggle
! mkdir -p ~/.kaggle

In [7]:
# copy API key to kaggle directory
! cp kaggle.json ~/.kaggle

In [8]:
# disable the API key
! chmod 600 /root/.kaggle/kaggle.json

In [9]:
# import the dataset
! kaggle datasets download -d rootuser/worldnews-on-reddit

Downloading worldnews-on-reddit.zip to /content
 64% 17.0M/26.6M [00:00<00:00, 37.2MB/s]
100% 26.6M/26.6M [00:00<00:00, 60.2MB/s]


In [10]:
# unzip the dataset
! unzip /content/worldnews-on-reddit

Archive:  /content/worldnews-on-reddit.zip
  inflating: reddit_worldnews_start_to_2016-11-22.csv  


In [11]:
df = pd.read_csv('/content/reddit_worldnews_start_to_2016-11-22.csv')

In [12]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [13]:
df.shape

(509236, 8)

In [14]:
# Take title column
news_titles = df['title'].values

In [15]:
news_titles

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border', ...,
       'Professor receives Arab Researchers Award',
       'Nigel Farage attacks response to Trump ambassador tweet',
       'Palestinian wielding knife shot dead in West Bank: Israel police'],
      dtype=object)

In [16]:
# tokanize the words
new_vec = [nltk.word_tokenize(title) for title in news_titles]

In [18]:
new_vec[0]

['Scores', 'killed', 'in', 'Pakistan', 'clashes']

#Building the model

In [19]:
model = Word2Vec(new_vec, min_count=1, size= 32)
# text, min word count, size of each vector

In [20]:
model

<gensim.models.word2vec.Word2Vec at 0x7f78e6f202d0>

#Predict the output

In [21]:
# this is how man is represented in our vec. space
model.wv['man']

array([ 0.5287505 , -4.591286  ,  3.2897542 ,  2.6874084 , -2.379098  ,
       -6.3917885 , -3.4140396 , -1.2572612 , -0.6393108 , -0.49477702,
        2.3877206 ,  1.3555714 , -2.0762777 ,  2.4935253 ,  2.9892993 ,
        0.6974515 ,  2.694891  ,  2.985051  ,  3.478391  , -4.3326845 ,
        2.517788  , -3.0983126 , -2.5817215 , -2.3182693 , -0.77152324,
        1.4536917 , -0.63020474,  2.3701575 , -1.3876622 ,  0.9905895 ,
       -3.2806592 , -3.4289536 ], dtype=float32)

In [22]:
#Find 10 closest words in the vector space we have created
model.wv.most_similar('man')

[('woman', 0.9723144173622131),
 ('girl', 0.9152591228485107),
 ('boy', 0.8874244093894958),
 ('couple', 0.8872882127761841),
 ('teacher', 0.8616616129875183),
 ('teenager', 0.8556163311004639),
 ('mother', 0.8528323173522949),
 ('doctor', 0.8494237661361694),
 ('father', 0.8221330642700195),
 ('daughter', 0.8193493485450745)]

In [23]:
# let's try the famous relationship
vec = model.wv['king'] - model.wv['man'] + model.wv['women']
model.wv.most_similar([vec])

[('office—and', 0.7965913414955139),
 ('parties', 0.7056441307067871),
 ('women', 0.6756975650787354),
 ('clerics', 0.6672356128692627),
 ('free-movement', 0.6571006774902344),
 ('campaigners', 0.6497793793678284),
 ('dialog', 0.636975884437561),
 ('Muslims', 0.6281842589378357),
 ('organisations', 0.6259411573410034),
 ('freedoms', 0.6215727925300598)]

In [24]:
#Relationship
vec = model.wv['Germany'] - model.wv['Berlin'] + model.wv['Paris']
model.wv.most_similar([vec])
# Germany - Capital of Germany + Capital of France => France :)

[('France', 0.8805805444717407),
 ('Germany', 0.8529587388038635),
 ('Belgium', 0.8399219512939453),
 ('Paris', 0.8355648517608643),
 ('Sweden', 0.829368531703949),
 ('Brussels', 0.7728636264801025),
 ('UK', 0.7484903931617737),
 ('Britain', 0.7471987009048462),
 ('Switzerland', 0.724898099899292),
 ('Norway', 0.7241060733795166)]

In [28]:
#Relationship
vec = model.wv['Messi'] - model.wv['Football'] + model.wv['Basketball']
model.wv.most_similar([vec])

[('monochrome', 0.7187941074371338),
 ('malaysian', 0.7055420875549316),
 ('casings', 0.6774667501449585),
 ('oar', 0.6737223267555237),
 ('92-meter-long', 0.6639619469642639),
 ('Gorleben', 0.6628297567367554),
 ('becouse', 0.6437209844589233),
 ('90x', 0.6422110199928284),
 ('X-2', 0.6378070116043091),
 ('million-worth', 0.6374385356903076)]