# Build Word2Vec

## Helpers

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from __future__ import print_function

import json
import os
import pandas
import io
import sys
import re

class ProgressBar(object):
    DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
    FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'

    def __init__(self, total, width=40, fmt=DEFAULT, symbol='=',
                 output=sys.stderr):
        assert len(symbol) == 1

        self.total = total
        self.width = width
        self.symbol = symbol
        self.output = output
        self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
            r'\g<name>%dd' % len(str(total)), fmt)

        self.current = 0

    def __call__(self):
        percent = self.current / float(self.total)
        size = int(self.width * percent)
        remaining = self.total - self.current
        bar = '[' + self.symbol * size + ' ' * (self.width - size) + ']'

        args = {
            'total': self.total,
            'bar': bar,
            'current': self.current,
            'percent': percent * 100,
            'remaining': remaining
        }
        print('\r' + self.fmt % args, file=self.output, end='')

    def done(self):
        self.current = self.total
        self()
        print('', file=self.output)

In [0]:
default_path = "drive/My Drive/Amhari/"

#-------------------------------save/load--------------------------------------#
pickle_path = default_path + "pickles/"

import pickle

def save(obj , filename):
  print("saving {} ..".format(filename))
  with open(filename, 'wb') as handle:
      pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
      
def load(filename):
  print("loading {} ..".format(filename))
  with open(filename, 'rb') as handle:
    return pickle.load(handle)
#-----------------------------------------------------------------------------------#  

## Word Embedding

In [0]:
default_path = "drive/My Drive/Amhari/"
reviews_csv = default_path + "amhari_cleaner.csv"

In [0]:
import pandas as pd
reviews = pd.read_csv(reviews_csv)
reviews.head()

Unnamed: 0,title,content
0,ሰንደቅ፡ዓላማችንን,አረንጓዴ፡ብጫና፡ቀይ፡ሰንደቅ፡ዓላማችን፡ የአንድ፡ኢትዮጵያና፡ የነፃነት፡ ም...
1,ለዚህ መንግሥት ታላቅ ክብር አለኝ,ለኢትዮጵያ የጥምር መንግስት ያስፈልጋታል በማለት በ ምርጫ አጥብቆ ሲከራከ...
2,ግራውንድ ሲቀነስ አንድ,"ቀጭኑ ዘቄራ , ለአዲስ አበባ ከተማ ክብር ያላችሁ እንስማማለን። አዲስ ..."
3,ዶላር አዘዋዋሪዎቹ ደላሎች,ህወሓት የአጋር ፓርቲዎቹን የንግድ ተቋማትና ልማታዊ ባለሀብት እያለ የሚጠ...
4,ታላቅ ዕድል ነው,ዛሬ አርብ ልክ ከጠዋቱ ፡ ላይ የኢህአዴግ ወኪሎችና ኢህአዴግን እንደሚደ...


In [0]:
documents = reviews["content"] 
summary = reviews["title"]
print(len(documents))
print(len(summary))

53953
53953


In [0]:
import re
text = "ምልክታችን፡ነው።አረንጓዴው፡"

print(text)

ምልክታችን ፡ ነው ። አረንጓዴው ፡ 


In [0]:
def amhari_splitter(doc):
  hah = u'፡'
  space_hah = u' ፡ '
  doc = re.sub(hah,space_hah, doc)
  hah = u'።'
  space_hah = u' ። '
  doc = re.sub(hah,space_hah, doc)
  hah = u'፣'
  space_hah = u' ፣ '
  doc = re.sub(hah,space_hah, doc)
  hah = u'፤'
  space_hah = u' ፤ '
  doc = re.sub(hah,space_hah, doc)
  doc = doc.split(" ")
  return doc
amhari_splitter("ምልክታችን፡ነው።አረንጓዴው፡ ምልክታችን፡ነው።አረንጓዴው፡")

['ምልክታችን',
 '፡',
 'ነው',
 '።',
 'አረንጓዴው',
 '፡',
 '',
 'ምልክታችን',
 '፡',
 'ነው',
 '።',
 'አረንጓዴው',
 '፡',
 '']

In [0]:
clean_documents = []
print("loading documents...")
progress = ProgressBar(len(documents), fmt=ProgressBar.FULL)
for doc in documents:
  clean_documents.append(amhari_splitter(str(doc)))
  progress.current += 1
  progress()
progress.done()
save(clean_documents , "clean_documents_ketab.pkl")


clean_summary = []
print("loading summaries...")
progress = ProgressBar(len(summary), fmt=ProgressBar.FULL)
for doc in summary:
  clean_summary.append(amhari_splitter(str(doc)))
  progress.current += 1
  progress()
progress.done()
save(clean_summary , "clean_summary_ketab.pkl")

loading documents...




saving clean_documents_ketab.pkl ..
loading summaries...




saving clean_summary_ketab.pkl ..


In [0]:
Text = []
Summary = []

progress = ProgressBar(len(clean_summary), fmt=ProgressBar.FULL)

i=0
for sum_ in clean_summary:
  Text.append(" ".join(clean_documents[i]))
  Summary.append(" ".join(clean_summary[i]))
  i+=1
  progress.current += 1
  progress()
progress.done()



In [0]:
Data = {
        'Text':  Text,
        'Summary': Summary
        }

df = pd.DataFrame(Data, columns = ['Text','Summary'])
df.to_csv("drive/My Drive/Amhari/Amhari_clean_tokenized.csv")
df.head()

Unnamed: 0,Text,Summary
0,አረንጓዴ ፡ ብጫና ፡ ቀይ ፡ ሰንደቅ ፡ ዓላማችን ፡ የአንድ ፡ ኢትዮጵ...,ሰንደቅ ፡ ዓላማችንን
1,ለኢትዮጵያ የጥምር መንግስት ያስፈልጋታል በማለት በ ምርጫ አጥብቆ ሲከራከ...,ለዚህ መንግሥት ታላቅ ክብር አለኝ
2,"ቀጭኑ ዘቄራ , ለአዲስ አበባ ከተማ ክብር ያላችሁ እንስማማለን ። አዲ...",ግራውንድ ሲቀነስ አንድ
3,ህወሓት የአጋር ፓርቲዎቹን የንግድ ተቋማትና ልማታዊ ባለሀብት እያለ የሚጠ...,ዶላር አዘዋዋሪዎቹ ደላሎች
4,ዛሬ አርብ ልክ ከጠዋቱ ፡ ላይ የኢህአዴግ ወኪሎችና ኢህአዴግን እንደ...,ታላቅ ዕድል ነው


In [0]:
print(clean_documents[10:20])

[['የኢትዮጵያ', 'የመከላከያ', 'ኃይል', 'በወገኖቹ', 'የሚፈቀር', '፣', '', 'የሚከበርና', 'አስፈላጊው', 'ሁሉ', 'ድጋፍ', 'የሚደረግለት', 'የህዝብና', 'የአገር', 'አለኝታ', 'ነው', '።', '', 'ክቡር', 'ህይወታቸውን', 'ሳይሳሱ', 'ላገራቸው', 'ሲሉ', 'አሳልፈው', 'የሚሰጡትን', 'ውድ', 'የኢትዮጵያ', 'ልጆች', 'የምናከብራቸውና', 'ልንከባከባቸው', 'የሚገባው', 'ልክ', 'ያይናችን', 'ብሌን', 'ያህል', 'ነው', '።', '', 'ክቡር', 'ለውድ', 'ኢትዮጵያዊ', 'ሰማዕታት', 'ሲባል', 'እንዲሁ', 'መፈክር', 'ለማለት', 'ሳይሆን', 'ከልባችን', '፣', '', 'ከውስጣችን', '፣', '', 'የመስዋዕትነታቸውን', 'ክብር', 'ከቶውንም', 'ልንዘነጋው', 'የሚቻለን', 'ባለመሆኑ', 'ነው', '።', '', 'ሌላ', 'ምክንያት', 'የለውም', '።', '', 'በቃ', ',', 'የምንኮራበት', 'መከላከያ', 'እንዲኖረን', 'አጥብቀን', 'ስለምንመኝ', 'ያገር', 'ኩራት', 'የሆነው', 'መከላከያችን', 'በክልል', '፣', '', 'በዞንና', 'በጎጥ', 'አንሶና', 'የተዋረደ', 'ስብዕና', 'ተላብሶ', 'እንዲዋቀር', 'ኢትዮጵያውያን', 'አይፈቅዱም', '።', '', 'አገራችን', 'ከምትገኝበት', 'የጂኦ', 'ፖለቲካል', 'አቀማመጥ', 'አንጻር', 'ብሄራዊ', 'ስብዕና', 'ያለው', '፣', '', 'ህዝብንና', 'አገርን', 'የሚያስቀድም', 'የመከላከያ', 'ኃይል', 'ሊኖረን', 'ይገባልና', '፡', '', '፡', '', 'በጎጥና', 'በዘር', 'ፖለቲካ', 'የተተበተበ', 'አመለካከት', 'ያላቸው', 'ክፍሎች', 'በመከላከያ', 'ሃይላችን', 'እንዲቀልዱበት', 'አንፈቅድም', '።', '', 'ክብሩንና', 'ሞ

In [0]:
clean_documents_list = clean_documents + clean_summary

In [0]:
len(clean_documents_list)

107906

In [0]:
len(clean_documents_list[0][10])

2

In [0]:
import gensim
model_arabic_vec = gensim.models.Word2Vec(
        clean_documents_list,
        size=150,
        window=10,
        min_count=2,
        workers=10)
model_arabic_vec.train(clean_documents_list, total_examples=len(clean_documents_list), epochs=10)
model_arabic_vec.wv.save(default_path +"model_amhari.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
default_path

'drive/Colab Notebooks/Model 4_5/'

In [0]:
model_arabic_vec.wv.save('drive/My Drive/Colab Notebooks/Model 4_5/' +"model_arabic_extreme.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
from gensim.models import KeyedVectors
wv = KeyedVectors.load(default_path +"model_amhari.model", mmap='r')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
wv.most_similar(positive = "ሰንደቅ")

  if np.issubdtype(vec.dtype, np.int):


[('ሠንደቅ', 0.7856553792953491),
 ('የሰንደቅ', 0.7219874262809753),
 ('ባንዲራ', 0.7212193012237549),
 ('በሰንደቅ', 0.6958413124084473),
 ('ሰንድቅ', 0.6467265486717224),
 ('ሰንደቀ', 0.6370008587837219),
 ('ስንደቅ', 0.6261555552482605),
 ('ሲውለበለብ', 0.6252806186676025),
 ('ሰንደቃ', 0.595197319984436),
 ('ሰነደቅ', 0.594842255115509)]

In [0]:
wv.most_similar(positive = "የሚከበርና")

  if np.issubdtype(vec.dtype, np.int):


[('የሚፈራ', 0.56520015001297),
 ('ለሃይማኖቱ', 0.5293850898742676),
 ('የማይፈራ', 0.5136642456054688),
 ('ፈርሃ', 0.4975281357765198),
 ('አምላኩን', 0.4966278672218323),
 ('ለህሊናው', 0.49339208006858826),
 ('በአምላኩ', 0.4866925776004791),
 ('አስተዋይም', 0.47913244366645813),
 ('ጠቢብ', 0.47402289509773254),
 ('ሩህሩህ', 0.4598575532436371)]

In [0]:
wv.most_similar(positive = "ፖም")

  if np.issubdtype(vec.dtype, np.int):


[('ቅባት', 0.7452839612960815),
 ('ካልሲየም', 0.7418216466903687),
 ('አሲድ', 0.7347085475921631),
 ('ፓፓያ', 0.7292129397392273),
 ('ስብ', 0.7249459028244019),
 ('ቆዳን', 0.7228017449378967),
 ('አቮካዶ', 0.7200249433517456),
 ('ኦቾሎኒ', 0.7167451977729797),
 ('ጉበት', 0.7144170999526978),
 ('በቫይታሚን', 0.701026201248169)]

In [0]:
wv.most_similar(positive = "ጠጣ") #apple

  if np.issubdtype(vec.dtype, np.int):


[('እጠጣለሁ', 0.7217596769332886),
 ('ጠጪ', 0.7116069197654724),
 ('አይጠጣም', 0.7000353336334229),
 ('ዱለት', 0.696423351764679),
 ('ይጠጣል', 0.6935624480247498),
 ('ጠላውን', 0.6934843063354492),
 ('ቢራውን', 0.6891821622848511),
 ('ጠጥተህ', 0.6879841685295105),
 ('ቆሎውን', 0.6856545805931091),
 ('አረቄ', 0.6807674765586853)]

In [0]:
wv.most_similar(positive = "ግብጽ") #egypt

  if np.issubdtype(vec.dtype, np.int):


[('ግብፅ', 0.8652640581130981),
 ('ግብጽን', 0.7479394674301147),
 ('ኢራን', 0.6770453453063965),
 ('ቱርክ', 0.670718789100647),
 ('ፓኪስታን', 0.6571879982948303),
 ('ሩሲያ', 0.6370677351951599),
 ('ሱማሊያ', 0.6299570202827454),
 ('ከግብጽ', 0.6282040476799011),
 ('የግብጽ', 0.627738893032074),
 ('ግብፅም', 0.6252204775810242)]

## Build Vocab dict

In [0]:
!pip install nltk

In [0]:
import nltk
nltk.download('punkt')

In [0]:
from nltk.tokenize import word_tokenize
import collections

def build_dict(train_article_list,VOCAB_SIZE):
    vocab_counter = collections.Counter()

    progress = ProgressBar(len(train_article_list ), fmt=ProgressBar.FULL)
    for sentence in train_article_list :
        words = list()
        for word in word_tokenize(sentence):
            words.append(word)
        vocab_counter.update(words)
        progress.current += 1
        progress()
    progress.done()
    
    print ("Writing vocab file...")
    with open(os.path.join(pickle_path, "vocab"), 'w', encoding="utf-8") as writer:
      for word, count in vocab_counter.most_common(VOCAB_SIZE):
        writer.write(word + ' ' + str(count) + '\n')
    print ("Finished writing vocab file")
    return 

In [0]:
import pandas as pd
default_path = "drive/My Drive/Hindi_News/"
pickle_path = default_path + "pickles/"

reviews = pd.read_csv(default_path + "HindiNewsBook.csv")
reviews.shape
reviews.isnull().sum()
reviews = reviews.dropna()
reviews = reviews.reset_index(drop=True)
reviews.head()

In [0]:
Text = []
Summary = []

progress = ProgressBar(len(reviews), fmt=ProgressBar.FULL)

for index , row in reviews.iterrows():
  Text.append(row["सेtext"])
  Summary.append(row["title"])
  progress.current += 1
  progress()
progress.done()

In [0]:
build_dict(reviews.सेtext,200000)