# Build Word2Vec

## Helpers

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from __future__ import print_function


import json
import os
import pandas
import io
import sys
import re


class ProgressBar(object):
    DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
    FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'

    def __init__(self, total, width=40, fmt=DEFAULT, symbol='=',
                 output=sys.stderr):
        assert len(symbol) == 1

        self.total = total
        self.width = width
        self.symbol = symbol
        self.output = output
        self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
            r'\g<name>%dd' % len(str(total)), fmt)

        self.current = 0

    def __call__(self):
        percent = self.current / float(self.total)
        size = int(self.width * percent)
        remaining = self.total - self.current
        bar = '[' + self.symbol * size + ' ' * (self.width - size) + ']'

        args = {
            'total': self.total,
            'bar': bar,
            'current': self.current,
            'percent': percent * 100,
            'remaining': remaining
        }
        print('\r' + self.fmt % args, file=self.output, end='')

    def done(self):
        self.current = self.total
        self()
        print('', file=self.output)

In [0]:
default_path = "drive/My Drive/Hindi_News/"

#-------------------------------save/load--------------------------------------#
pickle_path = default_path + "pickles/"

import pickle

def save(obj , filename):
  print("saving {} ..".format(filename))
  with open(filename, 'wb') as handle:
      pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
      
def load(filename):
  print("loading {} ..".format(filename))
  with open(filename, 'rb') as handle:
    return pickle.load(handle)
#-----------------------------------------------------------------------------------#  

## Word Embedding

In [0]:
!unzip "drive/My Drive/Hindi_News/HindiNewsBook.zip" -d  "drive/My Drive/Hindi_News/"

Archive:  drive/My Drive/Hindi_News/HindiNewsBook.zip
  inflating: drive/My Drive/Hindi_News/HindiNewsBook.csv  


In [0]:
default_path = "drive/My Drive/Hindi_News/"
reviews_csv = default_path + "HindiNewsBook.csv"

In [0]:
import pandas as pd
reviews = pd.read_csv(reviews_csv)
reviews.head()

Unnamed: 0,सेtext,title
0,पाकिस्तान से खबरें संगीन रंगीन हैं। पापा जरदार...,बिलावल-हिना का खूबसूरत घोटाला
1,"इन 5 बातों का हर भाई रखे ख्याल, नहीं तो रिश्तो...",Read useful articles about relationships from ...
2,छत्तीसगढ़ में इंसानियत को शर्मसार कर देने वाली...,जमीनी विवाद में भाभी और भतीजी को उतारा मौत के घाट
3,एनसीसी की गर्ल कैडेट को पोर्न वीडियो भेजने पर ...,NCC गर्ल कैडेट को पोर्न क्लिप भेजता था मेजर जन...
4,"देश, दुनिया, खेल, बिजनेस और बॉलीवुड में क्‍या ...",Breaking News:एक क्लिक में पढ़ें गुरुवार दिनभर...


In [0]:
documents = reviews["सेtext"] 
summary = reviews["title"]
print(len(documents))
print(len(summary))

60347
60347


In [0]:
clean_documents = []
print("loading documents...")
progress = ProgressBar(len(documents), fmt=ProgressBar.FULL)
for doc in documents:
  clean_documents.append(str(doc).split())
  progress.current += 1
  progress()
progress.done()
save(clean_documents , "clean_documents_ketab.pkl")


clean_summary = []
print("loading summaries...")
progress = ProgressBar(len(summary), fmt=ProgressBar.FULL)
for doc in summary:
  clean_summary.append(str(doc).split())
  progress.current += 1
  progress()
progress.done()
save(clean_summary , "clean_summary_ketab.pkl")

loading documents...




saving clean_documents_ketab.pkl ..
loading summaries...




saving clean_summary_ketab.pkl ..


In [0]:
print(clean_documents[0])

['पाकिस्तान', 'से', 'खबरें', 'संगीन', 'रंगीन', 'हैं।', 'पापा', 'जरदारी', 'तमाम', 'तरह', 'के', 'घोटालों', 'में', 'उलझे', 'हुए', 'हैं।', 'बेटे', 'बिलावल', 'जरदारी', 'उम्र', 'में', 'अपने', 'से', 'दस', 'साल', 'बड़ी', 'पाकिस्तानी', 'विदेश', 'मंत्री', 'हिना', 'रब्बानी', 'के', 'खूबसूरत', 'बालों', 'में', 'उलझे', 'हुए', 'हैं।', 'यह', 'भी', 'एक', 'घोटाला', 'है।', 'इस', 'तरह', 'के', 'घोटाले', 'होते', 'रहें,', 'तो', 'अखबार', 'पढ़ना,', 'टीवी', 'न्यूज', 'चैनल', 'देखना', 'इंटरेस्टिंग', 'हो', 'जाता', 'है', 'जी।', 'बिलावल', 'के', 'साथ', 'हिना', 'के', 'फोटू', 'छपें', 'रोज', 'अखबार', 'में,', 'तो', 'अखबार', 'की', 'विजुअल', 'अपील', 'बढ़', 'जाती', 'है।', 'इनके', 'फोटू', 'देखकर', 'ये', 'खबर', 'पढ़ने', 'की', 'हिम्मत', 'आ', 'जाती', 'है', 'कि', 'भारतीय', 'मंत्रियों', 'ने', 'विदेशी', 'यात्राओं', 'में', 'तय', 'रकम', 'से', '12', 'गुना', 'ज्यादा', 'खर्च', 'किया।', 'ये', 'सच्ची', 'खबर', 'परेशान', 'करती', 'है।', 'हिना', 'और', 'बिलावल', 'के', 'बारे', 'में', 'गॉसिप', 'ही', 'सही,', 'फोटोजेनिक', 'राहत', 'देती', 'है।', 'ह

In [0]:
print(clean_summary[0])

['बिलावल-हिना', 'का', 'खूबसूरत', 'घोटाला']


In [0]:
default_path

'drive/My Drive/Hindi_News/'

In [0]:
clean_documents_list = clean_documents + clean_summary

In [0]:
len(clean_documents_list)

120694

In [0]:
len(clean_documents_list[0][10])

2

In [0]:
import gensim
model_arabic_vec = gensim.models.Word2Vec(
        clean_documents_list,
        size=150,
        window=10,
        min_count=2,
        workers=10)
model_arabic_vec.train(clean_documents_list, total_examples=len(clean_documents_list), epochs=10)
model_arabic_vec.wv.save(default_path +"model_hindi.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
default_path

'drive/Colab Notebooks/Model 4_5/'

In [0]:
model_arabic_vec.wv.save('drive/My Drive/Colab Notebooks/Model 4_5/' +"model_arabic_extreme.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
from gensim.models import KeyedVectors
wv = KeyedVectors.load(default_path +"model_hindi.model", mmap='r')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
wv.most_similar(positive = "से")

  if np.issubdtype(vec.dtype, np.int):


[('उससे', 0.6536398530006409),
 ('इससे', 0.6046265363693237),
 ('इनसे', 0.5620524287223816),
 ('में', 0.5374647378921509),
 ('उनसे', 0.5367596745491028),
 ('authorised', 0.4722856879234314),
 ('blend', 0.4571074843406677),
 ('की', 0.44924241304397583),
 ('calculations', 0.44774317741394043),
 ('को', 0.4473797380924225)]

In [0]:
wv.most_similar(positive = "पाकिस्तान")

  if np.issubdtype(vec.dtype, np.int):


[('पाक', 0.8744723200798035),
 ('पाकिस्\u200dतान', 0.7458062171936035),
 ('अफगानिस्तान', 0.7245218753814697),
 ('पाकिस्तानी', 0.7201468348503113),
 ('चीन', 0.6863623857498169),
 ('पीओके', 0.6857386827468872),
 ('तालिबान', 0.6807665824890137),
 ('PAK', 0.6637619137763977),
 ('बांग्लादेश', 0.647534966468811),
 ('भारत', 0.6450281739234924)]

In [0]:
wv.most_similar(positive = "रंगीन")

  if np.issubdtype(vec.dtype, np.int):


[('जींस', 0.6590641140937805),
 ('रंग-बिरंगे', 0.6505199074745178),
 ('आकर्षक', 0.6378679275512695),
 ('कुर्ता', 0.6336793899536133),
 ('नीले', 0.6324629783630371),
 ('सफेद', 0.6312911510467529),
 ('रंग', 0.6301822066307068),
 ('पेंट', 0.625964879989624),
 ('रंग-बिरंगी', 0.6163167953491211),
 ('रंगों', 0.6155998706817627)]

## Build Vocab dict

In [0]:
!pip install nltk

In [0]:
import nltk
nltk.download('punkt')

In [0]:
from nltk.tokenize import word_tokenize
import collections

def build_dict(train_article_list,VOCAB_SIZE):
    vocab_counter = collections.Counter()

    progress = ProgressBar(len(train_article_list ), fmt=ProgressBar.FULL)
    for sentence in train_article_list :
        words = list()
        for word in word_tokenize(sentence):
            words.append(word)
        vocab_counter.update(words)
        progress.current += 1
        progress()
    progress.done()
    
    print ("Writing vocab file...")
    with open(os.path.join(pickle_path, "vocab"), 'w', encoding="utf-8") as writer:
      for word, count in vocab_counter.most_common(VOCAB_SIZE):
        writer.write(word + ' ' + str(count) + '\n')
    print ("Finished writing vocab file")
    return 

In [0]:
import pandas as pd
default_path = "drive/My Drive/Hindi_News/"
pickle_path = default_path + "pickles/"

reviews = pd.read_csv(default_path + "HindiNewsBook.csv")
reviews.shape
reviews.isnull().sum()
reviews = reviews.dropna()
reviews = reviews.reset_index(drop=True)
reviews.head()

In [0]:
Text = []
Summary = []

progress = ProgressBar(len(reviews), fmt=ProgressBar.FULL)

for index , row in reviews.iterrows():
  Text.append(row["सेtext"])
  Summary.append(row["title"])
  progress.current += 1
  progress()
progress.done()

In [0]:
build_dict(reviews.सेtext,200000)