# Implementation Of Word Embedding 

### Import Important Packages

In [5]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

### Word Embedding Class

In [14]:
class Word_Embedding:
  """Turn raw sentences into (context vector, center-word vector) training pairs.

  Each sentence is lower-cased, tokenized on ``\\w+``, stripped of purely
  numeric tokens and scanned with a sliding window. For every window the
  middle token becomes the label (one-hot encoded) and the remaining tokens
  become the input (mean of their one-hot vectors).

  Attributes:
    window_size: total window width, ``2 * context_word_size + 1``.
    vocabulary: set of every token seen so far; it keeps growing across
      calls to ``preprocessing``.
  """

  def __init__(self, context_word_size):
    """Create an encoder using ``context_word_size`` words on each side of the center."""
    self.window_size = 2*context_word_size + 1
    self.vocabulary = set()


  def preprocessing(self, data):
    """Convert an iterable of sentences into vectorized training pairs.

    Args:
      data: iterable of strings, one sentence per item.

    Returns:
      dict with two equally long lists: ``'data'`` holds the averaged
      context vectors and ``'labels'`` the one-hot center-word vectors.
      Vector positions follow ``list(self.vocabulary)`` as it stands once
      all rows have been tokenized.
    """
    preprocessed_data = {'data': [], 'labels': []}
    tokenizer = RegexpTokenizer(r'\w+')
    for row in data:
      tokens = self.remove_digits(tokenizer.tokenize(row.lower()))
      self.add_to_vocabulary(tokens)
      contexts, centers = self.extract_data_from_window(tokens, self.window_size)
      preprocessed_data['data'] += contexts
      preprocessed_data['labels'] += centers

    # Vectorize only after every row was seen so all vectors share one
    # vocabulary; materialize the ordering once instead of twice per sample.
    vocabulary = list(self.vocabulary)
    preprocessed_data['data'] = [
        self.sent_to_vec(context, vocabulary)
        for context in preprocessed_data['data']
    ]
    preprocessed_data['labels'] = [
        self.word_to_vec(center, vocabulary)
        for center in preprocessed_data['labels']
    ]

    return preprocessed_data


  def add_to_vocabulary(self, list_):
    """Add every token of ``list_`` to ``self.vocabulary``."""
    self.vocabulary.update(list_)


  @staticmethod
  def remove_digits(list_):
    """Return a new list without the tokens made up solely of digits.

    Tokens that merely contain digits (e.g. ``'mp3'``) are kept.
    """
    return [item for item in list_ if not item.isdigit()]


  @staticmethod
  def extract_data_from_window(list_, window_size):
    """Slide a window over ``list_`` and split each one into context and center.

    Args:
      list_: list of tokens.
      window_size: number of tokens per window.

    Returns:
      tuple ``(contexts, centers)``: ``contexts[i]`` is the i-th window
      without its middle token and ``centers[i]`` is that middle token.
      Both lists are empty when ``list_`` is shorter than ``window_size``.
    """
    contexts, centers = [], []
    # Zero-based index of the middle element of an odd-sized window.
    center_offset = window_size // 2
    for start in range(len(list_) - window_size + 1):
      window = list_[start : start + window_size]
      centers.append(window.pop(center_offset))
      contexts.append(window)

    return contexts, centers


  @staticmethod
  def sent_to_vec(list_, vocabulary):
    """Return the mean of the one-hot vectors of the words in ``list_``.

    Args:
      list_: list of words, each of which must occur in ``vocabulary``.
      vocabulary: list of words defining the vector positions.

    Returns:
      float array of length ``len(vocabulary)``; all zeros when ``list_``
      is empty.

    Raises:
      ValueError: if a word is missing from ``vocabulary``.
    """
    sent_vec = np.zeros(len(vocabulary))
    if not list_:
      # No context words (e.g. a window of size 1): avoid dividing by zero.
      return sent_vec

    for word in list_:
      sent_vec += Word_Embedding.word_to_vec(word, vocabulary)

    return sent_vec / len(list_)

  @staticmethod
  def word_to_vec(word, vocabulary):
    """Return the one-hot integer vector of ``word`` with respect to ``vocabulary``.

    Args:
      word: the word to encode.
      vocabulary: list of words defining the vector positions.

    Raises:
      ValueError: if ``word`` is not in ``vocabulary``.
    """
    try:
      position = vocabulary.index(word)
    except ValueError:
      raise ValueError(f'word {word} not in vocabulary list') from None
    word_vec = np.zeros(len(vocabulary), int)
    word_vec[position] = 1
    return word_vec


In [15]:
# Demo: two context words on each side of the center gives a 5-token window
# (window_size = 2 * 2 + 1); preprocess a single example sentence.
model = Word_Embedding(context_word_size=2)
model.preprocessing(['Play the Instrumental Study on Vimeo'])

{'data': [array([0.  , 0.25, 0.25, 0.  , 0.25, 0.25]),
  array([0.25, 0.  , 0.25, 0.25, 0.25, 0.  ])],
 'labels': [array([1, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 1])]}