# Advanced Natural Language Processing
---
## Assignment 1

#### Tejasvi Chebrolu
#### 2019114005

### Word Vectors Using CBOW


In [22]:
import json
import re
import numpy as np
from scipy import sparse
from tqdm import tqdm
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from google.colab import drive
import gensim

## Loading The Dataset
---

The dataset is stored as a JSON Lines file (one JSON object per line) and is loaded into a list of records.



In [23]:
def get_data_from_json(path):
  """Load a JSON Lines file (one JSON document per line).

  Args:
    path: Path of the file to read.

  Returns:
    A list with one parsed JSON object per non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle even if a line fails to parse;
  # the original left the file open.
  with open(path, encoding='utf-8') as f:
    # Blank lines (e.g. a stray newline at the end of the file) are skipped
    # because json.loads would reject them.
    return [json.loads(line) for line in f if line.strip()]

## Mounting The Drive

---

The dataset is stored on Google Drive, so the drive needs to be mounted before the file can be read.

In [24]:
def mount_drive():
  """Mount Google Drive at /content/drive via google.colab's drive.mount."""
  mount_point = '/content/drive'
  drive.mount(mount_point)

## Data Preprocessing
---

The data was cleaned as follows:
- Everything was converted into lowercase.
- Punctuation characters were replaced with spaces.

Two lists are returned: a flat list of every word, and a list of reviews in which each review is a list of its words.

In [25]:
def clean_data(data, length):
  """Lowercase reviews, replace punctuation with spaces and split into words.

  Args:
    data: Sequence of dicts, each holding the review under 'reviewText'.
    length: Number of leading records of `data` to process.

  Returns:
    A tuple (final_text, docs). `docs` has one list of words per processed
    review; `final_text` is the same words flattened into a single list.

  Raises:
    KeyError: If a processed record has no 'reviewText' key.
  """
  # Raw string: the original non-raw literal contained '\,', an invalid escape
  # sequence (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
  # The character set is unchanged - it still holds the backslash and the comma.
  punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~+='''
  # One translation table maps every punctuation character to a space, so each
  # review is cleaned in a single pass instead of one str.replace per hit.
  to_space = str.maketrans(punctuations, ' ' * len(punctuations))

  final_text = []
  docs = []
  for record in tqdm(data[:length]):
    sentence = record['reviewText'].lower().translate(to_space).split()
    docs.append(sentence)
    final_text.extend(sentence)

  return final_text, docs

## Generate The Contexts and Labels

Returns a generator that yields one (context, label) training pair for every word in the corpus.

In [26]:


def generate_data(corpus, window_size, V):
    """Yield one (context, target) training pair per word position.

    Args:
        corpus: Iterable of sentences, each an indexable sequence of word ids.
        window_size: Number of neighbouring words taken on each side.
        V: Number of classes passed to np_utils.to_categorical.

    Yields:
        Tuples (x, y): x is the single context list passed through
        sequence.pad_sequences with maxlen=2*window_size, y is the single
        target id passed through np_utils.to_categorical.
    """
    padded_width = 2 * window_size
    for sentence in corpus:
        n_tokens = len(sentence)
        for center, target in enumerate(sentence):
            # Clamp the window to the sentence instead of filtering indices.
            lo = max(center - window_size, 0)
            hi = min(center + window_size + 1, n_tokens)
            neighbours = [sentence[j] for j in range(lo, hi) if j != center]

            # Both calls receive a batch of exactly one sample.
            x = sequence.pad_sequences([neighbours], maxlen=padded_width)
            y = np_utils.to_categorical([target], V)
            yield (x, y)



## Entry Point

In [27]:
# Mount Google Drive first: the dataset path below lives under the mount point.
mount_drive()
# Parse the reviews file into a list of records (one json.loads per line).
data = get_data_from_json('/content/drive/MyDrive/Electronics_5.json')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Clean only the first 2000 reviews. all_text is the flat word list;
# docs holds one list of words per review.
all_text, docs = clean_data(data, 2000)

100%|██████████| 2000/2000 [00:00<00:00, 18327.30it/s]


## Hyperparameters

The tokenizer is fitted on the cleaned reviews and the hyperparameters are defined below.

In [29]:
# Fit ONE tokenizer and reuse it. The original created a fresh, unfitted
# Tokenizer() on every line, so texts_to_sequences produced empty sequences
# and word_index was empty (V == 1).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
corpus = tokenizer.texts_to_sequences(docs)
nb_samples = sum(len(s) for s in corpus)  # one training sample per word
dim = 100        # embedding dimensionality
window_size = 2  # context words taken on each side of the target
epochs = 1
# +1 because Keras word indices start at 1; index 0 is reserved (padding).
V = len(tokenizer.word_index) + 1

In [30]:
# CBOW network: embed the 2*window_size context ids, average the embeddings
# over the context axis, then score every vocabulary word with a softmax.
cbow = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)),
    Dense(V, activation='softmax'),
])

In [32]:
# Categorical cross-entropy pairs with the softmax output layer and the
# to_categorical targets produced by generate_data.
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [33]:
for ite in range(epochs):
    loss = 0.
    # generate_data yields exactly one batch per word, so nb_samples is the
    # iteration count; passing it lets tqdm show a percentage and an ETA.
    for x, y in tqdm(generate_data(corpus, window_size, V), total=nb_samples):
        loss += cbow.train_on_batch(x, y)
    # Report the summed loss for the epoch; the original accumulated it but
    # never displayed it.
    print(ite, loss)

227505it [29:45, 127.43it/s]

0 2099166.995997429





In [35]:
# Write the first line of the embeddings file: "<number of words> <dim>".
# V - 1 equals len(word_index), i.e. the number of words that get a vector.
# The handle is left open on purpose: the next cell writes the vectors and
# closes it.
vector_file = open('/content/drive/MyDrive/embeddings1.txt', 'w')
vector_file.write(f'{V - 1} {dim}\n')

10

In [36]:
# Row i of the first weight matrix (the Embedding layer) is the vector for
# the word with index i.
vectors = cbow.get_weights()[0]

# The original iterated Tokenizer().word_index on a fresh, unfitted tokenizer,
# which is empty, so no vectors were ever written. Fitting on the same `docs`
# reproduces the word -> index mapping the corpus was encoded with.
fitted_tokenizer = Tokenizer()
fitted_tokenizer.fit_on_texts(docs)
word_index = fitted_tokenizer.word_index

# Fail loudly if the model was built for a different vocabulary size rather
# than writing a truncated or misaligned embeddings file.
assert vectors.shape[0] == len(word_index) + 1, (
    'embedding matrix has {} rows but the vocabulary needs {}'.format(
        vectors.shape[0], len(word_index) + 1))

for word, i in word_index.items():
    vector_file.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i, :]))))
vector_file.close()

In [37]:
# Reload the text-format embeddings file written above as gensim KeyedVectors
# so that similarity queries can be run on it.
w2v = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/embeddings1.txt', binary=False)

## Evaluating The Model

## The words chosen are:

- laptop (noun)
- camera (noun)
- weak (adjective)
- buy (verb)
- amazing (adjective)
- tv (noun)

For each word, the word itself is printed, followed by its 10 closest words.

In [42]:
# Query words, then one ranked list of nearest neighbours per word.
WORDS = ['laptop', 'camera', 'weak', 'buy', 'amazing', 'tv']
divider = "-" * 20
print(divider)
for word in WORDS:
  neighbours = w2v.most_similar(positive=[word])
  print("The word is: ", word)
  print(divider)
  # Each hit's first element is the neighbouring word; ranks start at 1.
  for rank, hit in enumerate(neighbours, start=1):
    print(f"{rank}. {hit[0]}")
  print(divider)

--------------------
The word is:  laptop
--------------------
1. thatcher
2. endemic
3. einkconclusion
4. carry
5. sideload
6. fiber
7. coverage
8. annoyance
9. manuals
10. crack
--------------------
The word is:  camera
--------------------
1. confidence
2. entirely
3. shutter
4. 63
5. 283
6. booksyou
7. togther
8. slight
9. lyrics
10. clunkiness
--------------------
The word is:  weak
--------------------
1. okvery
2. purely
3. avoiding
4. regretted
5. hear
6. sides
7. thanwe
8. road4
9. elite
10. bookmark
--------------------
The word is:  buy
--------------------
1. remembered
2. needs
3. retailed
4. neglected
5. burger
6. plywood
7. that
8. pieces
9. googled
10. kunu
--------------------
The word is:  amazing
--------------------
1. displaying
2. printer
3. bags
4. nights
5. monkey
6. conclusion
7. marware
8. eyesight
9. chosen
10. football
--------------------
The word is:  tv
--------------------
1. tricks
2. different
3. album
4. t101mt
5. superlative
6. settings
7. cleaned
8.

---
