# Data Preparation for Learning Embeddings with Continuous Bag of Words (CBOW)

In [1]:
import os

from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
from tqdm.notebook import tqdm

In [2]:
args = Namespace(
    raw_dataset_txt="data/books/frankenstein.txt",
    window_size=5,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data/books/frankenstein_with_splits.csv",
    seed=1337
)

In [3]:
# Split the raw text book into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # sentence tokenizer
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)

In [4]:
print (len(sentences), "sentences")
print ("Sample:", sentences[0])

3427 sentences
Sample: Frankenstein,

or the Modern Prometheus


by

Mary Wollstonecraft (Godwin) Shelley


Letter 1


St. Petersburgh, Dec. 11th, 17--

TO Mrs. Saville, England

You will rejoice to hear that no disaster has accompanied the
commencement of an enterprise which you have regarded with such evil
forebodings.


In [5]:
# Clean sentences
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [6]:
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]

In [7]:
cleaned_sentences[0]  

'frankenstein , or the modern prometheus by mary wollstonecraft godwin shelley letter st . petersburgh , dec . th , to mrs . saville , england you will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings . '

### Generate 11 ngrams from each sentence, and create \[context, target\] lists 
- nltk.ngarms (["<MASK\>", "<MASK\>", "<MASK\>", "<MASK\>", "<MASK\>", "frankenstein", ",", "or", "the", "modern", "prometheus", "by", ......, "evil", "forebodings", ".", "<MASK\>", "<MASK\>", "<MASK\>", "<MASK\>", "<MASK\>"', 11)
- 1st: [ "<MASK\>", "<MASK\>", "<MASK\>", "<MASK\>", "<MASK\>", <b>"frankenstein"</b>, ",", "or", "the", "modern", "prometheus"] 
- 2nd: [ "<MASK\>", "<MASK\>", "<MASK\>", "<MASK\>", "frankenstein", <b>","</b>, "or", "the", "modern", "prometheus", "by"] 
- Output from 1st: [", or the modern prometheus", "frankenstein"]
- Output from 2nd: ["frankenstein, or the modern prometheus by", ","]

In [8]:
# Global vars
MASK_TOKEN = "<MASK>"

In [9]:
# Create windows: use 11 ngrams
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([list(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + \
    [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1)) \
    for sentence in tqdm(cleaned_sentences)])

# Create cbow data
data = []
for window in tqdm(windows):
    target_token = window[args.window_size]
    context = []
    for i, token in enumerate(window):
        if token == MASK_TOKEN or i == args.window_size:
            continue
        else:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])
    
            
# Convert to dataframe
cbow_data = pd.DataFrame(data, columns=["context", "target"])

  0%|          | 0/3427 [00:00<?, ?it/s]

  0%|          | 0/90698 [00:00<?, ?it/s]

In [10]:
windows

[('<MASK>',
  '<MASK>',
  '<MASK>',
  '<MASK>',
  '<MASK>',
  'frankenstein',
  ',',
  'or',
  'the',
  'modern',
  'prometheus'),
 ('<MASK>',
  '<MASK>',
  '<MASK>',
  '<MASK>',
  'frankenstein',
  ',',
  'or',
  'the',
  'modern',
  'prometheus',
  'by'),
 ('<MASK>',
  '<MASK>',
  '<MASK>',
  'frankenstein',
  ',',
  'or',
  'the',
  'modern',
  'prometheus',
  'by',
  'mary'),
 ('<MASK>',
  '<MASK>',
  'frankenstein',
  ',',
  'or',
  'the',
  'modern',
  'prometheus',
  'by',
  'mary',
  'wollstonecraft'),
 ('<MASK>',
  'frankenstein',
  ',',
  'or',
  'the',
  'modern',
  'prometheus',
  'by',
  'mary',
  'wollstonecraft',
  'godwin'),
 ('frankenstein',
  ',',
  'or',
  'the',
  'modern',
  'prometheus',
  'by',
  'mary',
  'wollstonecraft',
  'godwin',
  'shelley'),
 (',',
  'or',
  'the',
  'modern',
  'prometheus',
  'by',
  'mary',
  'wollstonecraft',
  'godwin',
  'shelley',
  'letter'),
 ('or',
  'the',
  'modern',
  'prometheus',
  'by',
  'mary',
  'wollstonecraft',
  'god

In [11]:
# Create split data
n = len(cbow_data)
def get_split(row_num):
    if row_num <= n*args.train_proportion:
        return 'train'
    elif (row_num > n*args.train_proportion) and (row_num <= n*args.train_proportion + n*args.val_proportion):
        return 'val'
    else:
        return 'test'
cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)  # axis=1: apply function to columns axis

In [12]:
cbow_data.head()  # in cbow, predict the target "frankenstein" with the context words ", or the modern prometheus"

Unnamed: 0,context,target,split
0,", or the modern prometheus",frankenstein,train
1,frankenstein or the modern prometheus by,",",train
2,"frankenstein , the modern prometheus by mary",or,train
3,"frankenstein , or modern prometheus by mary wo...",the,train
4,"frankenstein , or the prometheus by mary wolls...",modern,train


In [13]:
cleaned_sentences[0]  

'frankenstein , or the modern prometheus by mary wollstonecraft godwin shelley letter st . petersburgh , dec . th , to mrs . saville , england you will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings . '

In [14]:
# Write split data to file
cbow_data.to_csv(args.output_munged_csv, index=False)