In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

#### *Toy example (the model must identify which words in a sentence are locations)*
---

##### Data & Preprocessing

In [2]:
# Toy training corpus: one raw, un-tokenized sentence per entry.
corpus = [
    "We always come to Paris",
    "The professor is from Australia",
    "I live in Stanford",
    "He comes from Taiwan",
    "The capital of Turkey is Ankara",
]

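    Common text-preprocessing steps (the `preprocessing` function below applies only lowercasing and whitespace tokenization):

    - tokenization
    - stop words removal
    - lowercasing
    - noise removal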

In [3]:
def preprocessing(sent):
    """Lowercase a sentence and split it into whitespace-separated tokens.

    Args:
        sent: The raw sentence as a string.

    Returns:
        A list of lowercase tokens (empty if the sentence has no
        non-whitespace characters).
    """
    lowered = sent.lower()
    tokens = lowered.split()
    return tokens
 
# Tokenize every sentence of the corpus with the same preprocessing routine.
train_sentences = list(map(preprocessing, corpus))

In [4]:
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

    location set:

In [5]:
# Lowercased location names; membership in this set defines the positive class.
locations = {"australia", "ankara", "paris", "stanford", "taiwan", "turkey"}

# One 0/1 label per token: 1 when the token is a known location, 0 otherwise.
train_labels = [
    [int(token in locations) for token in tokens]
    for tokens in train_sentences
]

In [6]:
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

    vocabulary from sentences:

In [7]:
# Collect every distinct token that appears in the training sentences.
vocab = {token for tokens in train_sentences for token in tokens}

vocab

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [8]:
# Reserve a token for out-of-vocabulary words; sent_to_ind falls back to its
# index when a word is not found in the mapping.
vocab.add("<unk>")

    Padding for window classification

In [9]:
# Reserve the padding token that add_padding uses as its default `pad` value.
vocab.add('<pad>')

In [10]:
def add_padding(sentence, window_size, pad='<pad>'):
    """Surround a token list with `window_size` padding tokens on each side.

    Args:
        sentence: List of tokens to pad.
        window_size: Number of `pad` tokens to put before and after the sentence.
        pad: The padding token to insert.

    Returns:
        A new list: padding, then the sentence's tokens, then padding.
    """
    edge = [pad] * window_size
    left_padded = edge + sentence
    return left_padded + edge

In [11]:
# Sets have no defined order, so sort the vocabulary to get deterministic indices.
words = sorted(vocab)

# Map each word to its position in the sorted list.
words_to_indx = dict(zip(words, range(len(words))))

In [12]:
words_to_indx

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

    converting sentences into sequence of indices

In [23]:
def sent_to_ind(sentence, words_indexes):
    """Convert a tokenized sentence into a list of vocabulary indices.

    Lookups go through the `words_indexes` argument, so the function works
    with any word-to-index mapping rather than depending on a notebook-level
    global.

    Args:
        sentence: Iterable of tokens (strings).
        words_indexes: Mapping from word to integer index. It must contain
            the key '<unk>' if `sentence` may hold words missing from it.

    Returns:
        A list with one index per token; tokens absent from `words_indexes`
        are mapped to the index of '<unk>'.

    Raises:
        KeyError: If a token is unknown and `words_indexes` has no '<unk>' entry.
    """
    # The '<unk>' lookup happens only for unknown words, so a mapping without
    # '<unk>' still works as long as every token is present in it.
    return [
        words_indexes[word if word in words_indexes else '<unk>']
        for word in sentence
    ]

In [24]:
# Round-trip check: encode the first training sentence, then decode it again.
indexed_sent = sent_to_ind(train_sentences[0], words_to_indx)
# `words` is the sorted list that words_to_indx was enumerated from, so
# indexing into it maps an index back to its word.
restored_sent = [words[indx] for indx in indexed_sent]

print('indexed sentence from train corpus: ', indexed_sent)
print('from indecis to words: ', restored_sent)

indexed sentence from train corpus:  [22, 2, 6, 20, 15]
from indecis to words:  ['we', 'always', 'come', 'to', 'paris']


In [26]:
# Encode every tokenized training sentence as a list of vocabulary indices.
train_indecies = [sent_to_ind(sentence, words_to_indx) for sentence in train_sentences]
train_indecies

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [31]:
import pprint

In [36]:
# Show the three parallel views of the training data, each followed by a
# separator line: raw tokens, per-token location labels, and index sequences.
for title, payload in (
    ('Train tokens:', train_sentences),
    ('Train labels for locations:', train_labels),
    ('Train sentences turned into sequence of indecies:', train_indecies),
):
    print(title)
    pprint.pprint(payload)
    print('-------------')

Train tokens:
[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]
-------------
Train labels for locations:
[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]
-------------
Train sentences turned into sequence of indecies:
[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]
-------------


##### Batching