In [1]:
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

In [2]:
def process_tweet(tweet):
  tweet=tweet.lower().split()
  return tweet

In [3]:
train_sentences=[process_tweet(sent) for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [4]:
location=set(['paris','australia','stanford','taiwan','turkey','ankara'])
train_labels=[[1 if word in location else 0 for word in sent] for sent in train_sentences]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [5]:
vocabulary=set(word for sen in train_sentences for word in sen)
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [7]:
vocabulary.add("<unk>")
vocabulary.add("<pad>")

In [9]:
def pad_window(sentence, window_size, pad_token="<pad>"):
  window = [pad_token] * window_size
  return window + sentence + window
#example
window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [14]:
ix_to_word=sorted(list(vocabulary))
word_to_ix={word: ind for ind,word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [17]:
def convert_token_to_indices(sentence,word_to_ix):
  indices=[]
  for word in sentence:
    if word in word_to_ix:
      indices.append(word_to_ix[word])
    else:
      indices.append(word_to_ix['<unk>'])
  return indices
#exam:
example_sentence = ["we", "always", "come", "to", "vietnam"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
restored_example = [ix_to_word[ind] for ind in example_indices]

print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

Original sentence is: ['we', 'always', 'come', 'to', 'vietnam']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [18]:
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [19]:
import torch
import torch.nn as nn

In [21]:
em_dim=5
embed=nn.Embedding(len(vocabulary),em_dim)
list(embedding.parameters())


[Parameter containing:
 tensor([[ 0.6160, -0.2080, -1.6142, -0.9373, -0.0055],
         [ 0.0614,  0.9033,  0.3914, -0.8155,  0.6953],
         [ 0.7198,  1.2353,  0.5529,  0.6118, -0.1846],
         [ 0.3185, -0.9831, -0.9280, -2.0062, -0.5723],
         [-1.7283,  0.1293,  1.0874, -0.6370,  0.0566],
         [ 0.1286, -0.2925,  0.0219,  0.2959, -1.9710],
         [-0.1609,  0.0992, -0.8642,  0.9061,  0.3893],
         [-2.8071,  1.0059, -0.4078,  1.9134, -0.6253],
         [-0.9858, -0.6747,  0.0793, -0.5238, -1.4572],
         [ 0.9852,  1.2978,  0.0392, -1.7899, -0.1735],
         [ 0.5650, -0.3109,  2.0656, -0.6265,  1.5252],
         [ 0.5104,  0.9330, -1.9530, -0.0903, -0.4083],
         [ 0.6649, -1.1141, -0.1920,  0.0625,  1.1340],
         [ 1.2755,  0.7030, -2.2760, -0.2597,  0.3915],
         [ 1.4285,  1.0132,  0.9300, -1.1664,  0.4304],
         [ 0.1188, -1.3351,  0.7976, -0.4192, -0.7418],
         [ 0.6926, -1.3075, -0.8733,  1.0079, -0.6871],
         [ 1.6813,  0.617

In [22]:
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embed(index_tensor)
paris_embed

tensor([ 0.4850,  1.7470,  0.0715, -1.5494,  0.4676],
       grad_fn=<EmbeddingBackward0>)

In [23]:
from torch.utils.data import DataLoader
from functools import partial


In [26]:
def _custom_collate_fn(batch, window_size, word_to_ix):
  # Prepare the datapoints
  x, y = zip(*batch)
  x = [pad_window(s, window_size=window_size) for s in x]
  x = [convert_token_to_indices(s, word_to_ix) for s in x]

  # Pad x so that all the examples in the batch have the same size
  pad_token_ix = word_to_ix["<pad>"]
  x = [torch.LongTensor(x_i) for x_i in x]
  x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

  # Pad y and record the length
  lengths = [len(label) for label in y]
  lenghts = torch.LongTensor(lengths)
  y = [torch.LongTensor(y_i) for y_i in y]
  y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

  return x_padded, y_padded, lenghts

In [27]:
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate the DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Go through one loop
counter = 0
for batched_x, batched_y, batched_lengths in loader:
  print(f"Iteration {counter}")
  print("Batched Input:")
  print(batched_x)
  print("Batched Labels:")
  print(batched_y)
  print("Batched Lengths:")
  print(batched_lengths)
  print("")
  counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0],
        [ 0,  0,  9,  7,  8, 18,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1],
        [0, 0, 0, 1]])
Batched Lengths:
tensor([4, 4])

Iteration 1
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0,  0],
        [ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1]])
Batched Lengths:
tensor([5, 6])

Iteration 2
Batched Input:
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1]])
Batched Lengths:
tensor([5])



In [29]:
print(f"Original Tensor: ")
print(batched_x)
print("")
chunk = batched_x.unfold(1, window_size*2 + 1, 1)
print(f"Windows: ")
print(chunk)

Original Tensor: 
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0]])

Windows: 
tensor([[[ 0,  0, 22,  2,  6],
         [ 0, 22,  2,  6, 20],
         [22,  2,  6, 20, 15],
         [ 2,  6, 20, 15,  0],
         [ 6, 20, 15,  0,  0]]])


In [30]:
class WordWindowClassifier(nn.Module):

  def __init__(self, hyperparameters, vocab_size, pad_ix=0):
    super(WordWindowClassifier, self).__init__()
    self.window_size = hyperparameters["window_size"]
    self.embed_dim = hyperparameters["embed_dim"]
    self.hidden_dim = hyperparameters["hidden_dim"]
    self.freeze_embeddings = hyperparameters["freeze_embeddings"]


    self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
    if self.freeze_embeddings:
      self.embed_layer.weight.requires_grad = False

    full_window_size = 2 * window_size + 1
    self.hidden_layer = nn.Sequential(
      nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
      nn.Tanh()
    )

    self.output_layer = nn.Linear(self.hidden_dim, 1)
    self.probabilities = nn.Sigmoid()

  def forward(self, inputs):

    B, L = inputs.size()
    token_windows = inputs.unfold(1, 2 * self.window_size + 1, 1)
    _, adjusted_length, _ = token_windows.size()
    assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)
    embedded_windows = self.embeds(token_windows)
    embedded_windows = embedded_windows.view(B, adjusted_length, -1)
    layer_1 = self.hidden_layer(embedded_windows)
    output = self.output_layer(layer_1)
    output = self.probabilities(output)
    output = output.view(B, -1)

    return output

In [32]:
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
model_hyperparameters = {
    "batch_size": 4,
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}

vocab_size = len(word_to_ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

#optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


def loss_function(batch_outputs, batch_labels, batch_lengths):
    bceloss = nn.BCELoss()
    loss = bceloss(batch_outputs, batch_labels.float())
    loss = loss / batch_lengths.sum().float()
    return loss

In [33]:
def train_epoch(loss_function, optimizer, model, loader):

  total_loss = 0
  for batch_inputs, batch_labels, batch_lengths in loader:
    optimizer.zero_grad()
    outputs = model.forward(batch_inputs)
    loss = loss_function(outputs, batch_labels, batch_lengths)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  return total_loss


def train(loss_function, optimizer, model, loader, num_epochs=10000):
  for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, loader)
    if epoch % 100 == 0: print(epoch_loss)

In [34]:
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

0.28650815784931183
0.2639816850423813
0.18189989030361176
0.14815066754817963
0.12561392225325108
0.08766193315386772
0.07109018601477146
0.06557122431695461
0.05123418290168047
0.04794576857239008


Prediction

In [49]:
test_corpus = ["The capital of Vietnam is Hanoi"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0,0,1,0,1]]
test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=2, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(test_data,
                                           batch_size=1,
                                           shuffle=False,
                                           collate_fn=collate_fn)

In [50]:
for test_instance, labels, _ in test_loader:
  outputs = model.forward(test_instance)
  print(labels)
  print(outputs)

tensor([[0, 0, 0, 1, 0, 1]])
tensor([[0.0567, 0.0586, 0.0673, 0.4645, 0.0547, 0.6439]],
       grad_fn=<ViewBackward0>)
