In [1]:

"""
We use the following lines because we are running on Google Colab.
If you are running the notebook on a local computer, you don't need this cell.
"""
# Colab-only setup: mount Google Drive, then make the ATIS data folder the
# working directory so the relative paths used by later cells resolve.
import os

from google.colab import drive

MOUNT_POINT = '/content/gdrive'
DATA_DIR = '/content/gdrive/My Drive/finch/tensorflow2/spoken_language_understanding/atis/data'

drive.mount(MOUNT_POINT)
os.chdir(DATA_DIR)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Shell out to pip to install the `tf-nightly-2.0-preview` package
# (presumably the TensorFlow 2.0 nightly build -- confirm).
# NOTE(review): no version is pinned, so a re-run may install a different build.
!pip install tf-nightly-2.0-preview



In [0]:
from pathlib import Path
from collections import Counter

import numpy as np

In [0]:
"""
Make Vocabulary (Words, Intents and Slots)
"""

# Build the word / intent / slot vocabularies from the ATIS training file and
# write each one, most frequent entry first, to its own file under ../vocab.
Path('../vocab').mkdir(exist_ok=True)

counter_word = Counter()
counter_intent = Counter()
counter_slot = Counter()

# Explicit UTF-8: without it open() falls back to the platform's locale
# encoding, so the same notebook could behave differently across machines.
with open('../data/atis.train.w-intent.iob', encoding='utf-8') as f:
  for line in f:
    line = line.rstrip()
    if not line:
      # Nothing to parse on a blank line; the tab split below would fail on it.
      continue
    # Each line is "<sentence>\t<labels>".
    text, slot_intent = line.split('\t')
    # The first and last sentence tokens are dropped
    # (presumably BOS/EOS markers -- confirm against the data file).
    tokens = text.split()[1:-1]
    # Every all-digit token is collapsed into one shared '<digit>' symbol.
    tokens = ['<digit>' if w.isdigit() else w for w in tokens]
    slot_intent = slot_intent.split()
    # Label column: the first label is dropped (it pairs with the dropped
    # first token) and the final entry is the sentence-level intent.
    labels, intent = slot_intent[1:-1], slot_intent[-1]
    assert len(tokens) == len(labels), 'token/slot length mismatch: %r' % line

    counter_word.update(tokens)
    counter_intent.update([intent])
    counter_slot.update(labels)


def most_common(x):
  """Return the keys of Counter `x`, ordered from most to least frequent."""
  return [w for w, freq in x.most_common()]


# '<pad>' is placed first, so it ends up on the first line of word.txt.
words = ['<pad>'] + most_common(counter_word)
intents = most_common(counter_intent)
slots = most_common(counter_slot)

# One entry per line; the line order is the frequency order computed above.
for vocab_li, path in zip(
    [words, intents, slots],
    ['../vocab/word.txt', '../vocab/intent.txt', '../vocab/slot.txt']):
  with open(path, 'w', encoding='utf-8') as f:
    f.writelines(w + '\n' for w in vocab_li)

In [5]:
"""
Make Pretrained Embedding
"""
# Build the pretrained embedding matrix for the vocabulary in ../vocab/word.txt
# from the GloVe text file and save it to ../vocab/word.npy.
EMBED_DIM = 300  # vector width; matches the "300d" GloVe file read below

# A word's row index is its (0-based) line number in word.txt.
word2idx = {}
with open('../vocab/word.txt', encoding='utf-8') as f:
  for i, line in enumerate(f):
    word2idx[line.rstrip()] = i

# Rows default to zero, so vocabulary words absent from GloVe keep an all-zero
# vector. np.zeros defaults to float64; that dtype is kept because whatever
# loads word.npy is not visible here.
embedding = np.zeros((len(word2idx)+1, EMBED_DIM)) # + 1 for unknown word

# Explicit UTF-8: relying on the platform's locale encoding can make this read
# fail or mis-decode on machines whose default is not UTF-8.
with open('../data/glove.840B.300d.txt', encoding='utf-8') as f:
  count = 0
  for i, line in enumerate(f):
    if i % 100000 == 0:
      print('- At line {}'.format(i))
    line = line.rstrip()
    # Split on a literal space, not on arbitrary whitespace.
    # NOTE(review): presumably deliberate -- a bare str.split() would also
    # break on other Unicode whitespace inside a token; confirm before changing.
    sp = line.split(' ')
    word, vec = sp[0], sp[1:]
    if word in word2idx:
      count += 1
      # Parsed as float32, then stored (upcast) into the float64 matrix.
      embedding[word2idx[word]] = np.asarray(vec, dtype='float32')

print("[%d / %d] words have found pre-trained values"%(count, len(word2idx)))
np.save('../vocab/word.npy', embedding)
print('Saved ../vocab/word.npy')

- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
[725 / 749] words have found pre-trained values
Saved ../vocab/word.npy
