# 1. Basic Code 

In [2]:
# Import the Tokenizer class, which splits text into words and indexes them by frequency
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Small training corpus used to build the demo vocabulary.
t1 = [
    "You are my heart",
    "You are my life",
    "I love u so much",
]

In [4]:
# Create the tokenizer. Index 1 is reserved for the out-of-vocabulary token;
# the remaining words are indexed by descending frequency, with ties kept in
# order of first appearance in the text.
tokenizer = Tokenizer(oov_token="<OOV>")

# Learn the vocabulary from the training sentences.
tokenizer.fit_on_texts(t1)

# Mapping of word -> integer index; every word is stored lower-cased.
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'are': 3,
 'heart': 5,
 'i': 7,
 'life': 6,
 'love': 8,
 'much': 11,
 'my': 4,
 'so': 10,
 'u': 9,
 'you': 2}

In [5]:
# Encode unseen sentences with the vocabulary learned from t1.
# The raw strings keep their own name so `t2` is not rebound from text to
# integer sequences within the same cell.
t2_texts = [
    "i hate you",
    "i love u but can't touch your heart",
    "i am sorry",
]

# texts_to_sequences only looks words up; it does NOT fit or extend the
# vocabulary. Words that were not seen during fit_on_texts map to the
# <OOV> index (1).
t2 = tokenizer.texts_to_sequences(t2_texts)

# The vocabulary is therefore unchanged from the fitted one.
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'are': 3,
 'heart': 5,
 'i': 7,
 'life': 6,
 'love': 8,
 'much': 11,
 'my': 4,
 'so': 10,
 'u': 9,
 'you': 2}

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The encoded sequences have different lengths, so they need padding
# (similar to padding in a CNN) to form one matrix that is easy to compute on.
print(t2)

# padding="post" appends the zeros after each sequence; the default ("pre")
# places them in front.
padded = pad_sequences(t2, padding="post")
print(padded)

[[7, 1, 2], [7, 8, 9, 1, 1, 1, 1, 5], [7, 1, 1]]
[[7 1 2 0 0 0 0 0]
 [7 8 9 1 1 1 1 5]
 [7 1 1 0 0 0 0 0]]


# 2. Real Case

In [12]:
# Download the sarcasm headlines dataset to /tmp.
# TLS certificate verification is left enabled (wget's default): the host
# serves a valid certificate, so --no-check-certificate is not needed.
!wget \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2022-03-06 16:02:12--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.157.128, 142.251.8.128, 64.233.188.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.157.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2022-03-06 16:02:13 (147 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [14]:
# Read the downloaded JSON file into a list of records.
import json

# Pin the encoding: JSON is UTF-8 text, while open() would otherwise fall
# back to the platform's locale encoding.
with open("/tmp/sarcasm.json", encoding="utf-8") as f:
    data = json.load(f)

# Peek at the first few records only.
data[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [23]:
# Collect the headline text of every record.
sentences = [record["headline"] for record in data]

# Fit a fresh tokenizer on the headlines; unseen words map to the OOV token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode each headline as a list of word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [22]:
# Zero-pad each encoded headline at the end so every row has the same length.
padded = pad_sequences(sequences, padding="post")
padded

array([[  308, 15115,   679, ...,     0,     0,     0],
       [    4,  8435,  3338, ...,     0,     0,     0],
       [  145,   838,     2, ...,     0,     0,     0],
       ...,
       [10735,     9,    68, ...,     0,     0,     0],
       [ 1541,   392,  4164, ...,     0,     0,     0],
       [29656,  1647,     6, ...,     0,     0,     0]], dtype=int32)