In [115]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import json

In [100]:
with open('../data/sacarsm.json') as f:
    datastore = json.load(f)

sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

sentences

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages",
 'airline passengers tackle man who rushes cockpit in bomb threat',
 'facebook reportedly working on healthcare features and apps',
 "north korea praises trump and urges us voters to reject 'dull hillary'",
 "actually, cnn's jeffrey lord has been 'indefensible' for a while",
 'barcelona holds huge protest in su

In [120]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

In [121]:
tokenizer.word_index

{'to': 1,
 'of': 2,
 'the': 3,
 'in': 4,
 'for': 5,
 'a': 6,
 'on': 7,
 'and': 8,
 'with': 9,
 'is': 10,
 'new': 11,
 'trump': 12,
 'man': 13,
 'from': 14,
 'at': 15,
 'about': 16,
 'you': 17,
 'this': 18,
 'by': 19,
 'after': 20,
 'up': 21,
 'out': 22,
 'be': 23,
 'how': 24,
 'as': 25,
 'it': 26,
 'that': 27,
 'not': 28,
 'are': 29,
 'your': 30,
 'his': 31,
 'what': 32,
 'he': 33,
 'all': 34,
 'just': 35,
 'who': 36,
 'has': 37,
 'will': 38,
 'more': 39,
 'one': 40,
 'into': 41,
 'report': 42,
 'year': 43,
 'why': 44,
 'have': 45,
 'area': 46,
 'over': 47,
 'donald': 48,
 'u': 49,
 'day': 50,
 'says': 51,
 's': 52,
 'can': 53,
 'first': 54,
 'woman': 55,
 'time': 56,
 'like': 57,
 'her': 58,
 "trump's": 59,
 'old': 60,
 'no': 61,
 'get': 62,
 'off': 63,
 'an': 64,
 'life': 65,
 'people': 66,
 'obama': 67,
 'now': 68,
 'house': 69,
 'still': 70,
 "'": 71,
 'women': 72,
 'make': 73,
 'was': 74,
 'than': 75,
 'white': 76,
 'back': 77,
 'my': 78,
 'i': 79,
 'clinton': 80,
 'down': 81,
 'i

In [122]:
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[307, 15114, 678, 3336, 2297, 47, 381, 2575, 15115, 5, 2576, 8433],
 [3, 8434, 3337, 2745, 21, 1, 165, 8435, 415, 3111, 5, 257, 8, 1001],
 [144, 837, 1, 906, 1748, 2092, 581, 4718, 220, 142, 38, 45, 1, 10735],
 [1484, 35, 223, 399, 1, 1831, 28, 318, 21, 9, 2923, 1392, 6968, 967],
 [766, 718, 4719, 907, 10736, 622, 593, 4, 3, 94, 1308, 91],
 [10737, 3, 364, 72],
 [3, 6969, 350, 5, 460, 4273, 2194, 1485],
 [18, 478, 38, 1167, 30, 154, 1, 98, 82, 17, 157, 5, 31, 351],
 [248, 3622, 6970, 554, 5273, 1994, 140],
 [2093, 325, 346, 400, 59, 15116, 5, 3, 3895],
 [2924, 1679, 4720, 13, 36, 4274, 6971, 4, 2094, 1102],
 [285, 781, 461, 7, 1555, 1910, 8, 3623],
 [233, 513, 2925, 12, 8, 928, 225, 368, 1, 4275, 15117, 8436],
 [237, 3896, 8437, 3338, 37, 234, 15118, 5, 6, 172],
 [15119, 1393, 664, 650, 4, 326, 2, 1030],
 [533, 2094, 10738, 122, 10739, 5, 10740, 4721, 1911],
 [2577,
  1394,
  382,
  44,
  3897,
  347,
  318,
  1031,
  1,
  23,
  15120,
  19,
  1103,
  386,
  102,
  1309],
 [1680, 8438

In [119]:
padded = pad_sequences(sequences, value=0, padding='post', truncating='post')
padded.shape

(26709, 40)

In [117]:
tokenizer.sequences_to_texts(padded.tolist())

["former versace store clerk sues over secret 'black code' for minority shoppers <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>",
 "the 'roseanne' revival catches up to our thorny political mood for better and worse <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>",
 "mom starting to fear son's web series closest thing she will have to grandchild <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>",
 'boehner just wants wife to listen not come up with alternative debt reduction ideas <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>',
 'j k rowling wishes 

['<OOV> <OOV> me']

In [135]:

sentences = ['baby i like this',
             'just  me  '
            ]

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
word_index

{'<OOV>': 1, 'baby': 2, 'i': 3, 'like': 4, 'this': 5, 'just': 6, 'me': 7}

In [136]:
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)

[2 3 4 5]
(2, 4)


In [138]:
' sdjf skdjf lsdjf   s dsd f'.strip().split(' ')

['sdjf', 'skdjf', 'lsdjf', '', '', 's', 'dsd', 'f']