In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Small toy corpus used to build the vocabulary.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my sample',
    'Do you think my dog is amazing?',
]

# num_words caps how many of the most frequent words are used when texts
# are later converted to sequences; the full vocabulary is still recorded.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)

# word_index maps each word (lower-cased, punctuation stripped) to an
# integer id; more frequent words receive smaller ids.
word_index = tokenizer.word_index
print(word_index)

{'my': 1, 'love': 2, 'i': 3, 'dog': 4, 'you': 5, 'cat': 6, 'sample': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [16]:
# Encode each training sentence as a list of word ids from the fitted vocabulary.
sequences = tokenizer.texts_to_sequences(texts=sentences)
print(sequences)

[[3, 2, 1, 4], [3, 2, 1, 6], [5, 2, 1, 7], [8, 5, 9, 1, 4, 10, 11]]


In [17]:
# Sentences containing words the tokenizer never saw during fitting
# ('really', 'loves', 'manatee').
test_data = [
    'i really love my dog',
    'my dog loves my manatee',
]

# Without an OOV token, unknown words are silently dropped from the
# encoded output, so the sequences come out shorter than the sentences.
test_seq = tokenizer.texts_to_sequences(texts=test_data)
print(test_seq)

[[3, 2, 1, 4], [1, 4, 1]]


In [18]:
# Rebuild the tokenizer with a placeholder token for out-of-vocabulary
# words; the placeholder is added to the vocabulary alongside the real words.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(texts=sentences)

word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'i': 4, 'dog': 5, 'you': 6, 'cat': 7, 'sample': 8, 'do': 9, 'think': 10, 'is': 11, 'amazing': 12}


In [19]:
# Re-encode the training sentences with the OOV-aware tokenizer.
sequences = tokenizer.texts_to_sequences(texts=sentences)
print(sequences)

# Same unseen-word examples as before; unknown words now map to the
# '<OOV>' id instead of being dropped, so sentence length is preserved.
test_data = [
    'i really love my dog',
    'my dog loves my manatee',
]
test_seq = tokenizer.texts_to_sequences(texts=test_data)
print(test_seq)

[[4, 3, 2, 5], [4, 3, 2, 7], [6, 3, 2, 8], [9, 6, 10, 2, 5, 11, 12]]
[[4, 1, 3, 2, 5], [2, 5, 1, 2, 1]]


In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus: three four-word sentences and one seven-word sentence, so
# padding is required to obtain a rectangular array.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my sample',
    'Do you think my dog is amazing?',
]

# Fit a fresh tokenizer (no OOV token) on the corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Encode each sentence as a list of word ids.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

# Default padding ('pre'): shorter sequences get zeros added at the FRONT
# until they match the longest sequence. The recorded output of this cell
# shows exactly that; padding='post' is demonstrated in the following cell.
padded = pad_sequences(sequences)
print(padded)

{'my': 1, 'love': 2, 'i': 3, 'dog': 4, 'you': 5, 'cat': 6, 'sample': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[3, 2, 1, 4], [3, 2, 1, 6], [5, 2, 1, 7], [8, 5, 9, 1, 4, 10, 11]]
[[ 0  0  0  3  2  1  4]
 [ 0  0  0  3  2  1  6]
 [ 0  0  0  5  2  1  7]
 [ 8  5  9  1  4 10 11]]


In [21]:
# padding='post' appends the zeros after each sequence instead of before it.
padded = pad_sequences(sequences, padding='post')
print(padded)

[[ 3  2  1  4  0  0  0]
 [ 3  2  1  6  0  0  0]
 [ 5  2  1  7  0  0  0]
 [ 8  5  9  1  4 10 11]]


In [22]:
# truncating='post' would cut from the end of over-long sequences; with no
# maxlen given, nothing is truncated and the result matches plain post-padding.
padded = pad_sequences(
    sequences,
    padding='post',
    truncating='post',
)
print(padded)

[[ 3  2  1  4  0  0  0]
 [ 3  2  1  6  0  0  0]
 [ 5  2  1  7  0  0  0]
 [ 8  5  9  1  4 10 11]]


In [23]:
# Force every row to exactly 5 ids: short sequences are zero-padded at the
# end, and the 7-word sentence loses its last two ids (truncating='post').
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)
print(padded)

[[3 2 1 4 0]
 [3 2 1 6 0]
 [5 2 1 7 0]
 [8 5 9 1 4]]


In [26]:
import json

# Load the sarcasm headlines dataset. JSON text is UTF-8, so the encoding is
# given explicitly instead of relying on the platform's default locale.
with open('/content/Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as f:
  datastore = json.load(f)

# Split the records into parallel lists: index i of each list refers to
# the same record of the dataset.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

In [27]:
# Bare expression: the notebook renders the list of headlines as the cell output.
sentences

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages",
 'airline passengers tackle man who rushes cockpit in bomb threat',
 'facebook reportedly working on healthcare features and apps',
 "north korea praises trump and urges us voters to reject 'dull hillary'"]

In [29]:
# Build the vocabulary from the loaded headlines, reserving an id for
# out-of-vocabulary words (no num_words cap this time).
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every headline, then pad with pad_sequences' defaults so all
# rows share the length of the longest headline.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences)

# Show the vocabulary, the raw encoded headlines, and the padded array.
print(word_index)
print(sequences)
print(padded)

{'<OOV>': 1, 'to': 2, 'for': 3, 'the': 4, 'and': 5, 'up': 6, 'will': 7, 'in': 8, 'former': 9, 'versace': 10, 'store': 11, 'clerk': 12, 'sues': 13, 'over': 14, 'secret': 15, "'black": 16, "code'": 17, 'minority': 18, 'shoppers': 19, "'roseanne'": 20, 'revival': 21, 'catches': 22, 'our': 23, 'thorny': 24, 'political': 25, 'mood': 26, 'better': 27, 'worse': 28, 'mom': 29, 'starting': 30, 'fear': 31, "son's": 32, 'web': 33, 'series': 34, 'closest': 35, 'thing': 36, 'she': 37, 'have': 38, 'grandchild': 39, 'boehner': 40, 'just': 41, 'wants': 42, 'wife': 43, 'listen': 44, 'not': 45, 'come': 46, 'with': 47, 'alternative': 48, 'debt': 49, 'reduction': 50, 'ideas': 51, 'j': 52, 'k': 53, 'rowling': 54, 'wishes': 55, 'snape': 56, 'happy': 57, 'birthday': 58, 'most': 59, 'magical': 60, 'way': 61, 'advancing': 62, "world's": 63, 'women': 64, 'fascinating': 65, 'case': 66, 'eating': 67, 'lab': 68, 'grown': 69, 'meat': 70, 'this': 71, 'ceo': 72, 'send': 73, 'your': 74, 'kids': 75, 'school': 76, 'if':

AttributeError: ignored