# Tokenization

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences # permet de mettre toutes les sequences a la meme taille

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
sentences = [
             'I love my dog',
             'I love my cat',
             'You love my dog',
             'Do you think my dog is amazing?'
]

In [5]:
tokenizer = Tokenizer(num_words=100, oov_token="<OOV") #argument num_word indique qu'on ne veut selectionner que les 100 mots les plus frequents
#l'arugment oov dit que si le tokenizer rencontre un mot qu'il ne connait pas, il va l'indiquer comme oov 

In [6]:
tokenizer.fit_on_texts(sentences)

In [7]:
word_index = tokenizer.word_index

In [8]:
print(word_index)

{'<OOV': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


# Turning sentences into data

In [9]:
sequences = tokenizer.texts_to_sequences(sentences)

In [10]:
padded = pad_sequences(sequences)

la ligne de code si dessous : parametre padding post va mettre les 0 a la fin plutot qu'au debut, maxlen : les sequences sont réduit a une taille max de 5, et donc truncating = post, les valeurs qui sont a la fin de la sequences sont supprimés si ils depassent 

In [11]:
#padded = pad_sequences(sequences, padding='post', truncating='post', maxlen=5)

In [12]:
print(sequences)
print(padded)

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


In [13]:
test_data = [
             'i really love my dog',
             'my dog love manatee'
]

In [14]:
test_seq=tokenizer.texts_to_sequences(test_data)

In [15]:
print(test_seq)
print(word_index)

[[5, 1, 3, 2, 4], [2, 4, 3, 1]]
{'<OOV': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


# recognize sentiment in text

In [16]:
import json #bibliotheque permettant de lire les fichier au format json

In [17]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2021-03-29 09:56:16--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 74.125.142.128, 74.125.195.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2021-03-29 09:56:16 (106 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



lecture des fichiers

In [18]:
with open("/tmp/sarcasm.json", 'r') as f:
  datastore = json.load(f)

In [19]:
datastore

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

on stocke les fichier dans des listes

In [20]:
sentences  =[]
labels = []
urls = []

for item in datastore:
  sentences.append(item['headline'])
  labels.append(item['is_sarcastic'])


In [21]:
print(labels)
print(sentences)

[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 

In [24]:
training_size = 20000

In [25]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [26]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)

In [27]:
word_index = tokenizer.word_index

In [None]:
word_index

In [29]:
training_sequences = tokenizer.texts_to_sequences(training_sentences)

In [32]:
print(training_sequences)

[[328, 1, 799, 3405, 2404, 47, 389, 2214, 1, 6, 2614, 8863], [4, 6840, 3096, 3097, 23, 2, 161, 1, 390, 2842, 6, 251, 9, 889], [153, 890, 2, 891, 1445, 2215, 595, 5650, 221, 133, 36, 45, 2, 8864], [1252, 38, 213, 382, 2, 1572, 29, 288, 23, 10, 2405, 1446, 5651, 958], [715, 672, 5652, 1043, 8865, 662, 553, 5, 4, 92, 1253, 90], [8866, 4, 366, 70], [4, 6841, 369, 6, 498, 3406, 1875, 1378], [20, 563, 36, 1091, 31, 163, 2, 103, 87, 18, 150, 6, 33, 343], [278, 3407, 6842, 447, 8867, 2092, 148], [2093, 300, 335, 370, 63, 1, 6, 4, 4264], [3098, 2216, 3756, 14, 37, 5653, 8868, 5, 2094, 1092], [309, 767, 428, 8, 1663, 1664, 9, 3099], [226, 477, 2843, 13, 9, 922, 239, 371, 2, 4265, 1, 6843], [235, 5654, 8869, 3757, 39, 240, 1, 6, 7, 174], [1, 1379, 800, 663, 5, 336, 3, 959], [524, 2094, 8870, 126, 8871, 6, 8872, 3758, 1665], [2095, 1328, 341, 46, 3408, 323, 288, 960, 2, 22, 1, 19, 1044, 359, 109, 1447], [1666, 6844, 3100, 8873, 19, 5655, 1200], [8874, 822, 2, 1768, 251, 1201, 38, 211, 2406], [4820

In [31]:
max_length = 100
trunc_type='post'
padding_type='post'


In [None]:
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

In [None]:
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalAveragePooling1D

In [None]:
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [None]:
model = Sequential()
model.add(Embedding(10000,16,input_length=max_length))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
model.fit(training_padded,training_labels, epochs=20, validation_data=(testing_padded, testing_labels))

In [None]:
sentence = [
            "its good to be as bad",
            "the weather is bright and sunny"
]

sequences = tokenizer.texts_to_sequences(sentence)
print(sequences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(padded)

In [None]:
print(model.predict(padded))