<a href="https://colab.research.google.com/github/zuzannazak/MgrSarcasm/blob/master/mgr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import json
import tensorflow as tf
import requests
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv

#Sources:
#  https://towardsdatascience.com/tensorflow-sarcasm-detection-in-20-mins-b549311b9e91

In [5]:
# Load the tab-separated dataset: column 0 of each row is stored as the label,
# column 1 as the sentence text. Both are kept as the raw strings read from disk.
DATA_PATH = '/content/drive/MyDrive/STUDIA/train-balanced.csv'

sentences = []
labels = []
skipped_rows = 0

# newline='' is what the csv module expects from the file object it is given,
# and the explicit encoding keeps the read independent of the platform default.
with open(DATA_PATH, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
        # Blank lines come back as [] and truncated lines lack the second
        # column; indexing them would raise IndexError and abort the load.
        if len(row) < 2:
            skipped_rows += 1
            continue
        labels.append(row[0])
        sentences.append(row[1])

# Surface dropped rows instead of hiding them, so a malformed file is noticed.
if skipped_rows:
    print(f'Skipped {skipped_rows} row(s) with fewer than 2 columns')

In [6]:
# Preview the first ten sentence/label pairs side by side
preview_columns = {'sentence': sentences[:10], 'label': labels[:10]}
print(pd.DataFrame(preview_columns))

                                            sentence label
0                                   Dang dog, thanks     0
1  to summon the powers of the flying spaghetti m...     0
2       i did that 3rd last 1 by accident last night     0
3  He's insane, used him in DC, better than Blake...     0
4  Forgot about him, he's a pretty pointless card...     0
5                                                hey     0
6                                              yeah?     0
7                                               okay     0
8                                      Condensation?     0
9                                What type of juice?     0


In [7]:
# Positional train/test split: the first 75% of the rows (rounded to a whole
# row count) are used for training, the remainder for testing.
training_size = round(0.75 * len(sentences))
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [8]:
# Tokenizer configuration
vocab_size = 50_000  # vocabulary size limit
oov_tok = '<oov>'    # token substituted for out-of-vocabulary words

In [9]:
# Build the vocabulary from the training sentences only, then keep a handle
# on the resulting word index.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

In [10]:
# Padding configuration: sequence length, and which end to truncate / pad
max_length = 200
trunc_type, padding_type = 'post', 'post'

In [11]:
# Encode both splits with the fitted tokenizer, then pad/truncate every
# sequence to max_length using the same settings for train and test.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [12]:
# Model: embedding -> average over the sequence axis -> small ReLU layer ->
# single sigmoid unit, compiled for binary classification.
embedding_dim = 16

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 16)           800000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 800,433
Trainable params: 800,433
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Hand numpy arrays to Tensorflow 2.x: the padded sequences are wrapped as-is,
# the labels are converted to int32.
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_labels = np.array(training_labels, dtype=np.int32)
testing_labels = np.array(testing_labels, dtype=np.int32)

print(training_padded.dtype)

int32


In [15]:
# Train the model, evaluating on the test split after every epoch
num_epochs = 40
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)




Epoch 1/40
26384/26384 - 230s - loss: 0.4798 - accuracy: 0.7664 - val_loss: 0.6135 - val_accuracy: 0.6956
Epoch 2/40
26384/26384 - 232s - loss: 0.4757 - accuracy: 0.7686 - val_loss: 0.6143 - val_accuracy: 0.6984
Epoch 3/40
26384/26384 - 235s - loss: 0.4717 - accuracy: 0.7715 - val_loss: 0.6148 - val_accuracy: 0.6933
Epoch 4/40
26384/26384 - 232s - loss: 0.4678 - accuracy: 0.7736 - val_loss: 0.6314 - val_accuracy: 0.6858
Epoch 5/40
26384/26384 - 228s - loss: 0.4637 - accuracy: 0.7762 - val_loss: 0.6209 - val_accuracy: 0.6944
Epoch 6/40
26384/26384 - 224s - loss: 0.4597 - accuracy: 0.7786 - val_loss: 0.6293 - val_accuracy: 0.6953
Epoch 7/40
26384/26384 - 225s - loss: 0.4557 - accuracy: 0.7810 - val_loss: 0.6296 - val_accuracy: 0.6932
Epoch 8/40
26384/26384 - 226s - loss: 0.4520 - accuracy: 0.7834 - val_loss: 0.6559 - val_accuracy: 0.6852
Epoch 9/40
26384/26384 - 230s - loss: 0.4483 - accuracy: 0.7856 - val_loss: 0.6390 - val_accuracy: 0.6929
Epoch 10/40
26384/26384 - 237s - loss: 0.4447 

In [None]:
# Score a couple of hand-written sentences, preprocessing them with the same
# tokenizer and padding settings used for the training data.
sentence = ["Yes, definitely", "I like cats."]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
predictions = model.predict(padded)
print(predictions)