<a href="https://colab.research.google.com/github/zuzannazak/MgrSarcasm/blob/master/mgr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import tensorflow as tf
import requests
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv

#Sources:
#  https://towardsdatascience.com/tensorflow-sarcasm-detection-in-20-mins-b549311b9e91

In [2]:
# Read the tab-separated corpus: field 0 of each row goes to `labels`,
# field 1 goes to `sentences`. Labels stay as strings here.
sentences = []
labels = []

# newline='' is what the csv module expects from file objects it reads, and an
# explicit encoding keeps the result independent of the platform's default locale.
with open('/content/drive/MyDrive/STUDIA/train-balanced.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
        # Blank or truncated lines have fewer than two fields; skip them
        # instead of failing with IndexError on row[1].
        if len(row) < 2:
            continue
        labels.append(row[0])
        sentences.append(row[1])

In [3]:
# Show the first ten sentence/label pairs side by side as a quick sanity check
preview = pd.DataFrame({'sentence': sentences[:10], 'label': labels[:10]})
print(preview)

                                            sentence label
0                                   Dang dog, thanks     0
1  to summon the powers of the flying spaghetti m...     0
2       i did that 3rd last 1 by accident last night     0
3  He's insane, used him in DC, better than Blake...     0
4  Forgot about him, he's a pretty pointless card...     0
5                                                hey     0
6                                              yeah?     0
7                                               okay     0
8                                      Condensation?     0
9                                What type of juice?     0


In [5]:
# Positional 75/25 split: the first three quarters of the rows are used for
# training, the remainder for testing (the data is not shuffled here).
training_size = round(0.75 * len(sentences))
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [6]:
# Tokenizer settings: vocabulary size cap and the placeholder token used for
# out-of-vocabulary words.
vocab_size, oov_tok = 50000, "<oov>"

In [7]:
# Build the vocabulary from the training split only, so the test split does
# not influence the word index.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
# Keep a handle on the word -> integer index mapping produced by the fit
word_index = tokenizer.word_index

In [8]:
# Padding settings: every sequence is brought to max_length tokens, with both
# padding and truncation applied at the end ('post') of the sequence.
max_length = 200
padding_type = trunc_type = 'post'

In [9]:
# Turn both splits into integer sequences and pad/truncate them to max_length
def _texts_to_padded(texts):
    """Encode `texts` with the fitted tokenizer; return (sequences, padded sequences)."""
    encoded = tokenizer.texts_to_sequences(texts)
    padded_out = pad_sequences(encoded, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    return encoded, padded_out


training_sequences, training_padded = _texts_to_padded(training_sentences)
testing_sequences, testing_padded = _texts_to_padded(testing_sentences)

In [10]:
# Model: embedding -> average over the sequence axis -> small dense head with
# a single sigmoid output for the binary label.
embedding_dim = 16
model = tf.keras.Sequential([
    # An explicit input spec replaces Embedding's deprecated `input_length`
    # argument, so the model is still built (and summary() can report output
    # shapes) before any data is passed through it.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 16)           800000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 800,433
Trainable params: 800,433
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Hand numpy arrays (not Python lists) to TensorFlow 2.x; the labels are
# converted to int32 at the same time.
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_labels, testing_labels = (
    np.array(training_labels, dtype='int32'),
    np.array(testing_labels, dtype='int32'),
)

print(training_padded.dtype)

int32


In [12]:
# Train the model; the test split is passed as validation data so its
# loss/accuracy are reported after every epoch.
num_epochs = 10
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)




Epoch 1/10
26384/26384 - 223s - loss: 0.6059 - accuracy: 0.6682 - val_loss: 0.5746 - val_accuracy: 0.6957
Epoch 2/10
26384/26384 - 228s - loss: 0.5734 - accuracy: 0.7004 - val_loss: 0.5769 - val_accuracy: 0.6947
Epoch 3/10
26384/26384 - 227s - loss: 0.5604 - accuracy: 0.7109 - val_loss: 0.5605 - val_accuracy: 0.7085
Epoch 4/10
26384/26384 - 218s - loss: 0.5518 - accuracy: 0.7172 - val_loss: 0.5664 - val_accuracy: 0.7054
Epoch 5/10
26384/26384 - 229s - loss: 0.5448 - accuracy: 0.7225 - val_loss: 0.5687 - val_accuracy: 0.7039
Epoch 6/10
26384/26384 - 240s - loss: 0.5392 - accuracy: 0.7262 - val_loss: 0.5630 - val_accuracy: 0.7100
Epoch 7/10
26384/26384 - 239s - loss: 0.5342 - accuracy: 0.7299 - val_loss: 0.5650 - val_accuracy: 0.7086
Epoch 8/10
26384/26384 - 234s - loss: 0.5298 - accuracy: 0.7329 - val_loss: 0.5736 - val_accuracy: 0.7029
Epoch 9/10
26384/26384 - 220s - loss: 0.5259 - accuracy: 0.7353 - val_loss: 0.5731 - val_accuracy: 0.7077
Epoch 10/10
26384/26384 - 225s - loss: 0.5220 

In [None]:
# Score a couple of hand-written sentences with the trained model, using the
# same tokenizer and padding settings as the training data.
sentence = [
    "Yes, definitely",
    "I like cats.",
]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
print(model.predict(padded))