In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

# Load the Data

In [3]:
# Load the training CSV into `data` and the task-A test CSV into `valid`.
data, valid = (
    pd.read_csv(csv_path)
    for csv_path in ('train.En.csv', 'task_A_En_test.csv')
)

In [4]:
# Keep only the tweet text and its label. Incomplete rows are dropped *before*
# casting, because converting a label column that still holds NaN to int raises.
# Building the frame in one chain (instead of renaming a column slice in place)
# also avoids pandas' SettingWithCopyWarning.
data = (
    data[['tweet', 'sarcastic']]
    .dropna()
    .rename(columns={'tweet': 'text'})
    .astype({'text': 'string', 'sarcastic': 'int'})
)
data

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


# CREATE MODEL

In [5]:
import gensim
from keras.preprocessing import text
from keras.utils import pad_sequences

KeyboardInterrupt: 

In [None]:
EMBEDDING_DIM = 200  # default width of each Word2Vec vector


class Tokenizer:
  """Whitespace tokenizer pairing a Keras word index with Word2Vec vectors.

  Both the Keras ``text.Tokenizer`` and the gensim ``Word2Vec`` model are
  fitted on the same token lists produced by ``get_words``.

  Attributes (set in ``__init__``):
    tokenizer: fitted ``keras.preprocessing.text.Tokenizer``.
    w2v_model: ``gensim.models.Word2Vec`` trained on the same tokens.
  """

  def __init__(self, sentences, embedding_dim=EMBEDDING_DIM):
    """Fit the word index and train Word2Vec on ``sentences``.

    Args:
      sentences: iterable of strings.
      embedding_dim: Word2Vec vector size; defaults to ``EMBEDDING_DIM``.
    """
    words = self.get_words(sentences)
    self.tokenizer = text.Tokenizer()
    self.tokenizer.fit_on_texts(words)
    # min_count=1 keeps every token, so each indexed word also gets a vector.
    self.w2v_model = gensim.models.Word2Vec(sentences=words, vector_size=embedding_dim, window=5, min_count=1)

  def get_words(self, sentences):
    """Split each sentence on whitespace into lower-cased tokens.

    The Keras tokenizer lower-cases pre-split tokens by default, so the
    tokens are lower-cased here as well. Otherwise Word2Vec would learn
    case-sensitive keys ("The") that never match the tokenizer's
    lower-cased word index ("the").

    Returns:
      A list with one list of tokens per input sentence.
    """
    return [s.lower().split() for s in sentences]

  def tokenize(self, sentences, maxlen=20):
    """Encode ``sentences`` as integer sequences padded to ``maxlen``.

    Args:
      sentences: iterable of strings.
      maxlen: sequence length passed to ``pad_sequences`` (default 20).
    """
    return pad_sequences(self.tokenizer.texts_to_sequences(self.get_words(sentences)), maxlen=maxlen)

In [None]:
# Width of the word vectors: used as the column count in get_weight_matrix and
# as the Embedding layer's output_dim. This repeats the value already assigned
# in the cell that defines the Tokenizer class.
EMBEDDING_DIM = 200

In [None]:
# Number of words in the Word2Vec vocabulary.
# NOTE(review): `w2v_model` is not assigned in any cell visible here (it only
# appears as the `Tokenizer.w2v_model` attribute); presumably it came from a
# removed cell — confirm before relying on Restart & Run All.
len(w2v_model.wv.key_to_index)

15801

In [None]:
# Fit the Keras word index on the pre-split corpus, encode that same corpus as
# integer sequences, and pad them with maxlen=20 to produce the model input `x`.
# NOTE(review): `words` is not assigned in any cell visible here — presumably a
# list of token lists like the one Tokenizer.get_words returns; confirm its origin.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(words)
tokenized_train = tokenizer.texts_to_sequences(words)
x = pad_sequences(tokenized_train, maxlen = 20)

In [None]:
# Row count for the embedding matrix: one row per indexed word plus one extra
# row, mirroring `len(vocab) + 1` in get_weight_matrix.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

14261

In [None]:
def get_weight_matrix(model, vocab, embedding_dim=None):
  """Build the initial weight matrix for an Embedding layer from word vectors.

  Args:
    model: word-vector lookup supporting ``word in model`` and ``model[word]``
      (for example the ``.wv`` attribute of a gensim Word2Vec model).
    vocab: mapping of word -> integer row index (for example
      ``tokenizer.word_index``); indices must lie in ``1..len(vocab)``.
    embedding_dim: number of columns. When omitted, ``model.vector_size`` is
      used if the model exposes it, otherwise the module-level EMBEDDING_DIM.

  Returns:
    A float array of shape ``(len(vocab) + 1, embedding_dim)``. Row 0 (no word
    maps to it) and the rows of words missing from ``model`` stay all-zero.
  """
  if embedding_dim is None:
    # Prefer the width the vectors actually have over a global constant that
    # may have drifted from the trained model.
    embedding_dim = getattr(model, 'vector_size', None)
    if embedding_dim is None:
      embedding_dim = EMBEDDING_DIM
  # One row per word plus the extra row at index 0.
  weight_matrix = np.zeros((len(vocab) + 1, embedding_dim))
  # Copy each known vector into the row given by the tokenizer's integer mapping.
  for word, i in vocab.items():
    if word in model:
      weight_matrix[i] = model[word]
  return weight_matrix

In [None]:
# Look up a Word2Vec vector for every word in the tokenizer's index; the
# resulting matrix is passed as the Embedding layer's initial weights.
embedding_vectors = get_weight_matrix(w2v_model.wv, tokenizer.word_index)

# TRAIN

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,GRU
from keras.optimizers import Adam

In [None]:
# Network: Word2Vec-initialised embeddings -> BiLSTM -> BiGRU -> sigmoid unit
model = Sequential([
    # Embedding rows start from the Word2Vec matrix and stay trainable (trainable=True)
    Embedding(vocab_size, output_dim=EMBEDDING_DIM, weights=[embedding_vectors], input_length=20, trainable=True),
    # Bidirectional LSTM that returns the full sequence for the next recurrent layer
    Bidirectional(LSTM(units=128, recurrent_dropout=0.3, dropout=0.3, return_sequences=True)),
    # Bidirectional GRU that returns only its final output
    Bidirectional(GRU(units=32, recurrent_dropout=0.1, dropout=0.1)),
    # Single sigmoid unit for the binary `sarcastic` label
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['acc'])

# Release the standalone weight matrix now that it has been handed to the layer
del embedding_vectors

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 200)           2852200   
                                                                 
 bidirectional (Bidirectiona  (None, 20, 256)          336896    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               55680     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 3,244,841
Trainable params: 3,244,841
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split (and
# therefore every metric below) repeatable across runs; stratifying keeps the
# sarcastic / non-sarcastic ratio the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    data['sarcastic'],
    test_size=0.3,
    random_state=42,
    stratify=data['sarcastic'],
)

In [None]:
# Train for 10 epochs with batches of 128, using the held-out split as validation data
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=128,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# evaluate() returns [loss, acc]; report the accuracy entry as a percentage
train_accuracy = 100 * model.evaluate(x_train, y_train)[1]
print("Accuracy of the model on Training Data is - " , train_accuracy)

test_accuracy = 100 * model.evaluate(x_test, y_test)[1]
print("Accuracy of the model on Testing Data is - " , test_accuracy)

Accuracy of the model on Training Data is -  99.79389905929565
Accuracy of the model on Testing Data is -  66.57060384750366


# Evaluate

In [None]:
from sklearn.metrics import f1_score

## Random-guess baseline

In [None]:
# Coin-flip baseline: F1 of uniformly random 0/1 predictions against the true
# labels. The generator is seeded so the baseline is the same on every run, and
# the arguments follow f1_score's (y_true, y_pred) order.
rng = np.random.default_rng(42)
random_predictions = rng.integers(0, 2, size=valid['sarcastic'].shape)
f1_score(valid['sarcastic'], random_predictions)

0.20417124039517015

## Testing our model

In [None]:
# Encode the validation tweets with the tokenizer fitted on the training
# corpus, then pad them with the same maxlen as the training input. The
# sequences get their own name so the training sequences in `tokenized_train`
# are not silently overwritten with validation data.
tokenized_valid = tokenizer.texts_to_sequences([s.split() for s in valid['text']])
x_valid = pad_sequences(tokenized_valid, maxlen = 20)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with an explicit 0.5 threshold,
# flatten the (n, 1) prediction array to 1-D, and score it with the arguments
# in f1_score's (y_true, y_pred) order.
valid_probabilities = model.predict(x_valid)
valid_predictions = (valid_probabilities > 0.5).astype(int).ravel()
f1_score(valid['sarcastic'], valid_predictions)



0.19841269841269843