  # Sarcasm Recognition
  
  #### Class: CS660
  #### Students:
  * Shuhui Wu
  * Shuqi Gao

In [2]:
import numpy as np
import pandas as pd
import random

from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from keras.models import Model, load_model
from keras.layers import Dense
from keras.layers import Flatten, Dropout, Reshape
from keras.layers import Input, Embedding, Concatenate
from keras.layers import Conv2D, MaxPool2D
from keras import regularizers
from keras.initializers import Constant

import matplotlib.pyplot as plt

### Dataset Preprocessing
* Tokenize the comments and embed them using pretrained GloVe word vectors

In [7]:
def glove_dict(path):
    """Load pretrained word vectors from a GloVe-style text file.

    Every non-empty line is split on whitespace: the first field is the
    token, the remaining fields are the vector components.

    Args:
        path: Path to the embedding text file (e.g. ``glove.6B.100d.txt``).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.
    """
    embedding_dict = {}
    # GloVe files are distributed as UTF-8; decode explicitly instead of
    # relying on the platform default encoding (which is not UTF-8 everywhere).
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            glove_vecs = line.split()
            if not glove_vecs:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse.
                continue
            embedding_dict[glove_vecs[0]] = np.asarray(glove_vecs[1:], dtype='float32')
    return embedding_dict


def preprocessing(texts, labels, tknzr, maxlen):
    """Encode a batch of texts as fixed-length integer sequences.

    Args:
        texts: Iterable of strings to encode.
        labels: Labels for ``texts``; handed back unchanged.
        tknzr: Tokenizer exposing ``texts_to_sequences``.
        maxlen: Length every sequence is padded to (padding is appended
            at the end of each sequence).

    Returns:
        Tuple ``(Xs, labels)`` where ``Xs`` is the padded sequence array.
    """
    sequences = tknzr.texts_to_sequences(texts)
    padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
    return padded, labels

# preprocessing('train-balanced-sarcasm.csv')

### Data Generator
* The training and testing data are too large to fit in RAM at once, so batches are generated on the fly

In [6]:
def data_generator(cols, batch_size, embedding_dict, tknzr, maxlen):
    """Yield shuffled ``(Xs, ys)`` mini-batches forever.

    The rows are reshuffled at the start of every pass. Each pass walks the
    data in consecutive, non-overlapping slices of ``batch_size`` rows; the
    last slice of a pass may be shorter.

    Args:
        cols: DataFrame with a ``'comment'`` and a ``'label'`` column.
        batch_size: Number of rows per batch; must be at least 1.
        embedding_dict: Unused here; kept so existing callers keep working.
        tknzr: Tokenizer forwarded to ``preprocessing``.
        maxlen: Padded sequence length forwarded to ``preprocessing``.

    Yields:
        Tuple ``(Xs, ys)`` as returned by ``preprocessing``.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is smaller than 1
            or ``cols`` has no rows (either would otherwise loop forever
            without producing a usable batch).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if len(cols) == 0:
        raise ValueError('cols must contain at least one row')

    while True:
        shuffled = cols.sample(frac=1).reset_index(drop=True)
        texts = shuffled['comment'].astype(str)
        labels = shuffled['label'].astype(int)

        for start in range(0, texts.size, batch_size):
            # Half-open positional slice: exactly batch_size rows, so
            # consecutive batches never share a sample. Slicing past the end
            # is clamped, which handles the final short batch.
            stop = start + batch_size
            batch_texts = texts.iloc[start:stop]
            batch_labels = labels.iloc[start:stop]

            Xs, ys = preprocessing(batch_texts, batch_labels, tknzr, maxlen)
            yield Xs, ys
# print(data_generator('train-balanced-sarcasm.csv', 50))

### Define Model

In [4]:
def build_model(weights, maxlen, vocab_size):
    """Build and compile the multi-branch CNN text classifier.

    An embedding layer initialised from ``weights`` feeds three parallel
    convolutions with window heights 5, 6 and 7 that each span the full
    embedding width. Each branch is max-pooled over all window positions,
    the branches are concatenated, and dropout plus a single sigmoid unit
    produce the binary prediction.

    Args:
        weights: Initial embedding matrix. When it is two-dimensional its
            second dimension is used as the embedding width; otherwise the
            width falls back to 100.
        maxlen: Length of every input sequence.
        vocab_size: Number of rows in the embedding table.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    # Take the embedding width from the matrix itself instead of hard-coding
    # it, so vectors of another dimensionality work without code changes.
    weight_shape = np.shape(weights)
    embedding_dim = weight_shape[1] if len(weight_shape) == 2 else 100
    kernel_heights = (5, 6, 7)

    input_layer = Input(shape=(maxlen,), dtype='int32')
    e_layer = Embedding(vocab_size, embedding_dim,
                        embeddings_initializer=Constant(weights),
                        input_length=maxlen, trainable=True)(input_layer)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    reshape = Reshape((maxlen, embedding_dim, 1))(e_layer)

    # All convolutions are created before any pooling layer, keeping the
    # layer creation order of the original implementation.
    convs = [
        Conv2D(100, kernel_size=(height, embedding_dim), activation='relu',
               kernel_initializer='normal',
               kernel_regularizer=regularizers.l2(3))(reshape)
        for height in kernel_heights
    ]
    # A window of `height` rows has maxlen - height + 1 valid positions;
    # pooling over all of them leaves one value per filter.
    pools = [
        MaxPool2D(pool_size=(maxlen - height + 1, 1), padding='valid')(conv)
        for height, conv in zip(kernel_heights, convs)
    ]

    conca_tensor = Concatenate(axis=1)(pools)
    flatten = Flatten()(conca_tensor)
    dropout_2 = Dropout(0.5)(flatten)
    output = Dense(1, activation='sigmoid')(dropout_2)

    model = Model(input_layer, output)

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()

    return model

### Training

In [None]:
def train():
    """Train the CNN on ``train-balanced-sarcasm.csv`` and evaluate it.

    Reads the ``comment`` and ``label`` columns, fits a tokenizer on all
    comments, builds an embedding matrix from ``glove.6B.100d.txt`` (words
    missing from GloVe get a random normal vector), then trains for 10
    epochs on the first 800000 rows, validates on the next 100000 and
    evaluates on the remaining rows. The evaluation result is printed.

    Returns:
        Tuple ``(model, history)``: the trained model and the object
        returned by ``fit_generator``.
    """
    batch_size = 50

    dataset = pd.read_csv('train-balanced-sarcasm.csv')
    cols = dataset[['comment', 'label']]
    comments = cols['comment'].astype(str)

    # NOTE(review): this is the *character* length of the longest comment,
    # while sequences are padded in tokens -- likely far longer than needed.
    # Changing it alters the model's input shape, so confirm before touching.
    maxlen = len(max(comments, key=len))

    tknzr = Tokenizer()
    tknzr.fit_on_texts(comments)
    # Row 0 of the embedding table stays all-zero (no word index maps to it here).
    vocab_size = 1 + len(tknzr.word_index)

    embedding_dict = glove_dict('glove.6B.100d.txt')
    weights = np.zeros((vocab_size, 100))
    for word, row in tknzr.word_index.items():
        pretrained = embedding_dict.get(word)
        # Fall back to a random vector for words GloVe does not know.
        weights[row] = np.random.randn(100) if pretrained is None else pretrained

    model = build_model(weights, maxlen, vocab_size)

    train_batches = data_generator(cols[:800000], batch_size, embedding_dict, tknzr, maxlen)
    val_batches = data_generator(cols[800000:900000], batch_size, embedding_dict, tknzr, maxlen)
    history = model.fit_generator(
        train_batches,
        steps_per_epoch=800000 // batch_size,
        epochs=10,
        validation_data=val_batches,
        validation_steps=100000 // batch_size,
    )

    test_batches = data_generator(cols[900000:], batch_size, embedding_dict, tknzr, maxlen)
    acc = model.evaluate_generator(test_batches, steps=len(cols[900000:]) // batch_size)
    print(acc)

    return model, history
        
        
# Run the whole training pipeline and keep the trained model and the
# fit_generator result at module level for the plotting/saving cell below.
model, history = train()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10000, 100)   18058400    input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 10000, 100, 1 0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 9996, 1, 100) 50100       reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_2 (

In [4]:
# Draw one figure per metric (training curve first, validation second),
# then persist the trained model to disk.
curves = (
    ('model accuracy', 'accuracy', 'acc', 'val_acc'),
    ('model loss', 'loss', 'loss', 'val_loss'),
)
for title, axis_label, train_key, val_key in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(axis_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

model.save('CNN.h5')

NameError: name 'history' is not defined

In [None]:
# Resume training from the saved model: reload it, rebuild the tokenizer and
# data splits the same way train() does, then fit and evaluate again.
model = load_model('CNN.h5')
dataset = pd.read_csv('train-balanced-sarcasm.csv')
cols = dataset[['comment', 'label']]

batch_size = 50
# Character length of the longest comment, used as the padded sequence length.
# NOTE(review): presumably this must equal the input length the saved model
# was built with -- confirm before changing.
maxlen = len(max(cols['comment'].astype(str), key=len))
tknzr = Tokenizer()
tknzr.fit_on_texts(cols['comment'].astype(str))
vocab_size = len(tknzr.word_index) + 1

embedding_dict = glove_dict('glove.6B.100d.txt')
# NOTE(review): `weights` is rebuilt here but nothing in this cell passes it
# to the loaded model; it may be left over from train() -- verify whether a
# later cell needs it.
weights = np.zeros((vocab_size, 100))
for word, i in tknzr.word_index.items():
    vec = embedding_dict.get(word)
    if vec is not None:
        weights[i] = vec
    else:
        # Word not present in the GloVe file: use a random normal vector.
        weights[i] = np.random.randn(100)

# Same split as train(): first 800000 rows for training, next 100000 for
# validation, the remainder for the final evaluation.
history = model.fit_generator(data_generator(cols[:800000], batch_size, embedding_dict, tknzr, maxlen), 
                        steps_per_epoch= 800000 // batch_size,
                        epochs=10,
                        validation_data=data_generator(cols[800000:900000], batch_size, embedding_dict, tknzr, maxlen),
                        validation_steps=100000 // batch_size
                       )
acc = model.evaluate_generator(data_generator(cols[900000:], batch_size, embedding_dict, tknzr, maxlen),
                                steps=len(cols[900000:]) // batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
  643/16000 [>.............................] - ETA: 53:47 - loss: 0.6690 - acc: 0.6744