# Code preparation

In [None]:
# Colab line magic: selects the TensorFlow 1.x runtime (the Keras code further
# down reports "Using TensorFlow backend.").
%tensorflow_version 1.x

TensorFlow 1.x selected.


## Clone the GitHub repository

In [None]:
# Fetch the project sources; later cells chdir into /content/SarcasmDetection/...,
# so this is presumably run from /content (the Colab default) — confirm.
!git clone https://github.com/Melusinee/SarcasmDetection.git

Cloning into 'SarcasmDetection'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 837 (delta 37), reused 7 (delta 3), pack-reused 772[K
Receiving objects: 100% (837/837), 11.09 MiB | 3.86 MiB/s, done.
Resolving deltas: 100% (492/492), done.


# Download the trained Model

In [None]:
# Google Drive file IDs of the pretrained artifacts. Consumed position-by-position
# together with `names` by the download loop in the next cell, so both lists
# must stay the same length and in the same order.
links = [
   '1O15FV0nL8biWRk1M2SpvV48rYXFS3SE_',
  '142x4-bdipVfcOcNcdGtuXRekFg5B1KCn',
  '1uhfRVLtz7K2IpHbYLaJkTBm25tYtHtbj',
  '12YUPLM4zKnlCiE9lKP14OcSVImgIxGus',
  '0B7C_0ZfEBcpRbEVnVDFmTXdJS0U'
]

# Local file names under which the Drive files above are saved (same order as `links`).
names = [
    'weights.05__.hdf5',
    'vocab_list.txt',
    'model.json.hdf5',
    'model.json',
    'GoogleNews-vectors-negative300.bin'
]

import os
# Downloads are written relative to the current directory, so move into the
# repository's weights folder first.
os.chdir("/content/SarcasmDetection/resource/text_model/weights")

# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# NOTE(review): pandas is imported here but not used anywhere in the visible cells.
import pandas as pd


# Authenticate and create the PyDrive client.
# `drive` is the handle the following cells use to fetch files by ID.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [None]:
# Download every pretrained artifact into the current directory (the weights
# folder selected in the previous cell). zip() keeps each Drive ID paired with
# its target file name and avoids both the hard-coded length and shadowing the
# builtin `id`.
# `downloaded` is left bound to the last file handle after the loop, exactly as before.
for file_id, name in zip(links, names):
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(name)

In [None]:
# Write the test input next to the repository's other test resources.
os.chdir("/content/SarcasmDetection/resource/test")

# NOTE(review): `downloaded` is whatever the download loop in the previous cell
# left behind, i.e. the handle for the LAST entry of `links`, which is paired with
# 'GoogleNews-vectors-negative300.bin'. As written this saves that file's content
# under the name "Sarcasm_input.txt". A dedicated drive.CreateFile({'id': ...})
# for the real input file is probably missing here — confirm and add its ID.
downloaded.GetContentFile("Sarcasm_input.txt") 

# Run the script

In [None]:
# Work from the repository's src folder from here on; the cells below append
# '../' to sys.path, which resolves relative to this directory.
os.chdir("/content/SarcasmDetection/src")

## Helper functions for data processing

### glove2Word2vecLoader

In [None]:
import numpy as np
import shutil
import hashlib
from sys import platform

import gensim


def prepend_line(infile, outfile, line):
    """Copy ``infile`` to ``outfile`` with ``line`` inserted as the first line.

    ``line`` is converted with ``str()`` and terminated with a newline before
    the original content is streamed across with ``shutil.copyfileobj``.
    ``outfile`` is overwritten if it already exists.
    """
    with open(infile, 'r') as source, open(outfile, 'w') as target:
        header = str(line) + "\n"
        target.write(header)
        # Stream the remainder instead of loading the whole file into memory.
        shutil.copyfileobj(source, target)


def prepend_slow(infile, outfile, line):
    """Line-by-line variant of ``prepend_line``.

    Writes ``line`` plus a newline to ``outfile`` and then copies ``infile``
    one line at a time. Unlike ``prepend_line``, ``line`` is not passed
    through ``str()``, so it must already be a string.
    """
    with open(infile, 'r') as source, open(outfile, 'w') as target:
        target.write(line + "\n")
        for row in source:
            target.write(row)


def checksum(filename):
    """Return the hexadecimal MD5 digest of the file at ``filename``.

    The file is read in 64 KiB chunks so large files never have to be held in
    memory at once.
    """
    chunk_size = 65536
    digest = hashlib.md5()
    with open(filename, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Pre-computed glove files values.
# Line counts for GloVe files that are not covered by the filename-prefix rules
# in check_num_lines_in_glove (which hard-codes the 6B and twitter.27B families).
pretrain_num_lines = {"glove.840B.300d.txt": 2196017, "glove.42B.300d.txt": 1917494}

# MD5 hex digests keyed by file name; check_num_lines_in_glove compares them
# against checksum() when check_checksum=True.
pretrain_checksum = {
    "glove.6B.300d.txt": "b78f53fb56ec1ce9edc367d2e6186ba4",
    "glove.twitter.27B.50d.txt": "6e8369db39aa3ea5f7cf06c1f3745b06",
    "glove.42B.300d.txt": "01fcdb413b93691a7a26180525a12d6e",
    "glove.6B.50d.txt": "0fac3659c38a4c0e9432fe603de60b12",
    "glove.6B.100d.txt": "dd7f3ad906768166883176d69cc028de",
    "glove.twitter.27B.25d.txt": "f38598c6654cba5e6d0cef9bb833bdb1",
    "glove.6B.200d.txt": "49fa83e4a287c42c6921f296a458eb80",
    "glove.840B.300d.txt": "eec7d467bccfa914726b51aac484d43a",
    "glove.twitter.27B.100d.txt": "ccbdddec6b9610196dd2e187635fee63",
    "glove.twitter.27B.200d.txt": "e44cdc3e10806b5137055eeb08850569",
}


def check_num_lines_in_glove(filename, check_checksum=False):
    """Return the known number of lines (vectors) in a pretrained GloVe file.

    Args:
        filename: Name of, or path to, a GloVe text file. Only the final path
            component is used to recognise the file, so both
            ``'glove.6B.50d.txt'`` and ``'/data/glove/glove.6B.50d.txt'`` work.
        check_checksum: When true, the file at ``filename`` is hashed with
            ``checksum`` and compared against ``pretrain_checksum``.

    Returns:
        The line count as an int.

    Raises:
        AssertionError: if the checksum does not match.
        KeyError: if the file is not one of the known GloVe distributions.
    """
    import os  # local import: the cell defining this helper does not import os

    # The lookup tables and prefix rules are keyed by bare file name; the
    # original code silently required callers to pass no directory part.
    basename = os.path.basename(filename)

    if check_checksum:
        # Hash the real file (full path) but look the digest up by name.
        assert checksum(filename) == pretrain_checksum[basename]
    if basename.startswith('glove.6B.'):
        return 400000
    elif basename.startswith('glove.twitter.27B.'):
        return 1193514
    else:
        return pretrain_num_lines[basename]


def load_glove_word2vec(filename):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace; the first field is the word
    and the remaining fields are parsed as a float32 NumPy vector.

    Args:
        filename: Path to the GloVe ``.txt`` file.

    Returns:
        dict mapping each word to its ``numpy.ndarray`` (dtype float32).
    """
    # load the whole embedding into memory
    embeddings_index = dict()
    # `with` guarantees the handle is released even if a line fails to parse.
    with open(filename) as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank (e.g. trailing) lines carry no vector; skip them
                # instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))

    return embeddings_index


### Data Handler


In [None]:
import sys

sys.path.append('../')
from collections import defaultdict
import re
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.wrappers import FastText
import numpy
from nltk.tokenize import TweetTokenizer
# import src.data_processing.glove2Word2vecLoader as glove
import itertools


# loading the emoji dataset
def load_unicode_mapping(path):
    """Read a tab-separated ``emoji<TAB>description`` file into a mapping.

    Every line is stripped and split on tabs; the first field becomes the key
    and the second the value. A line with fewer than two fields raises
    ``IndexError``.
    """
    emoji_dict = defaultdict()
    with open(path, 'r') as handle:
        for raw in handle:
            fields = raw.strip().split('\t')
            emoji_dict[fields[0]] = fields[1]
    return emoji_dict


def load_word2vec(path=None):
    """Load binary word2vec vectors from ``path`` via gensim ``KeyedVectors``."""
    return KeyedVectors.load_word2vec_format(path, binary=True)


def load_fasttext(path=None):
    """Load a fastText model from ``path`` via the gensim ``FastText`` wrapper."""
    return FastText.load_fasttext_format(path)


def InitializeWords(word_file_path):
    """Load a ``word<TAB>count`` file into a dictionary and prune noise entries.

    Lines are lower-cased, stripped and split on tabs; the second field is
    parsed as an int. Afterwards every single-letter entry except ``a`` and
    ``i`` is removed, together with a fixed list of short fragments.

    Returns:
        A ``defaultdict`` (no default factory) mapping word -> int count.
    """
    word_dictionary = defaultdict()

    with open(word_file_path, 'r') as handle:
        for raw in handle:
            fields = raw.lower().strip().split('\t')
            word_dictionary[fields[0]] = int(fields[1])

    # Single letters to drop: the whole alphabet minus 'a' and 'i'.
    single_letters = list("bcdefghjklmnopqrstuvwxyz")
    fragments = [
        'ann', 'assis',
        'bz',
        'ch', 'cre', 'ct',
        'di',
        'ed', 'ee',
        'ic',
        'le',
        'ng', 'ns',
        'pr', 'picon',
        'th', 'tle', 'tl', 'tr',
        'um',
        've',
        'yi',
    ]
    for unwanted in single_letters + fragments:
        # pop with a default is a no-op for entries that are not present.
        word_dictionary.pop(unwanted, None)

    return word_dictionary


def normalize_word(word):
    """Collapse runs of three or more identical letters down to two.

    The substitution is repeated until the word stops changing, e.g.
    ``'cooool'`` -> ``'cool'``. Only ASCII letters are affected.
    """
    triple = re.compile(r"([a-zA-Z])\1\1")
    current = word
    reduced = triple.sub(r"\1\1", current)
    # Each pass shortens a run by one letter; stop at the fixed point.
    while reduced != current:
        current = reduced
        reduced = triple.sub(r"\1\1", current)
    return reduced


def load_split_word(split_word_file_path):
    """Load pre-computed hashtag splits from a ``term<TAB>split`` file.

    Lines are lower-cased, stripped and split on tabs. Lines with fewer than
    two fields are ignored. The number of loaded entries is printed.

    Returns:
        A ``defaultdict`` (no default factory) mapping term -> split string.
    """
    split_word_dictionary = defaultdict()
    with open(split_word_file_path, 'r') as handle:
        for raw in handle:
            fields = raw.lower().strip().split('\t')
            if len(fields) < 2:
                continue
            split_word_dictionary[fields[0]] = fields[1]

    print('split entry found:', len(split_word_dictionary.keys()))
    return split_word_dictionary


def split_hashtags(term, wordlist, split_word_list, dump_file=''):
    """Split a hashtag into its component words.

    Resolution order, as implemented below:

    1. A term that is a single character after stripping yields ``['']``.
    2. If ``split_word_list`` contains the lower-cased term, its stored split
       (a space-separated string) is returned. Otherwise the term is printed.
    3. A leading ``#`` is removed; if ``wordlist`` contains the lower-cased
       remainder it is returned as a one-element list.
    4. Regex rules insert spaces around digit runs, ordinals, Capitalised
       words and runs of capitals.
    5. If that produced fewer than three pieces, a brute-force search tries
       every placement of 0 .. ``split_words_count - 1`` cut points and keeps
       the segmentation with the best average ``wordlist`` score.

    Args:
        term: The hashtag text (with or without the leading ``#``).
        wordlist: Mapping word -> frequency used for scoring, or ``None``.
            NOTE(review): step 5 uses ``in wordlist`` / ``wordlist.get``
            unconditionally, so ``None`` looks unsupported once the
            brute-force search is reached — confirm.
        split_word_list: Mapping of known terms to their split, or ``None``.
        dump_file: Unused by the active code (the debug dump at the bottom is
            commented out).

    Returns:
        list of str: the words the hashtag was split into.
    """
    # print('term::',term)

    if (len(term.strip()) == 1):
        return ['']

    if (split_word_list != None and term.lower() in split_word_list):
        # print('found')
        return split_word_list.get(term.lower()).split(' ')
    else:
        # Emits every term that had no pre-computed split.
        print(term)

    # discarding # if exists
    if (term.startswith('#')):
        term = term[1:]

    if (wordlist != None and term.lower() in wordlist):
        return [term.lower()]

    words = []
    # max freq
    # Score given to a segment that is neither in wordlist nor numeric; also
    # the initial best score, so any candidate scoring higher replaces it.
    penalty = -69971
    max_coverage = penalty

    # Upper bound (exclusive) on the number of cut points tried below.
    split_words_count = 6
    # checking camel cases
    # 1) space before each run of digits, 2) space after an ordinal suffix,
    # 3) space before a Capitalised word, 4) space before a run of 2+ capitals.
    term = re.sub(r'([0-9]+)', r' \1', term)
    term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
    term = re.sub(r'([A-Z][^A-Z ]+)', r' \1', term.strip())
    term = re.sub(r'([A-Z]{2,})+', r' \1', term)
    words = term.strip().split(' ')

    n_splits = 0

    if (len(words) < 3):
        # splitting lower case and uppercase words upto 5 words
        # (n_splits runs from 0 to split_words_count - 1 cut points)
        chars = [c for c in term.lower()]

        found_all_words = False

        # found_all_words is only re-checked here, so the current n_splits
        # round always runs over all of its combinations before stopping.
        while (n_splits < split_words_count and not found_all_words):
            for idx in itertools.combinations(range(0, len(chars)), n_splits):
                # Cut the character list at the chosen positions and re-join
                # each piece into a candidate word.
                output = numpy.split(chars, idx)
                line = [''.join(o) for o in output]

                # Average per-segment score: wordlist frequency for known
                # words, 0 for numeric segments, `penalty` for anything else.
                score = (1. / len(line)) * sum(
                    [wordlist.get(
                        word.strip()) if word.strip() in wordlist else 0. if word.strip().isnumeric() else penalty for
                     word in line])

                if (score > max_coverage):
                    words = line
                    max_coverage = score

                    line_is_valid_word = [word.strip() in wordlist if not word.isnumeric() else True for word in line]

                    # Only a candidate that improved the score can end the search.
                    if (all(line_is_valid_word)):
                        found_all_words = True

                    # uncomment to debug hashtag splitting
                    # print(line, score, line_is_valid_word)

            n_splits = n_splits + 1

    # removing hashtag sign
    # (the comprehension below only converts each piece to a plain str)
    words = [str(s) for s in words]

    # dumping splits for debug
#     with open(dump_file, 'a') as f:
#         if (term != '' and len(words) > 0):
#             f.write('#' + str(term).strip() + '\t' + ' '.join(words) + '\t' + str(n_splits) + '\n')

    return words


def load_abbreviation(path='/content/SarcasmDetection/resource/abbreviations.txt'):
    """Load an ``abbreviation<TAB>expansion`` file into a mapping.

    Each line is lower-cased, stripped and split on tabs; field 0 is the key
    and field 1 the value. A line with fewer than two fields raises
    ``IndexError``.
    """
    abbreviation_dict = defaultdict()
    with open(path) as handle:
        for raw in handle:
            fields = raw.lower().strip().split('\t')
            abbreviation_dict[fields[0]] = fields[1]
    return abbreviation_dict


def filter_text(text, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=False,
                split_hashtag=False,
                ignore_profiles=False,
                replace_emoji=True):
    """Clean a tokenised tweet and return the resulting token list.

    Tokens are processed in order; the first matching rule decides:

    * ``@profile`` tokens are dropped when ``ignore_profiles`` is set;
    * tokens starting with ``http`` are dropped;
    * sarcasm marker hashtags are dropped;
    * tokens found in ``emoji_dict`` are replaced by their description,
      split on ``_`` (only when ``replace_emoji`` is set);
    * other hashtags are split with ``split_hashtags`` when ``split_hashtag``
      is set; pieces already present in the output so far are not re-added;
    * remaining tokens are optionally normalised with ``normalize_word`` and
      expanded through ``abbreviation_dict`` before being appended.
    """
    # Markers used for the "Fracking sarcasm using neural network" setup; for
    # the onlinesarcasm data the list would be just ['#sarcasm'].
    sarcasm_markers = ['#sarcasm','#sarcastic', '#yeahright','#not']

    filtered_text = []

    for t in text:
        as_text = str(t)

        # profile mentions
        if ignore_profiles and as_text.startswith("@"):
            continue

        # links
        if as_text.startswith('http'):
            continue

        # sarcasm markers
        if as_text.lower() in sarcasm_markers:
            continue

        # emoji -> words of its unicode description
        if replace_emoji and t in emoji_dict:
            filtered_text.extend(emoji_dict.get(t).split('_'))
            continue

        # hashtags -> component words
        if split_hashtag and as_text.startswith("#"):
            splits = split_hashtags(t, word_list, split_word_list, dump_file='')
            if splits is not None:
                # Membership is tested against the output as it was before
                # this hashtag, so duplicates inside `splits` are kept.
                fresh = [s for s in splits if s not in filtered_text]
                filtered_text.extend(fresh)
                continue

        # squeeze repeated letters
        if normalize_text:
            t = normalize_word(t)

        # abbreviation -> expansion words
        if t in abbreviation_dict:
            filtered_text.extend(abbreviation_dict.get(t).split(' '))
            continue

        filtered_text.append(t)

    return filtered_text


def parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=False,
              split_hashtag=False,
              ignore_profiles=False,
              lowercase=False, replace_emoji=True, n_grams=None, at_character=False):
    """Parse tab-separated dataset lines into model-ready tuples.

    Expected columns per line: ``id``, ``label``, ``text`` and optionally
    ``dimensions`` (``name@@value`` pairs joined by ``|``), ``context`` and
    ``author``. The literal ``NA`` (any case, surrounding whitespace ignored)
    marks an absent dimensions/context field.

    Args:
        lines: Iterable of raw text lines.
        word_list, split_word_list, emoji_dict, abbreviation_dict,
        normalize_text, split_hashtag, ignore_profiles, replace_emoji:
            Accepted for interface compatibility; they feed ``filter_text``,
            whose call is currently disabled in this function, so they have
            no effect here.
        lowercase: Lower-case each line before parsing.
        n_grams: If not ``None``, append ``_``-joined n-grams of that order
            to the tweet tokens.
        at_character: Use the individual characters of the text as tokens
            instead of ``TweetTokenizer`` tokens.

    Returns:
        list of ``(id, label, tokens, dimensions, context, author)`` tuples;
        lines whose token list is empty are skipped and counted in the
        summary message.

    Raises:
        IndexError / ValueError: on a line with fewer than three columns or a
            non-integer label (parse errors are not swallowed).
    """
    data = []
    neglected = 0
    for i, line in enumerate(lines):
        if i % 100 == 0:
            print(str(i) + '...', end='', flush=True)

        # convert the line to lowercase
        if lowercase:
            line = line.lower()

        # split into columns
        token = line.split('\t')

        # ID (named tweet_id so the builtin id() is not shadowed)
        tweet_id = token[0]

        # label
        label = int(token[1].strip())

        # tweet text: tokenise only when the tokens are actually used
        raw_text = token[2].strip()
        if at_character:
            target_text = [c for c in raw_text]
        else:
            target_text = TweetTokenizer().tokenize(raw_text)

        if n_grams is not None:
            n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
            target_text.extend(['_'.join(n) for n in n_grams_list])

        # NOTE: the filter_text(...) clean-up of target_text is intentionally
        # left disabled, as in the original notebook.

        # awc dimensions. The sentinel is compared case-insensitively because
        # `lowercase=True` turns 'NA' into 'na' before we get here.
        dimensions = []
        if len(token) > 3 and token[3].strip().upper() != 'NA':
            dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')]

        # context tweet. Stripped before the comparison so a trailing newline
        # (context as last column) does not defeat the 'NA' check.
        context = []
        if len(token) > 4 and token[4].strip().upper() != 'NA':
            context = TweetTokenizer().tokenize(token[4].strip())
            # NOTE: filter_text(...) on the context is disabled as well.

        # author (kept verbatim, including any trailing newline)
        author = 'NA'
        if len(token) > 5:
            author = token[5]

        if len(target_text) != 0:
            data.append((tweet_id, label, target_text, dimensions, context, author))
        else:
            neglected += 1
    print('!!!!!!!!!  There are',neglected,'entries neglected  !!!!!!!!!!!!!!!!!!!!!!')
    return data


def load_resources(word_file_path, split_word_path, emoji_file_path, split_hashtag=False, replace_emoji=True):
    """Load the lexical resources used during preprocessing.

    The split-word table and the abbreviation table are always loaded. The
    word-frequency list is loaded only when ``split_hashtag`` is set and the
    emoji mapping only when ``replace_emoji`` is set; otherwise the
    corresponding return value is ``None``.

    Returns:
        ``(word_list, emoji_dict, split_word_list, abbreviation_dict)``
    """
    # Load order is kept: split words, word list, emoji map, abbreviations.
    split_word_list = load_split_word(split_word_path)
    word_list = InitializeWords(word_file_path) if split_hashtag else None
    emoji_dict = load_unicode_mapping(emoji_file_path) if replace_emoji else None
    abbreviation_dict = load_abbreviation()

    return word_list, emoji_dict, split_word_list, abbreviation_dict


def loaddata(filename, word_file_path, split_word_path, emoji_file_path, normalize_text=False, split_hashtag=False,
             ignore_profiles=False,
             lowercase=True, replace_emoji=True, n_grams=None, at_character=False):
    """Load the preprocessing resources, read ``filename`` and parse it.

    Thin wrapper around ``load_resources`` and ``parsedata``; all keyword
    options are forwarded unchanged. Note that ``lowercase`` defaults to
    ``True`` here, unlike in ``parsedata``.

    Returns:
        The list of tuples produced by ``parsedata``.
    """
    word_list, emoji_dict, split_word_list, abbreviation_dict = load_resources(word_file_path, split_word_path,
                                                                               emoji_file_path,
                                                                               split_hashtag=split_hashtag,
                                                                               replace_emoji=replace_emoji)
    # Context manager so the dataset file is closed deterministically (the
    # original left the handle to the garbage collector).
    with open(filename, 'r') as source:
        lines = source.readlines()

    data = parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=normalize_text,
                     split_hashtag=split_hashtag,
                     ignore_profiles=ignore_profiles, lowercase=lowercase, replace_emoji=replace_emoji,
                     n_grams=n_grams, at_character=at_character)
    return data


def build_vocab(data, without_dimension=True, ignore_context=False, min_freq=0):
    """Build a word -> index vocabulary from parsed data.

    Word frequencies are accumulated over the tweet tokens (tuple field 2),
    the dimension values (field 3, only when ``without_dimension`` is false)
    and the context tokens (field 4, unless ``ignore_context`` is true).
    Words whose frequency is at least ``min_freq`` receive consecutive
    indices starting at 1, in first-seen order.

    When ``without_dimension`` is false the strings ``'1'`` .. ``'100'`` are
    pre-registered with frequency 0, so they occupy the first vocabulary
    slots whenever ``min_freq`` is 0.

    Args:
        data: Iterable of tuples as produced by ``parsedata``.
        without_dimension: Skip the dimension field when true.
        ignore_context: Skip the context field when true.
        min_freq: Minimum frequency for a word to enter the vocabulary.

    Returns:
        ``defaultdict(int)`` mapping word -> index (1-based).
    """
    vocab = defaultdict(int)
    vocab_freq = defaultdict(int)

    if not without_dimension:
        for i in range(1, 101):
            vocab_freq[str(i)] = 0

    for token in data:
        # defaultdict(int) supplies the initial 0, so every source can use a
        # plain increment. The original reset context-word counts to 0 on
        # every occurrence (it tested the still-empty `vocab`) and raised on
        # dimension values that had not been seen before.
        for word in token[2]:
            vocab_freq[word] += 1

        if not without_dimension:
            for word in token[3]:
                vocab_freq[word] += 1

        if not ignore_context:
            for word in token[4]:
                vocab_freq[word] += 1

    total_words = 1
    for k, v in vocab_freq.items():
        if v >= min_freq:
            vocab[k] = total_words
            total_words += 1

    return vocab


def build_reverse_vocab(vocab):
    """Invert a word -> index mapping into index -> word.

    The result is a ``defaultdict(str)``, so looking up an unknown index
    yields the empty string. If several words share an index, the last one
    iterated wins.
    """
    rev_vocab = defaultdict(str)
    rev_vocab.update((index, word) for word, index in vocab.items())
    return rev_vocab


def build_auxiliary_feature(data):
    """Compute five punctuation/case counts per sample.

    For the token list of each sample the features are, in order: number of
    ``'!'`` tokens, ``'?'`` tokens, ``'.'`` tokens, tokens for which
    ``str.isupper()`` is true, and ``'"'`` tokens. Counts are per token
    (list elements), not per character.

    Returns:
        ``numpy.ndarray`` with one row of five floats per sample.
    """
    def _row(tokens):
        upper_count = sum([1.0 if c.isupper() else 0.0 for c in tokens])
        return [float(tokens.count('!')), float(tokens.count('?')), float(tokens.count('.')),
                upper_count, float(tokens.count('"'))]

    # Full unpacking keeps the requirement that every sample is a 6-tuple.
    aux = [_row(tokens) for _id, _label, tokens, _dimensions, _context, _author in data]

    return numpy.asarray(aux)


def vectorize_word_dimension(data, vocab, drop_dimension_index=None, verbose=False):
    """Convert parsed samples into index sequences.

    Args:
        data: Iterable of ``(id, label, tokens, dimensions, context, author)``
            tuples as produced by ``parsedata``.
        vocab: Mapping word -> index; must contain ``'unk'``, which is used
            for out-of-vocabulary words.
        drop_dimension_index: If not ``None``, position removed from every
            dimension vector.
        verbose: Print token and vocabulary coverage statistics.

    Returns:
        ``(X, Y, D, C, A)`` as NumPy arrays: tweet index sequences, labels,
        dimension vectors, context index sequences and authors. ``X`` and
        ``C`` are object arrays of lists when the sequences differ in length.
    """
    X, Y, D, C, A = [], [], [], [], []

    known_words_set = set()
    unknown_words_set = set()

    tokens = 0
    token_coverage = 0

    def _encode(words):
        """Map words to indices; return (indices, number of in-vocab words)."""
        indices = []
        hits = 0
        for word in words:
            if word in vocab:
                indices.append(vocab[word])
                hits += 1
                known_words_set.add(word)
            else:
                indices.append(vocab['unk'])
                unknown_words_set.add(word)
        return indices, hits

    def _to_array(rows):
        """numpy.asarray that tolerates ragged rows on newer NumPy releases."""
        try:
            return numpy.asarray(rows)
        except ValueError:
            # Newer NumPy refuses to build ragged arrays implicitly; an
            # explicit object array reproduces the historical result.
            return numpy.asarray(rows, dtype=object)

    for _id, label, line, dimensions, context, author in data:
        if len(dimensions) != 0:
            dvec = [vocab.get(d) for d in dimensions]
        else:
            # No dimensions supplied: fixed-width placeholder of 11 'unk' ids.
            dvec = [vocab.get('unk')] * 11

        if drop_dimension_index is not None:
            dvec.pop(drop_dimension_index)

        # tweet
        vec, hits = _encode(line)
        tokens += len(line)
        token_coverage += hits

        # context tweet: encode the context tokens themselves (the original
        # looped over the tweet tokens again, so C was just a copy of X).
        if len(context) != 0:
            context_vec, hits = _encode(context)
            tokens += len(context)
            token_coverage += hits
        else:
            context_vec = [vocab['unk']]

        X.append(vec)
        Y.append(label)
        D.append(dvec)
        C.append(context_vec)
        A.append(author)

    if verbose:
        # Guard the ratios so empty input reports 0.0 instead of raising.
        print('Token coverage:', token_coverage / float(tokens) if tokens else 0.0)
        vocab_size = len(vocab.keys())
        print('Word coverage:', len(known_words_set) / float(vocab_size) if vocab_size else 0.0)

    return _to_array(X), _to_array(Y), _to_array(D), _to_array(C), _to_array(A)


def pad_sequence_1d(sequences, maxlen=None, dtype='float32', padding='pre', truncating='pre', value=0.):
    """Pad/truncate variable-length sequences into a 2-D array.

    Args:
        sequences: Iterable of 1-D sequences (lists or arrays).
        maxlen: Target length. Defaults to the length of the longest sequence.
        dtype: dtype of the returned array.
        padding: ``'pre'`` (pad at the front) or ``'post'`` (pad at the end).
        truncating: ``'pre'`` keeps the last ``maxlen`` items, ``'post'``
            keeps the first ``maxlen`` items.
        value: Fill value for padded positions.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sequences), maxlen)``.

    Raises:
        ValueError: if ``padding`` or ``truncating`` is not ``'pre'``/``'post'``.
    """
    X = [vectors for vectors in sequences]

    if truncating not in ('pre', 'post'):
        # The original message interpolated `padding` here by mistake.
        raise ValueError("Truncating type '%s' not understood" % truncating)
    if padding not in ('pre', 'post'):
        raise ValueError("Padding type '%s' not understood" % padding)

    if maxlen is None:
        maxlen = max((len(s) for s in X), default=0)

    # numpy.full honours `value`; the original `zeros(...) * value` always
    # produced zeros regardless of the requested fill value.
    x = numpy.full((len(X), maxlen), value, dtype=dtype)

    for idx, s in enumerate(X):
        if truncating == 'pre':
            # Explicit start index: `s[-maxlen:]` would return everything for maxlen == 0.
            trunc = s[max(len(s) - maxlen, 0):]
        else:
            trunc = s[:maxlen]

        if len(trunc) == 0:
            # Nothing to copy; also avoids `x[idx, -0:]`, which addresses the
            # whole row and cannot take an empty assignment.
            continue

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        else:
            x[idx, -len(trunc):] = trunc

    return x


def write_vocab(filepath, vocab):
    """Write ``vocab`` to ``filepath`` as one ``key<TAB>value`` line per entry.

    The file is overwritten; entries appear in the mapping's iteration order.
    """
    with open(filepath, 'w') as fw:
        fw.writelines(str(word) + '\t' + str(index) + '\n' for word, index in vocab.items())


def get_fasttext_weight(vocab, n=300, path=None):
    """Build an embedding matrix for ``vocab`` from pretrained vectors.

    Row ``vocab[word]`` receives the first ``n`` components of the word's
    pretrained vector; words without a vector (and row 0) stay all-zero.

    NOTE(review): despite the name, the vectors are loaded with
    ``load_word2vec`` (binary word2vec format), not ``load_fasttext`` —
    confirm this is intended.

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocab) + 1, n)``.
    """
    vectors = load_word2vec(path=path)
    emb_weights = numpy.zeros((len(vocab.keys()) + 1, n))
    for word, row in vocab.items():
        if word in vectors:
            emb_weights[row, :] = vectors[word][:n]

    return emb_weights


def get_word2vec_weight(vocab, n=300, path=None):
    """Build an embedding matrix for ``vocab`` from binary word2vec vectors.

    Row ``vocab[word]`` receives the first ``n`` components of the word's
    pretrained vector; words without a vector (and row 0) stay all-zero.

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocab) + 1, n)``.
    """
    vectors = load_word2vec(path=path)
    emb_weights = numpy.zeros((len(vocab.keys()) + 1, n))
    for word, row in vocab.items():
        if word in vectors:
            emb_weights[row, :] = vectors[word][:n]

    return emb_weights


def load_glove_model(vocab, n=200, glove_path='/home/glove/glove.twitter.27B/glove.twitter.27B.200d.txt'):
    """Build an embedding matrix for ``vocab`` from a GloVe text file.

    Row ``vocab[word]`` is set to the word's GloVe vector when one exists;
    all other rows (including row 0) remain zero. ``n`` must equal the
    dimensionality of the vectors in ``glove_path``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocab) + 1, n)``.
    """
    glove_vectors = load_glove_word2vec(glove_path)

    embedding_matrix = numpy.zeros((len(vocab.keys()) + 1, n))
    for word, row in vocab.items():
        vector = glove_vectors.get(word)
        if vector is None:
            continue
        embedding_matrix[row] = vector

    return embedding_matrix


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Return copies of ``sequences`` with the indices of known n-grams appended.

    For every start position that leaves room for a full ``ngram_range``-gram,
    each n-gram of order 2 .. ``ngram_range`` beginning there is looked up in
    ``token_indice`` and, if present, its index is appended to the copy.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
    """
    orders = range(2, ngram_range + 1)
    new_sequences = []
    for original in sequences:
        augmented = original[:]
        # Number of start positions is fixed before anything is appended.
        n_starts = len(augmented) - ngram_range + 1
        for start in range(n_starts):
            for order in orders:
                candidate = tuple(augmented[start:start + order])
                if candidate in token_indice:
                    augmented.append(token_indice[candidate])
        new_sequences.append(augmented)

    return new_sequences


def create_ngram_set(input_list, ngram_value=2):
    """
    Return the set of distinct n-grams (as tuples) found in ``input_list``.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    # One progressively shifted view per n-gram position; zip stops at the
    # shortest view, which yields exactly the complete n-grams.
    shifted_views = [input_list[offset:] for offset in range(ngram_value)]
    return set(zip(*shifted_views))


def prepare_fasttext(x_train, x_test, max_features=20000, ngram_range=2):
    """Augment index sequences with n-gram features (fastText style).

    When ``ngram_range`` is greater than 1, every n-gram of order
    2 .. ``ngram_range`` seen in ``x_train`` is assigned a fresh index above
    ``max_features`` and appended to the sequences that contain it.

    Args:
        x_train: List of training index sequences (lists).
        x_test: List of test index sequences (lists).
        max_features: Highest index already used by the unigram vocabulary.
        ngram_range: Maximum n-gram order to add; 1 disables augmentation.

    Returns:
        ``(x_train, x_test, max_features)``: the (possibly augmented)
        sequences and the updated feature count. The original function
        computed these values but returned nothing, so they were lost;
        callers that ignored the return value are unaffected.
    """
    if ngram_range > 1:
        print('Adding {}-gram features'.format(ngram_range))
        # Create set of unique n-gram from the training set.
        ngram_set = set()
        for input_list in x_train:
            for i in range(2, ngram_range + 1):
                ngram_set.update(create_ngram_set(input_list, ngram_value=i))

        # Dictionary mapping n-gram token to a unique integer.
        # Integer values are greater than max_features in order
        # to avoid collision with existing features.
        start_index = max_features + 1
        token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}

        # max_features is the highest integer that could be found in the dataset.
        # Guarded so training data without any n-gram does not raise.
        if token_indice:
            max_features = max(token_indice.values()) + 1

        # Augmenting x_train and x_test with n-grams features
        x_train = add_ngram(x_train, token_indice, ngram_range)
        x_test = add_ngram(x_test, token_indice, ngram_range)
        print('Average train sequence length: {}'.format(numpy.mean(list(map(len, x_train)), dtype=int)))
        print('Average test sequence length: {}'.format(numpy.mean(list(map(len, x_test)), dtype=int)))

    return x_train, x_test, max_features

## Main code for the CNN-LSTM-DNN model

In [None]:
# Module-level handle for the pre-softmax sub-model. sarcasm_model._build_network
# assigns it through a `global` statement each time a network is built.
intermediate_layer_model = None

In [None]:
# for smaller datasets please use the simpler model sarcasm_detection_model_CNN_LSTM_DNN_simpler.py

import os
import sys

sys.path.append('../')

import collections
import time
import numpy

numpy.random.seed(1337)
from sklearn import metrics
from keras.models import Model
from keras.layers import Input
from keras.models import Sequential, model_from_json
from keras.layers.core import Dropout, Dense, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.utils import np_utils
from collections import defaultdict
# import src.data_processing.data_handler as dh


class sarcasm_model():
    """Base class holding shared configuration and the CNN-LSTM-DNN network builder.

    The class attributes below are placeholders (all ``None``) that the
    subclasses overwrite per instance with file paths, the vocabulary and the
    maximum sequence length.
    """
    _train_file = None
    _test_file = None
    _tweet_file = None
    _output_file = None
    _model_file_path = None
    _word_file_path = None
    _split_word_file_path = None
    _emoji_file_path = None
    _vocab_file_path = None
    _input_weight_file_path = None
    _vocab = None
    _line_maxlen = None

    def __init__(self):
        # Fixed sequence length used when padding inputs and building the network.
        self._line_maxlen = 30

    def _build_network(self, vocab_size, maxlen, emb_weights=[], embedding_dimension=256, hidden_units=256):
        """Build and compile the text classification network.

        Layer stack: Embedding -> Conv1D (hidden_units/4 filters) ->
        Conv1D (hidden_units/2 filters) -> LSTM -> LSTM -> Dense ->
        Dense(2, name='before_softmax') -> softmax.

        Args:
            vocab_size: Number of rows of the embedding table.
            maxlen: Length of the (padded) input sequences.
            emb_weights: Pretrained embedding matrix. When empty, a trainable
                embedding of width ``embedding_dimension`` is created;
                otherwise the given weights are used and frozen. The default
                is a shared list, but it is only read (``len``), never mutated.
            embedding_dimension: Width of the trainable embedding (ignored
                when ``emb_weights`` is given).
            hidden_units: Size of the LSTM and first Dense layers; the
                convolution filter counts are derived from it.

        Returns:
            The compiled Keras ``Model`` (categorical cross-entropy, Adam).

        Side effects:
            Rebinds the module-level ``intermediate_layer_model`` to a model
            that outputs the ``before_softmax`` layer, and prints the
            parameter count and model summary.
        """
        print('Build model...')

        text_input = Input(name='text', shape=(maxlen,))

        if (len(emb_weights) == 0):
            # No pretrained vectors: learn the embedding from scratch.
            emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen,
                            embeddings_initializer='glorot_normal',
                            trainable=True)(text_input)
        else:
            # Pretrained vectors: width comes from the matrix, weights stay frozen.
            emb = Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
                            trainable=False)(text_input)

        # NOTE(review): `input_shape` is passed to layers that are called on an
        # existing tensor; presumably it has no effect here — confirm.
        cnn1 = Convolution1D(int(hidden_units / 4), 3, kernel_initializer='he_normal', activation='sigmoid',
                             padding='valid', input_shape=(1, maxlen))(emb)

        cnn2 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal', activation='sigmoid',
                             padding='valid', input_shape=(1, maxlen - 1))(cnn1)

        # First LSTM returns the full sequence so the second LSTM can consume it.
        lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
                     dropout=0.25, return_sequences=True)(cnn2)

        lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
                     dropout=0.25)(lstm1)

        dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(lstm2)
        
        # The logits layer is kept separate (and named) so it can be exposed
        # through intermediate_layer_model below.
        dnn_2_without_softmax = Dense(2,name = 'before_softmax')(dnn_1)
        dnn_2 = Activation('softmax')(dnn_2_without_softmax)

        model = Model(inputs=[text_input], outputs=dnn_2)

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        print('No of parameter:', model.count_params())

        layer_name = 'before_softmax'
        global intermediate_layer_model 
        intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)
        
        print("intermediate_layer_model:",intermediate_layer_model)
        
        # NOTE(review): summary() presumably prints the table itself, so the
        # surrounding print would additionally emit its return value — confirm.
        print(model.summary())
        return model

Using TensorFlow backend.


In [None]:
class train_model(sarcasm_model):
    """Loads the training/validation data, builds the network and trains it.

    All work happens in ``__init__``: data loading, vocabulary construction
    and export, vectorisation, embedding lookup, model construction, and a
    single training epoch with checkpointing.
    """
    train = None
    validation = None
    # Runs once when the class body is evaluated (i.e. when the cell
    # executes), not per instance; kept for output compatibility.
    print("Loading resource...")

    def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
                 vocab_file,
                 output_file,
                 word2vec_path=None):
        """Train the sarcasm model.

        Args:
            train_file: Path to the tab-separated training data.
            validation_file: Path to the tab-separated validation data.
            word_file_path: Word-frequency list used for hashtag splitting.
            split_word_path: Pre-computed hashtag split table.
            emoji_file_path: Emoji-to-description mapping file.
            model_file: Path prefix under which the model JSON and weight
                checkpoints are written.
            vocab_file: Path the vocabulary is written to.
            output_file: Stored on the instance; not used during training.
            word2vec_path: Path to the binary word2vec vectors.
        """
        sarcasm_model.__init__(self)

        self._train_file = train_file
        self._validation_file = validation_file
        self._word_file_path = word_file_path
        self._split_word_file_path = split_word_path
        self._emoji_file_path = emoji_file_path
        self._model_file = model_file
        self._vocab_file_path = vocab_file
        self._output_file = output_file

        self.load_train_validation_data()

        print(self._line_maxlen)

        # build vocabulary
        # truncates words with min freq=1
        self._vocab = build_vocab(self.train, min_freq=1)
        if ('unk' not in self._vocab):
            self._vocab['unk'] = len(self._vocab.keys()) + 1

        print(len(self._vocab.keys()) + 1)
        print('unk::', self._vocab['unk'])

        write_vocab(self._vocab_file_path, self._vocab)

        # prepares training input
        X, Y, D, C, A = vectorize_word_dimension(self.train, self._vocab)
        X = pad_sequence_1d(X, maxlen=self._line_maxlen)

        # prepares validation input
        tX, tY, tD, tC, tA = vectorize_word_dimension(self.validation, self._vocab)
        tX = pad_sequence_1d(tX, maxlen=self._line_maxlen)

        # embedding dimension
        dimension_size = 300

        W = get_word2vec_weight(self._vocab, n=dimension_size,
                                   path=word2vec_path)

        # solving class imbalance
        # Keras documents `class_weight` as a dict mapping class index to
        # weight. The original passed a bare list ordered by first occurrence
        # of each label, which is not keyed by class index.
        label_counts = self.calculate_label_ratio(Y)
        largest = max(label_counts.values())
        ratio = {int(key): largest / value for key, value in label_counts.items()}
        print('class ratio::', ratio)

        Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

        print('train_X', X.shape)
        print('train_Y', Y.shape)
        print('validation_X', tX.shape)
        print('validation_Y', tY.shape)

        model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=256, emb_weights=W)

        # Context manager so the architecture file is flushed and closed
        # before training starts.
        with open(self._model_file + 'model.json', 'w') as model_json:
            model_json.write(model.to_json())
        save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True)
        save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}.hdf5',
                                   save_best_only=False)
        early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

        # training
        model.fit(X, Y, batch_size=64, epochs=1, validation_data=(tX, tY), shuffle=True,
                  callbacks=[save_best, save_all, early_stopping], class_weight=ratio, verbose=2)

    def load_train_validation_data(self):
        """Load the training and validation sets (and the test set, if configured)."""
        self.train = loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
                                 self._emoji_file_path, normalize_text=True,
                                 split_hashtag=True,
                                 ignore_profiles=False)
        print('Training data loading finished...')

        self.validation = loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
                                      self._emoji_file_path,
                                      normalize_text=True,
                                      split_hashtag=True,
                                      ignore_profiles=False)
        print('Validation data loading finished...')

        if (self._test_file != None):
            # The split-word and emoji paths are required positional arguments
            # of loaddata; the original call omitted them and would have
            # raised a TypeError.
            self.test = loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
                                    self._emoji_file_path, normalize_text=True,
                                    split_hashtag=True,
                                    ignore_profiles=True)

    def get_maxlen(self):
        """Return the longest token sequence over the training and validation data."""
        # Samples are 6-tuples (id, label, tokens, dimensions, context,
        # author); the tokens live at index 2. The original unpacked each
        # sample as a pair and failed.
        return max(len(sample[2]) for sample in self.train + self.validation)

    def write_vocab(self):
        """Write ``self._vocab`` to ``self._vocab_file_path`` as ``key<TAB>value`` lines."""
        with open(self._vocab_file_path, 'w') as fw:
            # dict.iteritems() does not exist on Python 3.
            for key, value in self._vocab.items():
                fw.write(str(key) + '\t' + str(value) + '\n')

    def calculate_label_ratio(self, labels):
        """Return a ``collections.Counter`` with the number of samples per label."""
        return collections.Counter(labels)

Loading resource...


In [None]:
class test_model(sarcasm_model):
    """Vectorize a test file with the saved vocabulary and dump network outputs.

    NOTE(review): ``self.model`` is never assigned by this class.  Scoring goes
    through the notebook-level global ``intermediate_layer_model``, so that
    object has to exist before ``predict`` is called (presumably it is created
    by the training cell -- confirm).
    """
    test = None   # samples returned by loaddata(); populated by predict()
    model = None  # left unset by this class, see the class docstring

    def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
                 input_weight_file_path=None):
        """Remember the resource paths needed at prediction time.

        Args:
            model_file: stored as ``_model_file_path``; not read by this class.
            word_file_path: forwarded to ``loaddata`` in ``predict``.
            split_word_path: forwarded to ``loaddata`` in ``predict``.
            emoji_file_path: forwarded to ``loaddata`` in ``predict``.
            vocab_file_path: tab-separated vocabulary file read by ``load_vocab``.
            output_file: prefix of the ``.analysis`` file written after scoring.
            input_weight_file_path: stored only; not read by this class.
        """
        print('initializing...')
        sarcasm_model.__init__(self)

        self._model_file_path = model_file
        self._word_file_path = word_file_path
        self._split_word_file_path = split_word_path
        self._emoji_file_path = emoji_file_path
        self._vocab_file_path = vocab_file_path
        self._output_file = output_file
        self._input_weight_file_path = input_weight_file_path

        print('test_maxlen', self._line_maxlen)

    def load_vocab(self):
        """Read the ``key<TAB>value`` vocabulary file into a mapping.

        Returns:
            A ``defaultdict`` without a default factory (it behaves like a
            plain dict) mapping each key to its value as a string.  Blank
            lines are skipped and the line terminator is no longer kept as
            part of the value.

        Raises:
            ValueError: if a non-blank line does not contain exactly one tab.
        """
        vocab = defaultdict()
        with open(self._vocab_file_path, 'r') as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    # Tolerate a trailing or stray empty line instead of crashing.
                    continue
                key, value = line.split('\t')
                vocab[key] = value

        return vocab

    def predict(self, test_file, verbose=False):
        """Load ``test_file``, vectorize it and write the network outputs.

        Args:
            test_file: path handed to ``loaddata``.
            verbose: when truthy, print how long loading and vectorization took.

        Raises:
            Exception: any failure is printed with an ``Error:`` prefix and
                then re-raised unchanged.
        """
        try:
            start = time.time()
            self.test = loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
                                 normalize_text=True, split_hashtag=True,
                                 ignore_profiles=False)
            end = time.time()
            if verbose:
                print('test resource loading time::', (end - start))

            self._vocab = self.load_vocab()
            print('vocab loaded...')

            start = time.time()
            # Only the first of the five returned values is needed here.
            tX, _, _, _, _ = vectorize_word_dimension(self.test, self._vocab)
            tX = pad_sequence_1d(tX, maxlen=self._line_maxlen)
            end = time.time()
            if verbose:
                print('test resource preparation time::', (end - start))

            self.__predict_model(tX, self.test)
        except Exception as e:
            print('Error:', e)
            raise

    def __predict_model(self, tX, test):
        """Score ``tX`` and write one line per sample to ``<output_file>.analysis``.

        Each line is tab-separated: output[0], output[1], gold label
        (``test[i][1]``), the full output vector, and the space-joined words
        (``test[i][2]``).  The collected output vectors are printed at the end.

        The softmax-based ``self.model.predict`` path and the sklearn metrics
        that used to follow it are disabled; the values written here come from
        ``intermediate_layer_model`` (presumably pre-softmax activations,
        judging by the original variable name -- confirm).
        """
        gold_labels = []
        y_pred = []

        without_softmax_prob = intermediate_layer_model.predict(tX, batch_size=1, verbose=1)

        try:
            # ``with`` guarantees the file is closed even when a row fails;
            # previously the handle leaked on any exception inside the loop.
            with open(self._output_file + '.analysis', 'w') as fd:
                for i, label in enumerate(without_softmax_prob):
                    gold_label = test[i][1]
                    words = test[i][2]

                    predicted = np.array(label)

                    # int() keeps the fail-fast check on malformed gold labels.
                    gold_labels.append(int(gold_label))
                    y_pred.append(list(predicted))

                    fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
                             + str(gold_label) + '\t'
                             + str(predicted) + '\t'
                             + ' '.join(words))

                    fd.write('\n')
        except Exception as e:
            print(e)
            raise
        print(y_pred)
        
        

In [None]:
if __name__ == "__main__":
    print("......................")

    # The repository root is the parent of the current working directory
    # (everything before the last '/' of the cwd).
    current_dir = os.getcwd()
    basepath = current_dir[:current_dir.rfind('/')]

    # Lexical resources used while loading and normalizing text.
    word_file_path = basepath + '/resource/word_list_freq.txt'
    split_word_path = basepath + '/resource/word_split.txt'
    emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'

    # Data splits.
    train_file = basepath + '/resource/train/Train_v1.txt'
    validation_file = basepath + '/resource/dev/Dev_v1.txt'
    test_file = basepath + '/resource/test/data_file.txt'

    # Model artefacts and outputs.
    model_file = basepath + '/resource/text_model/weights/'
    vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
    output_file = basepath + '/resource/text_model/TestResults.txt'
    word2vec_path = basepath + '/resource/text_model/weights/GoogleNews-vectors-negative300.bin'

    # Training step.  Judging by the cell output, constructing train_model
    # already loads the data and builds the network -- confirm against the
    # class definition.
    tr = train_model(
        train_file=train_file,
        validation_file=validation_file,
        word_file_path=word_file_path,
        split_word_path=split_word_path,
        emoji_file_path=emoji_file_path,
        model_file=model_file,
        vocab_file=vocab_file_path,
        output_file=output_file,
        word2vec_path=word2vec_path,
    )
    print("......................")

    # Prediction step on the test file.
    t = test_model(
        model_file=model_file,
        word_file_path=word_file_path,
        split_word_path=split_word_path,
        emoji_file_path=emoji_file_path,
        vocab_file_path=vocab_file_path,
        output_file=output_file,
    )
    t.predict(test_file)

......................
split entry found: 10942
0...100...200...300...400...500...600...700...800...900...1000...1100...1200...1300...1400...1500...1600...1700...1800...1900...2000...2100...2200...2300...2400...2500...2600...2700...2800...2900...3000...3100...3200...3300...3400...3500...3600...3700...3800...3900...4000...4100...4200...4300...4400...4500...4600...4700...4800...4900...5000...5100...5200...5300...5400...5500...5600...5700...5800...5900...6000...6100...6200...6300...6400...6500...6600...6700...6800...6900...7000...7100...7200...7300...7400...7500...7600...7700...7800...7900...8000...8100...8200...8300...8400...8500...8600...8700...8800...8900...9000...9100...9200...9300...9400...9500...9600...9700...9800...9900...10000...10100...10200...10300...10400...10500...10600...10700...10800...10900...11000...11100...11200...11300...11400...11500...11600...11700...11800...11900...12000...12100...12200...12300...12400...12500...12600...12700...12800...12900...13000...13100...13200...

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


class ratio:: [1.0, 1.151665945478148]
train_X (39780, 30)
train_Y (39780, 2)
validation_X (1605, 30)
validation_Y (1605, 2)
Build model...











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


No of parameter: 12637726
intermediate_layer_model: <keras.engine.training.Model object at 0x7f6358719a90>
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text (InputLayer)            (None, 30)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 30, 300)           11569500  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 28, 64)            57664     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 26, 128)           24704     
______________