In [3]:
import fastText
import math
import linecache
import numpy as np 
from numpy import random
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import *
from keras import regularizers
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import re
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score


In [4]:
# Restrict this kernel to a single GPU: order devices by PCI bus id and expose only device 2.
# NOTE(review): the device index is machine-specific — adjust it for the host running the notebook.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=2

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=2


In [5]:
# Load the pre-trained fastText binary model used for the word-vector lookups below.
# NOTE(review): absolute, user-specific path — consider making the location configurable.
ft = fastText.load_model("/home/jindal/notebooks/fastText/wiki.de.bin")

# Width of one fastText word vector, as reported by the loaded model.
nb_embedding_dims = ft.get_dimension()
# Fixed number of token slots per sentence; the feature builders pad/truncate to this length.
nb_sequence_length = 75

In [None]:
# The Lambda layer below needs the Keras backend. Import it explicitly instead of
# relying on `from keras.layers import *` happening to leak a `K` name.
from keras import backend as K

# When True, one attention vector (averaged over the feature axis) is shared by
# all input features; when False every feature gets its own attention weights.
SINGLE_ATTENTION_VECTOR = True
def attention_3d_block(inputs):
    """Scale every time step of ``inputs`` by a learned softmax attention weight.

    Args:
        inputs: Keras tensor of shape (batch_size, time_steps, input_dim).
            ``time_steps`` must equal the global ``nb_sequence_length`` because
            the Dense layer producing the scores is sized with it.

    Returns:
        Keras tensor with the same shape as ``inputs``: the element-wise
        product of ``inputs`` and the attention weights (layer name
        ``attention_mul``). The weights themselves are exposed by the layer
        named ``attention_vec``.
    """
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    # Move time to the last axis so the Dense softmax normalises over time steps.
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, nb_sequence_length))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(nb_sequence_length, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        # Average the per-feature scores into one vector, then broadcast it
        # back to every feature.
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    # Back to (batch_size, time_steps, input_dim) so it lines up with `inputs`.
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = multiply([inputs, a_probs], name='attention_mul')
    return output_attention_mul

In [7]:
# Sanity check: ask the fastText model for the vector of an emoji token and print it.
print(ft.get_word_vector("🤢"))

[ 0.18421504  0.16461958  0.07163134 -0.3582153   0.6343416   0.57077825
 -0.44131115  0.47140062  0.35997692  1.0691209   0.5383094  -0.00801403
  0.58150095 -0.24640413  0.07941529 -0.7907748  -0.64057297  0.87790126
  0.1222318   0.9839732  -0.14147948  0.2741151   0.14327082  0.7455819
 -0.58181334 -0.0139227   0.13299793  0.11719222 -0.03907198 -0.98190314
  0.6551781  -0.08076547  0.39160377  0.5933445  -0.29222807 -0.02020206
 -0.17795676 -0.32914364 -0.9572954  -0.15258092 -0.0530088   0.31237698
  0.37407503  0.61072457 -0.0205325   0.00588962  0.3607436   0.3082963
 -0.3130489  -0.4344106  -0.4184202  -0.16960411  0.5402667  -0.00491837
  0.11402972 -0.3362505   0.5770166   0.13003364 -0.2651122   0.28100345
  0.07081287 -0.00930306 -0.18135485 -0.2852216  -0.04528273 -0.10418656
 -0.39689147  0.06003198 -0.20699514  0.13569123 -0.24864858 -0.1452383
 -0.08365332  0.04030139  0.11422046  0.16102162  0.2925926   0.40926033
 -0.49666372 -0.6249165  -0.57006294 -0.1205303  -0.35

In [6]:
# Raw-string patterns: regex escapes such as \w, \| or \? are not valid Python
# string escapes and trigger warnings in non-raw literals. The patterns are
# compiled once here instead of being rebuilt on every call.
_MENTION_RE = re.compile(r'@[\w_]+')
_LINE_BREAK_RE = re.compile(r'\|LBR\|')
_ELLIPSIS_RE = re.compile(r'\.\.\.+')
_EXCLAMATION_RE = re.compile(r'!!+')
_QUESTION_RE = re.compile(r'\?\?+')
# A token is: one astral-plane character (e.g. an emoji), a run of word
# characters/hyphens, or a run of anything else that is not a space.
_TOKEN_RE = re.compile(r'[\U00010000-\U0010ffff]|[\w-]+|[^ \w\U00010000-\U0010ffff]+', re.UNICODE)

def twitter_tokenizer(textline):
    """Normalise a tweet and split it into tokens.

    Normalisation steps, in order:
      * ``@name`` mentions become the literal ``USER_MENTION``;
      * ``|LBR|`` markers are removed;
      * runs of three or more dots collapse to ``...``;
      * runs of two or more ``!`` / ``?`` collapse to ``!!`` / ``??``.

    Args:
        textline: the raw text of one tweet.

    Returns:
        list of str: the non-empty tokens in order of appearance.
    """
    textline = _MENTION_RE.sub('USER_MENTION', textline)
    textline = _LINE_BREAK_RE.sub('', textline)
    textline = _ELLIPSIS_RE.sub('...', textline)
    textline = _EXCLAMATION_RE.sub('!!', textline)
    textline = _QUESTION_RE.sub('??', textline)
    # The "anything else" alternative can capture tabs/newlines; strip each
    # match and drop the ones that end up empty.
    stripped = (token.strip() for token in _TOKEN_RE.findall(textline.strip()))
    return [token for token in stripped if token != '']

In [None]:
def random_generator(features, labels, batch_size):
    """Yield endless batches of samples drawn uniformly at random (with replacement).

    Args:
        features: indexable collection of raw text lines.
        labels: indexable collection of label rows aligned with ``features``;
            each row must fit into a length-2 vector.
        batch_size: number of samples per yielded batch.

    Yields:
        ``([ft_batch, lg_batch], label_batch)`` where ``ft_batch`` has shape
        (batch_size, nb_sequence_length, nb_embedding_dims), ``lg_batch`` has
        shape (batch_size, nb_sequence_length, nb_embedding2_dims) and
        ``label_batch`` has shape (batch_size, 2).
    """
    while True:
        # Allocate fresh buffers for every batch. The previous version reused
        # one set of arrays, so a consumer that holds on to a yielded batch
        # (e.g. Keras' generator queue) would see it overwritten by the next one.
        batch_features_ft = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
        batch_features_lg = np.zeros((batch_size, nb_sequence_length, nb_embedding2_dims))
        batch_labels = np.zeros((batch_size, 2))

        for i in range(batch_size):
            index = random.choice(len(features), 1)[0]
            batch_features_ft[i], batch_features_lg[i] = process_features(features[index], nb_sequence_length, nb_embedding_dims)
            batch_labels[i] = labels[index]
        yield [batch_features_ft, batch_features_lg], batch_labels

In [16]:
def sequential_generator(filename, batch_size):
    """Yield endless batches read line by line, in file order, from a TSV file.

    Each line is expected to hold ``text<TAB>label``. The label is encoded as
    class 0 when it equals ``'OTHER'`` and class 1 otherwise, then one-hot
    encoded with ``n_labels`` classes.

    Args:
        filename: path of the UTF-8 encoded, tab-separated file.
        batch_size: maximum number of samples per yielded batch.

    Yields:
        ``([ft_batch, lg_batch], label_batch)``. Batches hold ``batch_size``
        samples, except the last batch of each pass over the file, which holds
        only the remaining lines. After it, reading restarts at line 1.

    Raises:
        IndexError: if the file is empty or a line has no tab-separated label.
    """
    # Count lines with a context manager so the handle is closed again.
    with open(filename, encoding = 'UTF-8') as handle:
        file_length = sum(1 for _ in handle)
    # linecache line numbers are 1-based. The order is plain file order.
    line_numbers = range(1, file_length + 1)
    index_position = 0

    while True:
        # Fresh buffers per batch: reusing one set of arrays lets a later
        # batch overwrite an earlier one that the consumer still holds.
        batch_features_ft = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
        batch_features_lg = np.zeros((batch_size, nb_sequence_length, nb_embedding2_dims))
        batch_labels = np.zeros((batch_size, n_labels))
        filled = 0

        for i in range(batch_size):
            line = linecache.getline(filename, line_numbers[index_position])
            data = line.strip().split('\t')
            batch_features_ft[i], batch_features_lg[i] = process_features(data[0], nb_sequence_length, nb_embedding_dims)
            batch_labels[i] = to_categorical(0 if data[1] == 'OTHER' else 1, n_labels)
            filled = i + 1
            index_position += 1
            if index_position == file_length:
                # End of file: close this batch early and start over.
                index_position = 0
                break
        # Trim a partial batch so no unfilled rows are passed on as samples
        # (same policy as charcnn_sequential_generator).
        yield [batch_features_ft[:filled], batch_features_lg[:filled]], batch_labels[:filled]
        # yield [batch_features_ft], batch_labels

In [54]:
# Cache of fastText vectors, keyed by the token exactly as it appears in the text.
word_vectors_ft = {}
def process_features(textline, nb_sequence_length, nb_embedding_dims):
    """Turn one text line into two left-padded embedding matrices.

    The line is tokenised with ``twitter_tokenizer`` and cut to at most
    ``nb_sequence_length`` tokens. The tokens fill the LAST rows of both
    matrices; the leading rows stay zero.

    Args:
        textline: raw text of one sample.
        nb_sequence_length: number of rows (token slots) of both matrices.
        nb_embedding_dims: width of the fastText matrix.

    Returns:
        ``(features_ft, features_lg)``:
          * ``features_ft`` (nb_sequence_length, nb_embedding_dims): fastText
            vector of the lower-cased token;
          * ``features_lg`` (nb_sequence_length, nb_embedding2_dims): row of
            ``wordEmbeddings`` for the token, or the ``UNKNOWN_TOKEN`` row
            when the token is not in ``word2Idx``.
    """
    tokens = twitter_tokenizer(textline)[:nb_sequence_length]
    features_ft = np.zeros((nb_sequence_length, nb_embedding_dims))
    features_lg = np.zeros((nb_sequence_length, nb_embedding2_dims))

    first_row = nb_sequence_length - len(tokens)
    for row, token in enumerate(tokens, start=first_row):
        # fastText lookup, memoised per surface form.
        if token not in word_vectors_ft:
            word_vectors_ft[token] = ft.get_word_vector(token.lower())
        features_ft[row] = word_vectors_ft[token]

        # Second embedding table: case-sensitive lookup with unknown fallback.
        lookup_key = token if token in word2Idx else "UNKNOWN_TOKEN"
        features_lg[row] = wordEmbeddings[word2Idx[lookup_key]]

    return features_ft, features_lg

In [None]:
def charcnn_sequential_generator(filename, batch_size):
    """Yield endless word/character feature batches read in file order from a TSV file.

    Each line is expected to hold ``text<TAB>label``. The label is encoded as
    class 0 when it equals ``'OTHER'`` and class 1 otherwise, then one-hot
    encoded with ``n_labels`` classes.

    Args:
        filename: path of the UTF-8 encoded, tab-separated file.
        batch_size: maximum number of samples per yielded batch.

    Yields:
        ``([word_batch, char_batch], label_batch)``. The last batch of each
        pass over the file is trimmed to the remaining lines. After it,
        reading restarts at line 1.

    Raises:
        IndexError: if the file is empty or a line has no tab-separated label.
    """
    # Count lines with a context manager so the handle is closed again.
    with open(filename, encoding = 'UTF-8') as handle:
        file_length = sum(1 for _ in handle)
    # linecache line numbers are 1-based. The order is plain file order.
    shuffled_indexes = range(1, file_length + 1)
    index_position = 0

    while True:
        batch_word_embeddings = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
        batch_char_embeddings = []
        # Width follows n_labels so it always matches to_categorical below.
        batch_labels = np.zeros((batch_size, n_labels))

        for i in range(batch_size):
            line = linecache.getline(filename, shuffled_indexes[index_position])
            data = line.strip().split('\t')
            batch_word_embeddings[i], tmp_char_embeddings = charcnn_process_features(data[0], nb_sequence_length, nb_embedding_dims)
            batch_char_embeddings.append(tmp_char_embeddings)
            batch_labels[i] = to_categorical(0 if data[1] == 'OTHER' else 1, n_labels)
            index_position += 1
            if index_position == file_length:
                # End of file: close this batch early and start over.
                index_position = 0
                break

        filled = len(batch_char_embeddings)
        if filled < batch_size:
            # Partial batch: drop the rows that were never written.
            batch_word_embeddings = batch_word_embeddings[:filled]
            batch_labels = batch_labels[:filled]
        yield [batch_word_embeddings, np.array(batch_char_embeddings)], batch_labels

# Cache of fastText vectors, keyed by the token exactly as it appears in the text.
# NOTE(review): process_features uses a cache of the same name; running this
# cell resets it.
word_vectors_ft = {}
def charcnn_process_features(textline, nb_sequence_length, nb_embedding_dims, max_word_length = 52):
    """Turn one text line into left-padded word-vector and character-index matrices.

    The line is tokenised with ``twitter_tokenizer`` and cut to at most
    ``nb_sequence_length`` tokens. The tokens fill the LAST rows of both
    matrices; the leading rows stay zero.

    Args:
        textline: raw text of one sample.
        nb_sequence_length: number of rows (token slots) of both matrices.
        nb_embedding_dims: width of the word-vector matrix.
        max_word_length: number of character slots per token. Longer tokens
            are truncated to this many characters. Defaults to 52, the width
            that was previously hard-coded.

    Returns:
        ``(word_embeddings, char_embeddings)``:
          * ``word_embeddings`` (nb_sequence_length, nb_embedding_dims):
            fastText vector of the lower-cased token;
          * ``char_embeddings`` (nb_sequence_length, max_word_length):
            ``char2Idx`` index of each character, zero-padded on the right.

    Raises:
        KeyError: if a character is missing from ``char2Idx``.
    """
    words = twitter_tokenizer(textline)
    word_embeddings = np.zeros((nb_sequence_length, nb_embedding_dims))
    char_embeddings = np.zeros((nb_sequence_length, max_word_length))
    max_words = min(len(words), nb_sequence_length)
    idx = nb_sequence_length - len(words[:max_words])
    for w in words[:max_words]:
        if w in word_vectors_ft:
            wv = word_vectors_ft[w]
        else:
            wv = ft.get_word_vector(w.lower())
            word_vectors_ft[w] = wv
        word_embeddings[idx] = wv

        # Only the first max_word_length characters fit into the row. Without
        # the slice a longer token raised IndexError when writing past the end.
        for pos, char in enumerate(w[:max_word_length]):
            char_embeddings[idx][pos] = char2Idx[char]

        idx = idx + 1
    return word_embeddings, char_embeddings

In [7]:
# Read the tab-separated train/dev files into lists of column lists.
# Context managers make sure both file handles are closed after reading.
with open('../OffLang/sample_train.txt', encoding = "UTF-8") as train_file:
    train_lines = [line.strip().split("\t") for line in train_file]
with open('../OffLang/sample_dev.txt', encoding = "UTF-8") as dev_file:
    dev_lines = [line.strip().split("\t") for line in dev_file]

In [8]:
# Column 0 of each row is the text, column 1 the label string.
# Labels are binarised (0 for "OTHER", 1 for anything else) and then one-hot encoded.
train_sentences = [row[0] for row in train_lines]
train_labels = to_categorical([int(row[1] != "OTHER") for row in train_lines])

dev_sentences = [row[0] for row in dev_lines]
dev_labels = to_categorical([int(row[1] != "OTHER") for row in dev_lines])

In [9]:
# Number of output classes; labels in this notebook are encoded as 0 for "OTHER" and 1 otherwise.
n_labels = 2

In [None]:
# Collect every distinct character of the train and dev sentences, in order of
# first appearance (train first), then number them consecutively from 0.
characters = {
    char: True
    for sentence in (*train_sentences, *dev_sentences)
    for char in sentence
}
char2Idx = {char: position for position, char in enumerate(characters)}

In [52]:
# Build the second embedding table from a text file with one
# "<word> <52 floats>" entry per line.
#   word2Idx:       token -> row index in wordEmbeddings
#   wordEmbeddings: row 0 = zero vector for PADDING_TOKEN,
#                   row 1 = random vector for UNKNOWN_TOKEN,
#                   rows 2.. = vectors read from the file.
word2Idx = {}
wordEmbeddings = []

# fEmbeddings = open("../embeddings/model_levy_goldberg_extended.embeddings", encoding="UTF-8")
# The context manager closes the file once it has been read.
with open("../embeddings/embed_tweets_de_300M_52D", encoding="UTF-8") as fEmbeddings:
    for line in fEmbeddings:
        split = line.strip().split(" ")
        word = split[0]
        # Skip anything that is not "<word> + 52 values" (e.g. a header line).
        if len(split)-1 != 52:
            continue

        if len(word2Idx) == 0: # Add padding+unknown before the first real word
            word2Idx["PADDING_TOKEN"] = len(wordEmbeddings)
            vector = np.zeros(len(split)-1) # Zero vector for the padding token
            wordEmbeddings.append(vector)

            word2Idx["UNKNOWN_TOKEN"] = len(wordEmbeddings)
            vector = np.random.uniform(-0.25, 0.25, len(split)-1)
            wordEmbeddings.append(vector)

        # Take the index from the embedding list, not from len(word2Idx): a word
        # that occurs twice in the file does not grow the dict, which used to
        # shift every later index off its vector by one row.
        vector = np.array([float(num) for num in split[1:]])
        word2Idx[word] = len(wordEmbeddings)
        wordEmbeddings.append(vector)

wordEmbeddings = np.array(wordEmbeddings)

In [53]:
# Width of the second embedding table, read off the UNKNOWN_TOKEN row (row 1).
nb_embedding2_dims = wordEmbeddings[1].shape[0]
print(nb_embedding2_dims)

# Spot-check two words: membership in the vocabulary, then the stored vector.
for probe_word in ('für', 'ute'):
    print(probe_word in word2Idx)
    print(wordEmbeddings[word2Idx[probe_word]])


52
True
[-0.479428 -1.072284 -0.234882 -0.181851 -0.38546  -0.047229 -1.079844
  0.999567  1.748121  0.26788   1.517133 -0.569696 -0.616138  0.537412
 -0.120462  0.897762 -1.192519 -0.96012  -0.18896   0.322002 -0.759193
  0.037441  0.448546  0.050203  0.498242 -0.367505  0.593616  0.654069
  0.237761 -0.970336 -0.04762  -0.596377 -0.428545 -0.933031  0.151061
 -0.417189  0.069623  0.545726  0.97318  -0.683228  0.729776 -0.954739
  0.084312 -0.679187 -0.35613   0.499245 -0.606557 -0.989023  0.940378
 -0.685228 -0.266337  0.141203]
True
[ 0.561929 -0.02499   0.393823 -0.18884   0.010402  0.308434 -0.196201
  0.172635  0.354278  0.018807  0.225498 -0.085069 -0.040902  0.04759
  0.150173 -0.249061 -0.144079  0.016082 -0.429589 -0.296965  0.27181
 -0.033312  0.156146  0.117304  0.447623  0.019845 -0.03712   0.109375
 -0.07226  -0.301825  0.132319 -0.856695 -0.17947  -0.232972  0.234949
  0.1591    0.026851 -0.361166 -0.297852  0.250002 -0.413429 -0.511242
 -0.384613  0.021699  0.051884 -0.

In [None]:
# Baseline: a single LSTM over the fastText sequence followed by a small dense head.
# NOTE(review): this model takes one input, while sequential_generator yields two;
# `model` is also rebound by the two-input model built further down.
model = Sequential([
    LSTM(64, recurrent_dropout = 0.5, dropout = 0.5, activation = 'relu', input_shape=(nb_sequence_length, nb_embedding_dims)),
    Dense(32, activation = 'relu'),
    Dropout(0.2),
    Dense(2, activation = 'softmax')
])
# Labels in this notebook are one-hot encoded (to_categorical), so the loss must be
# 'categorical_crossentropy'; the sparse variant expects integer class ids.
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics = ['accuracy'])

In [None]:
# model = Sequential([
#     Conv1D(128, kernel_size = 3, padding = 'valid', input_shape=(nb_sequence_length, nb_embedding_dims), activation = 'relu'),
#     MaxPooling1D(5),
#     Flatten(),
#     Dense(64, activation = 'relu'),
#     Dropout(0.2),
#     Dense(2, activation = 'softmax')
# ])
# model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics = ['accuracy'])

In [55]:
# Two-input classifier:
#   * fastText input  -> three parallel Conv1D branches (one per kernel width),
#                        each followed by LeakyReLU, global max pooling and dropout;
#   * second embedding input -> LSTM followed by LeakyReLU;
#   * all branch outputs are concatenated and fed to a dense softmax head.
# Kernel widths of the convolutional branches.
filter_sizes = (3, 4, 5)
model_input_ft = Input(shape = (nb_sequence_length, nb_embedding_dims))
# model_layers = Dropout(0.1)(model_input)
conv_blocks = []
for sz in filter_sizes:
    conv = Conv1D(
        filters = 200,
        kernel_size = sz,
        padding = 'valid',
        strides = 1
    )(model_input_ft)
    conv = LeakyReLU()(conv)
    # Keep only the strongest response of each filter over the whole sequence.
    conv = GlobalMaxPooling1D()(conv)
    conv = Dropout(0.8)(conv)
    # conv = Flatten()(conv)
    conv_blocks.append(conv)
model_input_lg = Input(shape = (nb_sequence_length, nb_embedding2_dims))
# Variation 1 (active): plain LSTM over the second embedding input.
lstm_block = LSTM(100)(model_input_lg) # , dropout = 0.5, recurrent_dropout = 0.5
lstm_block = LeakyReLU()(lstm_block)
# Variation 2: bidirectional LSTM.
# lstm_block = Bidirectional(LSTM(100))(model_input_lg)
# lstm_block = LeakyReLU()(lstm_block)
# Variation 3: attention block in front of the LSTM.
# NOTE(review): `model_input` is not defined in this cell — it would need to be one of the two inputs above.
# lstm_block = attention_3d_block(model_input)
# lstm_block = LSTM(100)(lstm_block)
# lstm_block = LeakyReLU()(lstm_block)
# Variation 4: sequence-returning LSTM followed by an attention decoder.
# lstm_block = LSTM(100, return_sequences = True)(model_input_ft)
# lstm_block = LeakyReLU()(lstm_block)
# lstm_block = AttentionDecoder(100, nb_sequence_length)(lstm_block)
# lstm_block = Flatten()(lstm_block)
# Merge the LSTM summary with the three pooled convolution outputs.
model_concatenated = concatenate([lstm_block, conv_blocks[0], conv_blocks[1], conv_blocks[2]])
# model_concatenated = Dropout(0.8)(model_concatenated)
model_concatenated = Dense(64)(model_concatenated)
model_concatenated = LeakyReLU()(model_concatenated)
model_output = Dense(n_labels, activation = "softmax")(model_concatenated)
model = Model([model_input_ft, model_input_lg], model_output)
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics = ['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 75, 300)      0                                            
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 73, 200)      180200      input_7[0][0]                    
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 72, 200)      240200      input_7[0][0]                    
__________________________________________________________________________________________________
conv1d_12 (Conv1D)              (None, 71, 200)      300200      input_7[0][0]                    
__________________________________________________________________________________________________
input_8 (I

In [None]:
# word embeddings
words_input = Input(shape=(None,300), dtype='float32',name='words_input')
# character embeddings
character_input=Input(shape=(None,52,),name='char_input')
embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
dropout= Dropout(0.5, name='dropout1')(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1, name='conv'))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(52), name='maxpool')(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)
# concatenation
output = concatenate([words_input, char])
output = Bidirectional(LSTM(200, return_sequences=False, dropout=0.50, recurrent_dropout=0.5))(output)
output = Dense(n_labels, activation='softmax')(output)
model_charcnn = Model(inputs=[words_input, character_input], outputs=[output])
model_charcnn.compile(loss='categorical_crossentropy', optimizer='nadam', metrics = ['accuracy'])
model_charcnn.summary()

In [56]:
# Training schedule: one epoch is a single pass over the training sentences.
epochs = 50
batch_size = 32
samples_per_epoch = len(train_sentences)
steps_per_epoch = math.ceil(samples_per_epoch / batch_size)

# Store weights only, and only when the monitored validation accuracy improves.
checkpoint = ModelCheckpoint(
    filepath = 'best_classification_model.h5',
    monitor = 'val_acc',
    save_best_only = True,
    save_weights_only = True,
    verbose = 1,
)

In [None]:
# Pull a single batch from the char-CNN generator and inspect its character input.
a, b = next(charcnn_sequential_generator('../OffLang/sample_train.txt', 32))
# a = [word batch, char batch]; the char batch is indexed as
# (sample in batch, token slot, character index slot).
char_batch = a[1]
for view in (char_batch, char_batch[0], char_batch[0][0]):
    print(view.shape)
print(char_batch[0][0])

In [57]:
# Train the two-input model; the whole dev set is served as one validation batch.
dev_batch_size = len(dev_sentences)
train_batches = sequential_generator('../OffLang/sample_train.txt', batch_size)
dev_batches = sequential_generator('../OffLang/sample_dev.txt', dev_batch_size)
model.fit_generator(
    train_batches,
    steps_per_epoch = steps_per_epoch,
    epochs = epochs,
    validation_data = dev_batches,
    validation_steps = math.ceil(len(dev_sentences) / dev_batch_size),
    callbacks = [checkpoint],
)
model.reset_states()

Epoch 1/50
Epoch 00001: val_acc improved from -inf to 0.69802, saving model to best_classification_model.h5
Epoch 2/50
Epoch 00002: val_acc improved from 0.69802 to 0.72649, saving model to best_classification_model.h5
Epoch 3/50
Epoch 00003: val_acc improved from 0.72649 to 0.75000, saving model to best_classification_model.h5
Epoch 4/50
Epoch 00004: val_acc improved from 0.75000 to 0.76238, saving model to best_classification_model.h5
Epoch 5/50
Epoch 00005: val_acc improved from 0.76238 to 0.76609, saving model to best_classification_model.h5
Epoch 6/50
Epoch 00006: val_acc improved from 0.76609 to 0.78094, saving model to best_classification_model.h5
Epoch 7/50
Epoch 00007: val_acc did not improve
Epoch 8/50
Epoch 00008: val_acc did not improve
Epoch 9/50
Epoch 00009: val_acc improved from 0.78094 to 0.78465, saving model to best_classification_model.h5
Epoch 10/50
Epoch 00010: val_acc did not improve
Epoch 11/50
Epoch 00011: val_acc did not improve
Epoch 12/50
Epoch 00012: val_acc

Epoch 32/50
Epoch 00032: val_acc did not improve
Epoch 33/50
Epoch 00033: val_acc did not improve
Epoch 34/50
Epoch 00034: val_acc did not improve
Epoch 35/50
Epoch 00035: val_acc did not improve
Epoch 36/50
Epoch 00036: val_acc did not improve
Epoch 37/50
Epoch 00037: val_acc did not improve
Epoch 38/50
Epoch 00038: val_acc did not improve
Epoch 39/50
Epoch 00039: val_acc did not improve
Epoch 40/50
Epoch 00040: val_acc did not improve
Epoch 41/50
Epoch 00041: val_acc did not improve
Epoch 42/50
Epoch 00042: val_acc did not improve
Epoch 43/50
Epoch 00043: val_acc did not improve
Epoch 44/50
Epoch 00044: val_acc did not improve
Epoch 45/50
Epoch 00045: val_acc did not improve
Epoch 46/50
Epoch 00046: val_acc did not improve
Epoch 47/50
Epoch 00047: val_acc did not improve
Epoch 48/50
Epoch 00048: val_acc did not improve
Epoch 49/50
Epoch 00049: val_acc did not improve
Epoch 50/50
Epoch 00050: val_acc did not improve


In [None]:
# Train the char-CNN model; the whole dev set is served as one validation batch.
dev_batch_size = len(dev_sentences)
# Use a fresh checkpoint callback for this model. A ModelCheckpoint remembers the
# best score it has seen, so reusing the instance from the previous training run
# would only save char-CNN weights once they beat the OTHER model's best val_acc.
# The file could then still hold weights of a different architecture.
checkpoint_charcnn = ModelCheckpoint('best_classification_model.h5',
                                     monitor='val_acc',
                                     verbose = 1,
                                     save_best_only = True,
                                     save_weights_only = True)
model_charcnn.fit_generator(
    charcnn_sequential_generator('../OffLang/sample_train.txt', batch_size), 
    steps_per_epoch=steps_per_epoch, epochs=epochs,
    validation_data = charcnn_sequential_generator('../OffLang/sample_dev.txt', dev_batch_size),
    validation_steps = math.ceil(len(dev_sentences) / dev_batch_size),
    callbacks = [checkpoint_charcnn]
)
model_charcnn.reset_states()

In [None]:
# Restore the best checkpointed weights and predict on the whole dev set.
model_charcnn.load_weights('best_classification_model.h5')
model_charcnn.compile(loss='categorical_crossentropy', optimizer='nadam', metrics = ['accuracy'])

# e1 = word-vector features, e2 = character-index features (52 slots per token).
n_dev = len(dev_sentences)
testset_features_e1 = np.zeros((n_dev, nb_sequence_length, nb_embedding_dims))
testset_features_e2 = np.zeros((n_dev, nb_sequence_length, 52))
for row, sentence in enumerate(dev_sentences):
    word_part, char_part = charcnn_process_features(sentence, nb_sequence_length, nb_embedding_dims)
    testset_features_e1[row] = word_part
    testset_features_e2[row] = char_part

results = model_charcnn.predict([testset_features_e1, testset_features_e2])

In [None]:
# Score the predictions on the dev set, treating class 1 as the positive class.
idx2Label = {0 : "OTHER", 1 : "OFFENSIVE"}
predLabels = results.argmax(axis=-1)
devLabels = [int(row[1] != "OTHER") for row in dev_lines]

binary_kwargs = dict(average='binary', pos_label=1)
p = precision_score(devLabels, predLabels, **binary_kwargs)
r = recall_score(devLabels, predLabels, **binary_kwargs)
f1 = f1_score(devLabels, predLabels, **binary_kwargs)
a = accuracy_score(devLabels, predLabels)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f, Acc: %.3f" % (p, r, f1, a))

In [None]:
# FP / NP