In [262]:
import fastText
import math
import linecache
import numpy as np 
from numpy import random
from random import sample
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import *
from keras import *
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import re
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

# from attention_utils import get_activations, get_data_recurrent


In [263]:
# Set the CUDA environment variables for this kernel: devices are ordered by
# PCI bus id and only device 0 is made visible.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [264]:
# Load the fastText model from a hard-coded local path; `ft` is read as a
# global by process_features.
ft = fastText.load_model("/home/jindal/notebooks/fastText/wiki.de.bin")

# Width of a single word vector, taken from the loaded model.
nb_embedding_dims = ft.get_dimension()
# Number of token slots per text: process_features truncates longer texts and
# left-pads shorter ones with zero vectors.
nb_sequence_length = 75

In [265]:
def twitter_tokenizer(textline):
    """Normalise a social-media text line and split it into tokens.

    Normalisation steps, applied in order:
      * every ``http...`` run of non-space characters becomes ``URL``
      * every ``@name`` becomes ``USER_MENTION``
      * the literal marker ``|LBR|`` is removed
      * runs of three or more dots collapse to ``...``
      * runs of two or more ``!`` / ``?`` collapse to ``!!`` / ``??``

    Tokens are then: a single character from the astral Unicode planes
    (U+10000 and above), a run of word characters / hyphens, or a run of any
    other non-space characters.

    Parameters
    ----------
    textline : str
        Raw text of one example.

    Returns
    -------
    list of str
        The non-empty, whitespace-stripped tokens in order of appearance.
    """
    # The original assigned this result to a differently-cased name
    # (`textLine`), so the URL replacement was silently thrown away.
    textline = re.sub(r'http\S+', 'URL', textline)
    textline = re.sub(r'@[\w_]+', 'USER_MENTION', textline)
    textline = re.sub(r'\|LBR\|', '', textline)
    textline = re.sub(r'\.\.\.+', '...', textline)
    textline = re.sub(r'!!+', '!!', textline)
    textline = re.sub(r'\?\?+', '??', textline)
    # Raw string: the `re` module itself resolves the \UXXXXXXXX escapes.
    token_pattern = re.compile(
        r'[\U00010000-\U0010ffff]|[\w-]+|[^ \w\U00010000-\U0010ffff]+',
        re.UNICODE,
    )
    words = token_pattern.findall(textline.strip())
    # The "other characters" alternative can match tabs/newlines; drop tokens
    # that are pure whitespace and trim the rest.
    return [w.strip() for w in words if w.strip() != '']

In [266]:
# Cache of already looked-up vectors, keyed by the token exactly as it appears
# in the text (the lookup itself uses the lower-cased token).
word_vectors_ft = {}
def process_features(textline, nb_sequence_length, nb_embedding_dims):
    """Turn one text into a fixed-size matrix of fastText word vectors.

    The text is tokenised with ``twitter_tokenizer``; at most
    ``nb_sequence_length`` leading tokens are kept. Their vectors fill the
    *last* rows of the result, so shorter texts are left-padded with zero rows.

    Returns a float array of shape ``(nb_sequence_length, nb_embedding_dims)``.
    Reads the notebook-level ``ft`` model and updates ``word_vectors_ft``.
    """
    tokens = twitter_tokenizer(textline)[:nb_sequence_length]
    embedded = np.zeros((nb_sequence_length, nb_embedding_dims))
    first_row = nb_sequence_length - len(tokens)
    for row, token in enumerate(tokens, start=first_row):
        vector = word_vectors_ft.get(token)
        if vector is None:
            vector = ft.get_word_vector(token.lower())
            word_vectors_ft[token] = vector
        embedded[row] = vector
    return embedded

In [267]:
def sequential_generator(filename, batch_size, check=None):
    """Endlessly yield ``([features], labels)`` batches read from a TSV file.

    Each usable line has the text in its first tab-separated field and the
    label in its second; the label ``'OTHER'`` maps to class 0 and anything
    else to class 1. When the end of the file is reached, reading restarts
    from the beginning.

    Parameters
    ----------
    filename : str
        Path of the tab-separated training file.
    batch_size : int
        Number of examples per yielded batch.
    check : int, optional
        If given, only lines with exactly this many tab-separated fields are
        used; all other lines are skipped.

    Yields
    ------
    tuple
        ``([batch_features_ft], batch_labels)`` where ``batch_features_ft`` has
        shape ``(batch_size, nb_sequence_length, nb_embedding_dims)`` and
        ``batch_labels`` has shape ``(batch_size, n_labels)``.

    Raises
    ------
    ValueError
        If a complete pass over the file finds no usable line (otherwise the
        generator would spin forever).

    Notes
    -----
    Reads the notebook-level ``nb_sequence_length``, ``nb_embedding_dims`` and
    ``n_labels``. The original used ``i -= 1; continue`` inside
    ``for i in range(batch_size)``, which does not retry the slot: every
    skipped line left an all-zero example with an all-zero label in the batch.
    """
    # `with` closes the handle when the generator is closed or collected.
    with open(filename) as handle:
        usable_in_pass = 0
        while True:
            batch_features_ft = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
            batch_labels = np.zeros((batch_size, n_labels))
            filled = 0
            while filled < batch_size:
                line = handle.readline()
                if line == "":
                    # End of file: rewind, unless the whole pass was unusable.
                    if usable_in_pass == 0:
                        raise ValueError("no usable lines found in %s" % filename)
                    handle.seek(0)
                    usable_in_pass = 0
                    continue
                data = line.strip().split('\t')
                # A text and a label are always required; `check` optionally
                # pins the exact number of fields.
                if len(data) < 2 or (check and len(data) != check):
                    continue
                usable_in_pass += 1
                batch_features_ft[filled] = process_features(data[0], nb_sequence_length, nb_embedding_dims)
                batch_labels[filled] = to_categorical(0 if data[1] == 'OTHER' else 1, n_labels)
                filled += 1

            yield ([batch_features_ft], batch_labels)

In [268]:
def _read_labelled_lines(path, check):
    """Return the tab-split fields of every line in ``path`` that has exactly ``check`` fields."""
    # `with` guarantees the file is closed; the original left both handles open.
    with open(path) as handle:
        rows = (line.strip().split("\t") for line in handle)
        return [fields for fields in rows if len(fields) == check]


def train_dev_sentences(filetrain, filedev, check):
    """Load texts and binary labels from a train file and a dev file.

    Both files are tab-separated with the text in the first field and the
    label in the second. Lines whose number of tab-separated fields differs
    from ``check`` are ignored.

    Parameters
    ----------
    filetrain, filedev : str
        Paths of the training and development files.
    check : int
        Required number of tab-separated fields per line (must be at least 2,
        because the label is read from the second field).

    Returns
    -------
    tuple
        ``(train_sentences, train_labels, dev_sentences, dev_labels)``; labels
        are 0 for ``"OTHER"`` and 1 for any other label string.
    """
    train_lines = _read_labelled_lines(filetrain, check)
    dev_lines = _read_labelled_lines(filedev, check)

    train_sentences = [x[0] for x in train_lines]
    train_labels = [0 if x[1] == "OTHER" else 1 for x in train_lines]

    dev_sentences = [x[0] for x in dev_lines]
    dev_labels = [0 if x[1] == "OTHER" else 1 for x in dev_lines]

    return (train_sentences, train_labels, dev_sentences, dev_labels)


In [269]:
# Load the FB train/dev files; check=2 keeps only lines with exactly two
# tab-separated fields (text, label).
train_sentences, train_labels, dev_sentences, dev_labels = train_dev_sentences(filetrain='/home/jindal/notebooks/jindal/NER/language_model/FB_train.csv',
                   filedev='/home/jindal/notebooks/jindal/NER/language_model/FB_dev.csv', check=2)

In [270]:
# Sanity check: number of dev examples that passed the field-count filter.
print(len(dev_sentences))

774


In [271]:
# Peek at the first dev text.
dev_sentences[0]

'Oh, etwa mal wieder Verbindungen nach Deutschland? Salafisten und sonstigem kriminellem und islam-extremistischem Mist, macht man es hier ja auch besonders gemütlich. Dürfte niemanden wirklich wundern.'

In [278]:
# Use dev_labels (returned by train_dev_sentences); `devLabels` is only
# assigned in a later configuration cell, so referencing it here fails on a
# fresh kernel.
print(len(dev_labels))

774


In [272]:
# Number of output classes (OTHER vs. any other label); read as a global by
# sequential_generator and compile_model.
n_labels =2

In [273]:
def compile_model():
    """Build and compile the BiLSTM + multi-width CNN classifier.

    Architecture: input of shape ``(nb_sequence_length, nb_embedding_dims)``
    -> bidirectional LSTM (100 units, dropout 0.5, full sequence output)
    -> LeakyReLU -> three parallel Conv1D branches (200 filters, kernel sizes
    3/4/5), each followed by LeakyReLU, global max pooling and dropout 0.5
    -> concatenation -> Dense(100) + LeakyReLU -> softmax over ``n_labels``.

    Reads the notebook-level ``nb_sequence_length``, ``nb_embedding_dims`` and
    ``n_labels``. Returns the compiled Keras model (categorical cross-entropy,
    Nadam optimizer, accuracy metric).
    """
    def conv_branch(sequence, width):
        # One convolutional branch; layers are created in the same order for
        # every kernel width.
        branch = Conv1D(filters=200, kernel_size=width, padding='valid', strides=1)(sequence)
        branch = LeakyReLU()(branch)
        branch = GlobalMaxPooling1D()(branch)
        return Dropout(0.5)(branch)

    embedded_input = Input(shape=(nb_sequence_length, nb_embedding_dims))
    recurrent = Bidirectional(LSTM(100, dropout=0.5, return_sequences=True))(embedded_input)
    recurrent = LeakyReLU()(recurrent)

    pooled_branches = [conv_branch(recurrent, width) for width in (3, 4, 5)]
    merged = concatenate(pooled_branches)

    hidden = Dense(100)(merged)
    hidden = LeakyReLU()(hidden)
    class_probabilities = Dense(n_labels, activation="softmax")(hidden)

    classifier = Model(embedded_input, class_probabilities)
    classifier.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return classifier

In [274]:
# Peek at the second dev text.
print(dev_sentences[1])

Die sollen sich besser mal um die Antifa & linksextreme Faschisten kümmern anstatt sich mit harmlosen Flyern zu beschäftigen.


In [282]:
def test_model(generator, train_sentences, devLabels, number_of_tests, number_of_epochs, filename_to_log,
               filename_to_save_weigths, batch_size, train_file,
               f1_measure, pos_label,
               load_model_weights=False, model_weights_file=None,
               nb_sequence_length=nb_sequence_length, nb_embedding_dims=nb_embedding_dims,
               check_for_generator=None, eval_sentences=None):
    """Train the classifier several times and log the best dev-set metrics of each run.

    For every run a fresh model is built with ``compile_model`` (optionally
    initialised from ``model_weights_file``), trained one epoch at a time and
    evaluated on the dev set after each epoch. The epoch with the highest F1
    defines the run's reported precision/recall/F1/accuracy. One line per run
    and a final line of averages over all runs are written to
    ``filename_to_log``. The model is saved to ``filename_to_save_weigths``
    whenever an epoch beats the best F1 seen so far across *all* runs.

    Parameters
    ----------
    generator : callable
        Called as ``generator(train_file, batch_size, check_for_generator)``
        once per epoch; must return a batch generator for ``fit_generator``.
    train_sentences : sequence
        Training texts; only their count is used, to derive steps per epoch.
    devLabels : sequence of int
        Gold labels for the evaluation texts.
    number_of_tests : int
        Number of independent training runs.
    number_of_epochs : int
        Epochs per run.
    filename_to_log : str
        Path of the results log (overwritten).
    filename_to_save_weigths : str
        Path the best model is saved to.
    batch_size : int
        Training batch size.
    train_file : str
        Path of the training file passed to ``generator``.
    f1_measure : str
        Value for the ``average`` argument of the sklearn metrics
        (e.g. ``'binary'``, ``'macro'``).
    pos_label : int
        Positive class for the sklearn metrics (relevant for ``'binary'``).
    load_model_weights : bool
        If true and ``model_weights_file`` is given, load those weights into
        each fresh model (transfer learning).
    model_weights_file : str, optional
        Path of the weights to load.
    nb_sequence_length, nb_embedding_dims : int
        Shape of the evaluation features. ``compile_model`` takes its input
        shape from the notebook-level values, so these must agree with them.
    check_for_generator : int, optional
        Forwarded to ``generator`` as its third argument.
    eval_sentences : sequence of str, optional
        Texts to evaluate on; defaults to the notebook-level ``dev_sentences``.

    Notes
    -----
    Fixes relative to the previous version: ``samples_per_epoch`` was never
    defined (now ``len(train_sentences)``); the per-run log line wrote recall
    under the ``f1`` label; the final summary was split across two statements
    so the accuracy part was a separate, failing expression; best metrics were
    never reset between runs, so later runs reported the first run's numbers.
    """
    if eval_sentences is None:
        # Backward-compatible default: the notebook-level dev set.
        eval_sentences = dev_sentences

    steps_per_epoch = math.ceil(len(train_sentences) / batch_size)

    # The evaluation features do not depend on the model, so build them once.
    testset_features = np.zeros((len(eval_sentences), nb_sequence_length, nb_embedding_dims))
    for i, sentence in enumerate(eval_sentences):
        testset_features[i] = process_features(sentence, nb_sequence_length, nb_embedding_dims)

    overall_best_f1 = 0  # drives checkpointing across all runs
    total_f1 = 0
    total_prec = 0
    total_acc = 0
    total_recall = 0

    with open(filename_to_log, "w") as log_file:
        for test_number in range(number_of_tests):
            print("Test %d/%d" %(test_number+1, number_of_tests))
            model = compile_model()

            # transfer learning
            if load_model_weights and model_weights_file:
                model.load_weights(model_weights_file)

            # Best-epoch metrics of this run only.
            max_f1 = 0
            max_p = 0
            max_r = 0
            max_a = 0

            for epoch in range(number_of_epochs):
                print("Epoch: %d" %(epoch+1))
                model.fit_generator(
                    generator(train_file, batch_size, check_for_generator),
                    steps_per_epoch=steps_per_epoch, epochs=1,
                )

                results = model.predict(testset_features)
                predLabels = results.argmax(axis=-1)
                f1 = f1_score(devLabels, predLabels, average=f1_measure, pos_label=pos_label)
                r = recall_score(devLabels, predLabels, average=f1_measure, pos_label=pos_label)
                p = precision_score(devLabels, predLabels, average=f1_measure, pos_label=pos_label)
                a = accuracy_score(devLabels, predLabels)
                if max_f1 < f1:
                    max_f1 = f1
                    max_p = p
                    max_r = r
                    max_a = a
                if overall_best_f1 < f1:
                    print("model saved. F1 is %f" %(f1))
                    model.save(filename_to_save_weigths)
                    overall_best_f1 = f1
                print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f, Acc: %.3f" % (p, r, f1, a))

            to_write = "prec: "+ str(max_p)+" rec: "+str(max_r) +" f1: "+str(max_f1) +" acc: "+str(max_a)+" \n"
            log_file.write(to_write)
            total_f1 += max_f1
            total_prec += max_p
            total_acc += max_a
            total_recall += max_r
            print("*****************************************************************************")

        # Parenthesised so the whole summary is a single expression.
        final_text = ("avg_prec: " +str(total_prec/number_of_tests)+" total_rec: "+str(total_recall/number_of_tests)
                      +" total_f1: "+str(total_f1/number_of_tests) +" total_acc: "
                      +str(total_acc/number_of_tests)+" \n")
        log_file.write(final_text)




In [283]:
# Settings for the pretraining experiment on the FB data.
generator = sequential_generator
# No-op self-assignment; train_sentences comes from train_dev_sentences.
train_sentences = train_sentences
devLabels = dev_labels
number_of_tests = 3
number_of_epochs = 50
fb_pretraining_log = '/home/jindal/notebooks/jindal/NER/language_model/results_pretraining_FB.txt' 
fb_pretraining_save_weigths='/home/jindal/notebooks/jindal/NER/language_model/model_pretrained_FB.h5'
batch_size=32
fb_train_file='/home/jindal/notebooks/jindal/NER/language_model/FB_train.csv'
f1_measure='binary'
# Class 0 is "OTHER" (see train_dev_sentences), so binary metrics here treat
# OTHER as the positive class.
pos_label=0
load_model_weights=False
# model_weights_file is left unset because load_model_weights is False.
# No-op self-assignments; both values are defined right after the fastText model is loaded.
nb_sequence_length = nb_sequence_length
nb_embedding_dims= nb_embedding_dims
# Only lines with exactly two tab-separated fields are used for training.
check_for_generator=2



In [None]:
# Run the pretraining experiment with the settings from the configuration cell.
test_model(generator=generator, train_sentences=train_sentences, devLabels=devLabels, number_of_tests= number_of_tests,
          number_of_epochs=number_of_epochs, filename_to_log=fb_pretraining_log, filename_to_save_weigths=fb_pretraining_save_weigths,
          batch_size=batch_size,train_file=fb_train_file, f1_measure=f1_measure, pos_label=pos_label, load_model_weights=load_model_weights,
          nb_sequence_length=nb_sequence_length, nb_embedding_dims=nb_embedding_dims, check_for_generator= check_for_generator)

Test 1/3
Epoch: 1
Epoch 1/1
model saved. F1 is 0.183333
Test-Data: Prec: 0.647, Rec: 0.107, F1: 0.183, Acc: 0.747
Epoch: 2
Epoch 1/1
model saved. F1 is 0.327526
Test-Data: Prec: 0.580, Rec: 0.228, F1: 0.328, Acc: 0.751
Epoch: 3
Epoch 1/1
model saved. F1 is 0.468571
Test-Data: Prec: 0.569, Rec: 0.398, F1: 0.469, Acc: 0.760
Epoch: 4
Epoch 1/1
Test-Data: Prec: 0.609, Rec: 0.379, F1: 0.467, Acc: 0.770
Epoch: 5
Epoch 1/1
model saved. F1 is 0.509695
Test-Data: Prec: 0.594, Rec: 0.447, F1: 0.510, Acc: 0.771
Epoch: 6
Epoch 1/1
model saved. F1 is 0.525745
Test-Data: Prec: 0.595, Rec: 0.471, F1: 0.526, Acc: 0.774
Epoch: 7
Epoch 1/1
Test-Data: Prec: 0.629, Rec: 0.437, F1: 0.516, Acc: 0.782
Epoch: 8
Epoch 1/1
Test-Data: Prec: 0.584, Rec: 0.471, F1: 0.522, Acc: 0.770
Epoch: 9
Epoch 1/1
model saved. F1 is 0.591346
Test-Data: Prec: 0.586, Rec: 0.597, F1: 0.591, Acc: 0.780
Epoch: 10
Epoch 1/1
model saved. F1 is 0.615034
Test-Data: Prec: 0.579, Rec: 0.655, F1: 0.615, Acc: 0.782
Epoch: 11
Epoch 1/1
Test

Test-Data: Prec: 0.638, Rec: 0.607, F1: 0.622, Acc: 0.804
Epoch: 50
Epoch 1/1
Test-Data: Prec: 0.685, Rec: 0.558, F1: 0.615, Acc: 0.814
*****************************************************************************
Test 2/3
Epoch: 1
Epoch 1/1
Test-Data: Prec: 0.564, Rec: 0.107, F1: 0.180, Acc: 0.740
Epoch: 2
Epoch 1/1
Test-Data: Prec: 0.613, Rec: 0.238, F1: 0.343, Acc: 0.757
Epoch: 3
Epoch 1/1
Test-Data: Prec: 0.650, Rec: 0.325, F1: 0.434, Acc: 0.774
Epoch: 4
Epoch 1/1
Test-Data: Prec: 0.575, Rec: 0.374, F1: 0.453, Acc: 0.760
Epoch: 5
Epoch 1/1
Test-Data: Prec: 0.595, Rec: 0.427, F1: 0.497, Acc: 0.770
Epoch: 6
Epoch 1/1
Test-Data: Prec: 0.586, Rec: 0.461, F1: 0.516, Acc: 0.770
Epoch: 7
Epoch 1/1
Test-Data: Prec: 0.604, Rec: 0.563, F1: 0.583, Acc: 0.786
Epoch: 8
Epoch 1/1
Test-Data: Prec: 0.602, Rec: 0.529, F1: 0.563, Acc: 0.782
Epoch: 9
Epoch 1/1
Test-Data: Prec: 0.566, Rec: 0.519, F1: 0.542, Acc: 0.766
Epoch: 10
Epoch 1/1
Test-Data: Prec: 0.598, Rec: 0.592, F1: 0.595, Acc: 0.786
Epoch:

# TRANSFER LEARNING HERE


In [113]:
# Number of output classes; re-assigns the same value already set earlier in
# the notebook.
n_labels =2

In [244]:
# Replace the train/dev data with the transfer-learning target files; check=3
# keeps only lines with exactly three tab-separated fields (text in the first,
# label in the second).
train_sentences, train_labels, dev_sentences, dev_labels = train_dev_sentences(filetrain='/home/gwiedemann/notebooks/OffLang/sample_train.txt',
                   filedev='/home/gwiedemann/notebooks/OffLang/sample_dev.txt', check=3)

In [245]:
# Settings for the transfer-learning experiment (start from saved weights).
generator = sequential_generator
# No-op self-assignment; train_sentences comes from train_dev_sentences.
train_sentences = train_sentences
devLabels = dev_labels
number_of_tests = 5
number_of_epochs = 50
fb_tl_log = '/home/jindal/notebooks/jindal/NER/language_model/results_transfer_learning_fb.txt' 
fb_tl_save_weigths='/home/jindal/notebooks/jindal/NER/language_model/model_tl_FB.h5'
batch_size=32
fb_tl_train_file='/home/gwiedemann/notebooks/OffLang/sample_train.txt'
f1_measure='binary'
# Class 1 is every label other than "OTHER" (see train_dev_sentences).
pos_label=1
load_model_weights=True
# NOTE(review): the pretraining cell saves to '.../model_pretrained_FB.h5',
# but this loads '.../classification_model_pretrained_FB.h5' - confirm this is
# the intended weights file.
model_weights_file = '/home/jindal/notebooks/jindal/NER/language_model/classification_model_pretrained_FB.h5'
# No-op self-assignments; both values are defined right after the fastText model is loaded.
nb_sequence_length = nb_sequence_length
nb_embedding_dims= nb_embedding_dims
# Only lines with exactly three tab-separated fields are used for training.
check_for_generator=3

In [None]:
# Run the transfer-learning experiment. The log path variable is `fb_tl_log`
# (letter l); the previous `fb_t1_log` (digit 1) was never defined.
test_model(generator=generator, train_sentences=train_sentences, devLabels=devLabels, number_of_tests= number_of_tests,
          number_of_epochs=number_of_epochs, filename_to_log=fb_tl_log, filename_to_save_weigths=fb_tl_save_weigths,
          batch_size=batch_size,train_file=fb_tl_train_file, f1_measure=f1_measure, pos_label=pos_label, load_model_weights=load_model_weights,
          model_weights_file = model_weights_file, nb_sequence_length=nb_sequence_length, nb_embedding_dims=nb_embedding_dims, check_for_generator= check_for_generator)