In [1]:
import fastText
import math
import linecache
import numpy as np 
from numpy import random
from random import sample
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import *
from keras import *
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import re
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

# from attention_utils import get_activations, get_data_recurrent


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Order CUDA devices by PCI bus id and expose only device 1 to this kernel.
# NOTE(review): these are set after keras/TensorFlow were imported in the first cell;
# confirm they take effect before the backend first initialises the GPU.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [3]:
# Load the fastText binary model from a hard-coded absolute path.
# NOTE(review): the path is machine-specific; consider a configurable data directory.
ft = fastText.load_model("/home/jindal/notebooks/fastText/wiki.de.bin")

# Width of one word vector, as reported by the loaded model.
nb_embedding_dims = ft.get_dimension()
# Number of token slots per input; process_features truncates or left-pads to this length.
nb_sequence_length = 75

In [4]:
def twitter_tokenizer(textline):
    """Normalise a tweet and split it into a list of tokens.

    Normalisation steps, applied in order:
      * every ``http...`` run of non-whitespace becomes ``URL``
      * every ``@name`` becomes ``USER_MENTION``
      * the literal marker ``|LBR|`` is removed
      * runs of 3+ dots, 2+ ``!`` and 2+ ``?`` collapse to ``...``, ``!!`` and ``??``

    Tokens are then: a single character from the supplementary Unicode planes
    (U+10000 and above, which is where most emoji live), a run of word
    characters / hyphens, or a run of any other non-space characters.

    Parameters
    ----------
    textline : str
        Raw tweet text.

    Returns
    -------
    list of str
        Non-empty, whitespace-stripped tokens in order of appearance.
    """
    # The result must be assigned back to `textline`; writing it to a differently
    # cased name silently dropped the URL replacement.
    # NOTE(review): weights trained on text tokenised without URL replacement may
    # have seen raw URL fragments instead of the `URL` token - confirm before
    # comparing scores across runs.
    textline = re.sub(r'http\S+', 'URL', textline)
    textline = re.sub(r'@[\w_]+', 'USER_MENTION', textline)
    textline = re.sub(r'\|LBR\|', '', textline)
    textline = re.sub(r'\.\.\.+', '...', textline)
    textline = re.sub(r'!!+', '!!', textline)
    textline = re.sub(r'\?\?+', '??', textline)
    # Raw string: `re` itself resolves the \U escapes, and \w no longer relies on
    # Python passing an unknown string escape through unchanged.
    token_pattern = re.compile(
        r'[\U00010000-\U0010ffff]|[\w-]+|[^ \w\U00010000-\U0010ffff]+',
        re.UNICODE,
    )
    words = token_pattern.findall(textline.strip())
    # The last alternative can capture tabs/newlines; drop tokens that are only whitespace.
    return [w.strip() for w in words if w.strip() != '']

In [5]:
# Cache of word -> vector, keyed by the token exactly as it appears in the text.
word_vectors_ft = {}
def process_features(textline, nb_sequence_length, nb_embedding_dims, tokenize=True):
    """Turn one line of text into a fixed-size matrix of word vectors.

    The text is split into tokens, truncated to the first ``nb_sequence_length``
    tokens, and right-aligned in the output: leading rows stay zero and the
    tokens fill the trailing rows.

    Parameters
    ----------
    textline : str
        Text to encode.
    nb_sequence_length : int
        Number of rows in the output.
    nb_embedding_dims : int
        Number of columns in the output; must match the vector width.
    tokenize : bool
        If true use ``twitter_tokenizer``, otherwise split on whitespace.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nb_sequence_length, nb_embedding_dims)``.
    """
    words = twitter_tokenizer(textline) if tokenize else textline.split()
    words = words[:nb_sequence_length]

    features_ft = np.zeros((nb_sequence_length, nb_embedding_dims))
    # Left-pad: the first token lands on the first row after the zero padding.
    first_row = nb_sequence_length - len(words)
    for row, w in enumerate(words, start=first_row):
        wv = word_vectors_ft.get(w)
        if wv is None:
            # The model is queried with the lower-cased token; the cache keeps
            # the original spelling as its key.
            wv = ft.get_word_vector(w.lower())
            word_vectors_ft[w] = wv
        features_ft[row] = wv
    return features_ft

In [6]:
def sequential_generator(filename,
                         batch_size,
                         labels2Idx:'dict to make output labels',
                         check:'to check if all lines in file are of same length.To check enter the len of line after splitting it by tabs' = None,
                         tokenize:'specify if using twitter tokenzor to preprocess lines'=False,
                        ):
    """Endlessly yield ``([features], labels)`` batches read from a TSV file.

    Each line is split on tabs; column 0 is the text and column 1 the label.
    When the end of the file is reached, reading restarts from the beginning.

    Parameters
    ----------
    filename : str
        Path of the tab-separated input file.
    batch_size : int
        Number of examples per yielded batch.
    labels2Idx : dict
        Label -> class index. Its size sets the width of the one-hot labels.
    check : int or None
        If given, lines that do not split into exactly this many columns are
        skipped and the batch slot is filled from the next usable line.
    tokenize : bool
        Passed through to ``process_features``.

    Yields
    ------
    tuple
        ``([features], labels)`` where ``features`` has shape
        ``(batch_size, nb_sequence_length, nb_embedding_dims)`` and ``labels``
        has shape ``(batch_size, len(labels2Idx))``.

    Raises
    ------
    ValueError
        If a full pass over the file yields no usable line.
    """
    n_labels = len(labels2Idx)
    # `with` releases the handle when the generator is closed or collected.
    with open(filename) as f:
        saw_usable_line = False
        while True:
            batch_features_ft = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
            batch_labels = np.zeros((batch_size, n_labels))
            # An explicit counter, because decrementing a `for ... in range` index
            # does not retry the slot: skipped lines used to leave all-zero rows.
            filled = 0
            while filled < batch_size:
                line = f.readline()
                if line == "":
                    # End of file. Without this guard an empty or all-malformed
                    # file would loop forever.
                    if not saw_usable_line:
                        raise ValueError("no usable lines found in %s" % filename)
                    f.seek(0)
                    saw_usable_line = False
                    continue
                data = line.strip().split('\t')
                if check and len(data) != check:
                    continue
                saw_usable_line = True
                batch_features_ft[filled] = process_features(data[0], nb_sequence_length, nb_embedding_dims, tokenize= tokenize)
                if n_labels == 2:
                    # NOTE(review): the two-class case hard-codes OTHER -> 0 and
                    # ignores labels2Idx; confirm that matches the mapping used
                    # when scoring.
                    batch_labels[filled] = to_categorical(0 if data[1] == 'OTHER' else 1, n_labels)
                else:
                    batch_labels[filled] = to_categorical(labels2Idx[data[1]], n_labels)
                filled += 1
            yield ([batch_features_ft], batch_labels)

In [7]:
def train_dev_sentences(filetrain,filedev, check:'to check if lines of file are all same lenght after separating by tab'):
    """Load train and dev TSV files and build a shared label index.

    Each line is stripped and split on tabs; only lines with exactly ``check``
    columns are kept. Column 0 is the sentence and column 1 the label. Labels
    are numbered in order of first appearance, train file first, then dev file.

    Parameters
    ----------
    filetrain : str
        Path of the training file.
    filedev : str
        Path of the development file.
    check : int
        Required number of tab-separated columns per line.

    Returns
    -------
    tuple
        ``(train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx)``
        where the label lists hold integer indices into ``labels2Idx``.
    """
    def _read_rows(path):
        # Read one file, split each line once, and close the handle afterwards.
        with open(path) as handle:
            rows = (line.strip().split("\t") for line in handle)
            return [row for row in rows if len(row) == check]

    train_lines = _read_rows(filetrain)
    dev_lines = _read_rows(filedev)

    labels2Idx = {}
    for dataset in (train_lines, dev_lines):
        for row in dataset:
            # The index is the current dict size, so a new label gets the next free id.
            labels2Idx.setdefault(row[1], len(labels2Idx))

    train_sentences = [row[0] for row in train_lines]
    train_labels = [labels2Idx[row[1]] for row in train_lines]
    dev_sentences = [row[0] for row in dev_lines]
    dev_labels = [labels2Idx[row[1]] for row in dev_lines]
    return (train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx)


In [8]:
def compile_model(no_labels:'total labels for classification'):
    """Build and compile the BiLSTM + multi-width CNN classifier.

    The input is a pre-embedded sequence of shape
    ``(nb_sequence_length, nb_embedding_dims)`` (both read from module scope).
    It passes through a bidirectional LSTM that returns the full sequence,
    then three parallel Conv1D branches (kernel sizes 3, 4, 5), each followed
    by LeakyReLU, global max pooling and dropout. The branches are
    concatenated and fed through a 100-unit dense layer and a softmax output.

    Parameters
    ----------
    no_labels : int
        Number of units in the softmax output layer.

    Returns
    -------
    Model
        The model, compiled with categorical cross-entropy loss, the 'nadam'
        optimizer and accuracy as metric.
    """
    # NOTE(review): layer creation order presumably matters for load_weights on
    # the saved HDF5 file - keep the statement order when editing.
    model_input_embedding = Input(shape = (nb_sequence_length, nb_embedding_dims))
    # return_sequences=True keeps one output per time step so Conv1D can slide over it.
    lstm_block = Bidirectional(LSTM(100, dropout = 0.5, return_sequences=True))(model_input_embedding)
    lstm_block = LeakyReLU()(lstm_block)

    # One convolutional branch per kernel width, all reading the same LSTM output.
    filter_sizes = (3, 4, 5)
    conv_blocks = []
    for sz in filter_sizes:
        conv = Conv1D(
            filters = 200,
            kernel_size = sz,
            padding = 'valid',
            strides = 1
        )(lstm_block)
        conv = LeakyReLU()(conv)
        # Reduce over the time axis: one value per filter.
        conv = GlobalMaxPooling1D()(conv)
        conv = Dropout(0.5)(conv)
        conv_blocks.append(conv)
    model_concatenated = concatenate([conv_blocks[0], conv_blocks[1], conv_blocks[2]])
    # model_concatenated = Dropout(0.8)(model_concatenated)
    model_concatenated = Dense(100)(model_concatenated)
    model_concatenated = LeakyReLU()(model_concatenated)
    model_output = Dense(no_labels, activation = "softmax")(model_concatenated)
    new_model = Model(model_input_embedding, model_output)
    new_model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics = ['accuracy'])
#     new_model.summary()
    return new_model

In [9]:
# Build the network with a two-unit softmax output.
# NOTE(review): 2 is hard-coded; confirm it equals len(labels2Idx) for the data loaded below.
model = compile_model(2)

In [10]:
# Load weights from an HDF5 file addressed relative to the current working directory.
model.load_weights('classification_model_tl_twitterclasses7m_1000_suf_td.h5')

In [11]:
# Hard-coded absolute paths of the tab-separated train and dev files.
train_file = '/home/gwiedemann/notebooks/OffLang/sample_train.txt'
dev_file = '/home/gwiedemann/notebooks/OffLang/sample_dev.txt'

# Keep only lines with exactly 3 tab-separated columns; labels come back as integer indices.
train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx= train_dev_sentences(train_file, dev_file, 3)


In [12]:
# Show one raw dev sentence as a sanity check on the loaded data.
# NOTE(review): the saved output contains real Twitter handles; consider clearing it before sharing.
print(dev_sentences[1])

@LinnKuppitz @AlternativeNRW Seit wir Minister mit Migationshintergrund haben, so wie diese Integrationslotte Aydan Özoguz deren Brüder Islamisten sind!


In [14]:
# Encode every dev sentence into a (sequence, embedding) matrix, using the
# default tokenizer setting of process_features.
testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
for row, sentence in enumerate(dev_sentences):
    testset_features[row] = process_features(sentence, nb_sequence_length, nb_embedding_dims)

# One row of class probabilities per dev sentence; print the row width.
results = model.predict(testset_features)
print(len(results[0]))

# Predicted class = column with the highest probability.
predLabels = np.argmax(results, axis=-1)
devLabels = dev_labels

# Macro-averaged F1 over all classes.
f1 = f1_score(devLabels, predLabels, average='macro', pos_label=1)
print(f1)


2
0.8153886772399017


In [18]:
# Compare the predicted class probabilities of the first dev example with its gold label index.
print(results[0])
print(devLabels[0])

[0.03674595 0.9632541 ]
1


In [16]:
# Reverse lookup: class index -> label string.
idx2Label = dict((index, label) for label, index in labels2Idx.items())

In [17]:
# Display the index -> label mapping to confirm which column of the model output is which class.
print(idx2Label)

{0: 'OTHER', 1: 'OFFENSE'}


In [28]:
# Write one tab-separated row per dev example:
# tweet, predicted label, gold label, "OFFENSE:<p> OTHER:<p>".
with open('tl_twitterclasses1000_confidence_scores_task1.txt','w') as f:
    for sentence, gold_idx, scores in zip(dev_sentences, devLabels, results):
        gold_name = idx2Label[gold_idx]
        predicted_name = idx2Label[scores.argmax(axis=-1)]
        confidences = (
            'OFFENSE:' + str(scores[labels2Idx['OFFENSE']])
            + ' OTHER:' + str(scores[labels2Idx['OTHER']])
        )
        f.write('\t'.join([sentence, predicted_name, gold_name, confidences]) + '\n')


In [None]:
# Peek at the first ten predicted class indices.
print(predLabels[:10])


In [None]:
# Export predictions and gold labels as two parallel TSV files, one row per dev tweet.
pred_path = '/home/jindal/notebooks/jindal/NER/language_model/predlabels_perl_twitterclasses7m_1000_suf_td.txt'
gold_path = '/home/jindal/notebooks/jindal/NER/language_model/goldlabels_perl_twitterclasses7m_1000_suf_td.txt'

# Open each file once and in write mode: append mode duplicated every row whenever
# the cell was re-run, and reopening both files per tweet was needless work.
with open(pred_path, 'w') as pred_file, open(gold_path, 'w') as gold_file:
    for tweet, gold_idx, pred_idx in zip(dev_sentences, dev_labels, predLabels):
        correct_label = idx2Label[gold_idx]
        pred_label = idx2Label[pred_idx]
        # The label is deliberately written twice so every row has three columns.
        # NOTE(review): presumably required by the downstream scorer - confirm.
        pred_file.write(str(tweet)+'\t'+str(pred_label)+'\t'+str(pred_label)+'\n')
        gold_file.write(str(tweet)+'\t'+str(correct_label)+'\t'+str(correct_label)+'\n')

In [None]:
# Merge the prediction and gold files into one TSV: tweet, predicted label, gold label.
pred_path = '/home/jindal/notebooks/jindal/NER/language_model/predlabels_perl_twitterclasses7m_1000_suf_td.txt'
gold_path = '/home/jindal/notebooks/jindal/NER/language_model/goldlabels_perl_twitterclasses7m_1000_suf_td.txt'
analysis_path = '/home/jindal/notebooks/jindal/NER/language_model/error_analysis_task1_1000classestwiiter.txt'

# Read both inputs first and close them; the output is only created once the
# inputs are known to be readable.
with open(pred_path,'r') as pred_file, open(gold_path,'r') as gold_file:
    pred_rows = pred_file.readlines()
    gold_rows = gold_file.readlines()

# Rows are paired by position, so differing lengths mean the files are misaligned.
if len(pred_rows) != len(gold_rows):
    raise ValueError(
        "prediction and gold files differ in length: %d vs %d"
        % (len(pred_rows), len(gold_rows))
    )

with open(analysis_path,'w') as f:
    for pred_row, gold_row in zip(pred_rows, gold_rows):
        # Each row must have exactly three tab-separated fields; the third repeats the label.
        tweet, pred, pred_repeat = pred_row.split('\t')
        tweet, corr, corr_repeat = gold_row.split('\t')
        f.write(tweet + '\t'+pred+'\t'+corr+'\n')