In [38]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from tqdm.auto import tqdm
import spacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from keras.preprocessing.text import Tokenizer
from numpy import array
from numpy import asarray
from numpy import zeros
nlp = spacy.load("en_core_web_sm")

In [39]:
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification

In [40]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch import LongTensor


In [41]:
from __future__ import print_function
import torch

In [42]:
directory = 'data/protechn_corpus_eval/train'
testDirectory = 'data/protechn_corpus_eval/test'
devDirectory = 'data/protechn_corpus_eval/dev'

In [43]:
import glob
import os, sys

In [44]:
torch.cuda.get_device_name(0)

'GeForce GTX 1650 Ti'

In [45]:
from nltk.corpus.reader import wordnet as wordnet

In [46]:
from transformers import AutoTokenizer
import nlpaug.augmenter.word as naw



In [47]:
import re
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem.porter import PorterStemmer
import string

def preprocess_text(text, remove_stop = True, stem_words = False, remove_mentions_hashtags = True):
  """
  eg:
  input: preprocess_text("@water #dream hi hello where are you going be there tomorrow happening happen happens",  
  stem_words = True) 
  output: ['tomorrow', 'happen', 'go', 'hello']
  """
  

  if remove_mentions_hashtags:
      text = re.sub(r"@(\w+)", " ", text)
      text = re.sub(r"#(\w+)", " ", text)
      
  text = re.sub(r"[^\x00-\x7F]+", " ", text)
  regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
  nopunct = regex.sub(" ", text.lower())
  words = (''.join(nopunct)).split()
  
  if(remove_stop):
      words = [w for w in words if w not in STOP_WORDS]
      words = [w for w in words if len(w) > 2]  # remove a,an,of etc.
      
  if(stem_words):
      stemmer = PorterStemmer()
      words = [stemmer.stem(w) for w in words]
      
  return list(words)

New preprocessor

In [48]:
import pathlib
from pathlib import Path
import numpy as np

def read_data(directory):
    ids = []
    texts = []
    labels = []
    for f in directory.glob('*.txt'):
        id = f.name.replace('article', '').replace('.txt','')
        ids.append(id)
        texts.append(f.read_text(encoding='utf8'))
        labels.append(parse_label(f.as_posix().replace('.txt', '.labels.tsv')))
    # labels can be empty 
    return ids, texts, labels

def parse_label(label_path):
    labels = []
    f= Path(label_path)
    
    if not f.exists():
        return labels

    for line in open(label_path):
        parts = line.strip().split('\t')
        # print("parts", parts, int(parts[2]), int(parts[3]))
        labels.append([int(parts[2]), int(parts[3]), parts[1], 0, 0])
    labels = sorted(labels) 

    if labels:
        length = max([label[1] for label in labels]) 
        visit = np.zeros(length)
        res = []
        for label in labels:
            if sum(visit[label[0]:label[1]]):
                label[3] = 1
            else:
               visit[label[0]:label[1]] = 1
            res.append(label)
        return res 
    else:
        return labels

def clean_text(articles, ids):
    texts = []
    for article, id in zip(articles, ids):
        sentences = article.split('\n')
        start = 0
        end = -1
        res = []
        for sentence in sentences:
           start = end + 1
           end = start + len(sentence)  # length of sequence 
           if sentence != "": # if not empty line
               res.append([id, sentence, start, end])
        texts.append(res)
    return texts


In [49]:
context_aug = naw.ContextualWordEmbsAug(device="cuda")
syn_aug = naw.SynonymAug()

In [50]:
syn_aug.augment("This is a big hat satan")

'This is a big chapeau satan'

In [51]:
def is_rare_class(label): 
  return label == "Bandwagon" or label == "Obfuscation,Intentional_Vagueness,Confusion" or label == "Straw_Men" or  label == "Whataboutism" or label == "Red_Herring" 

In [52]:
def make_dev_dataset(directory):
  # This is mostly from the original preprocess.py file in their original code
  # However, I removed some redundant data and do preprocessing
    ids, orgtexts, labels = read_data(directory)
    texts = clean_text(orgtexts, ids)
    res = []
    prev_sentence = ""
    for text, label, oText in zip(texts, labels, orgtexts):
        # making positive examples
        tmp = [] 
        pos_ind = [0] * len(text)
        for l in label:
            for i, sen in enumerate(text):
              # Several sentence
                if l[0] >= sen[2] and l[0] < sen[3] and l[1] > sen[3]:
                    l[4] = 1
                    sentence = sen[1]
                    pro_sentence = " ".join(preprocess_text(sentence))
                    tmp.append([sen[0],pro_sentence]  + [l[2]])
                    pos_ind[i] = 1

                    # data augmentation
#                     print(sentence)
#                     fake_text = " ".join(preprocess_text(context_aug.augment(sentence)))
#                     print(fake_text)
#                     tmp.append(["aug"+sen[0], fake_text] + [l[2]])  

                elif l[0] != l[1] and l[0] >= sen[2] and l[0] < sen[3] and l[1] <= sen[3]:
                    sentence = (sen + l)[1]
                    # if the sentence is the same add the             
                    if sentence == prev_sentence: 
                        phrase = (oText[(sen + l)[4]:(sen + l)[5]])
                        tmp.append([(sen + l)[0], phrase , (sen + l)[6]])
                    else:    
                    # if the same add normally
                        pro_sentence = " ".join(preprocess_text((sen + l)[1]))
                        tmp.append([(sen + l)[0], pro_sentence , (sen + l)[6]])
                    pos_ind[i] = 1
#                     phrase = (oText[(sen + l)[4]:(sen + l)[5]])
#                     print("LABEL", (sen + l)[6])
#                     print("SENTENCE",pro_sentence)
#                     print("PRASE", phrase)
                    prev_sentence = sentence
#                     print("prep ", prev_sentence)
                    # data augmentation 
                    # This should be leave out for now as it's too slow
#                     syn_fake_text = " ".join(preprocess_text(syn_aug.augment(sentence)))
#                     tmp.append(["aug"+(sen + l)[0], syn_fake_text , (sen + l)[6]])  
  
                    if is_rare_class((sen + l)[6]):
                      # print((sen + l))
                      phrase = " ".join(preprocess_text(oText[(sen + l)[4]:(sen + l)[5]]))
                      tmp.append(["aug"+(sen + l)[0], phrase , (sen + l)[6]]) 
                      fake_text = " ".join(preprocess_text(context_aug.augment(sentence)))
                      tmp.append(["aug"+(sen + l)[0], fake_text , (sen + l)[6]])  
        # making negative examples
        for k, sen in enumerate(text):
            if pos_ind[k] != 1 and k % 3 == 0:
                tmp.append([(sen + l)[0], " ".join(preprocess_text((sen + l)[1])), "O"])     
        res.extend(tmp)         
    return res

In [53]:

def make_train_dataset(directory):
  # This is mostly from the original preprocess.py file in their original code
  # However, I removed some redundant data and do preprocessing
    ids, orgtexts, labels = read_data(directory)
    texts = clean_text(orgtexts, ids)
    res = []
    prev_sentence = ""
    prev_major_sentence = ""
    for text, label, oText in zip(texts, labels, orgtexts):
        # making positive examples
        tmp = [] 
        pos_ind = [0] * len(text)
        for l in label:
            for i, sen in enumerate(text):
                if l[0] >= sen[2] and l[0] < sen[3] and l[1] > sen[3]:
                    l[4] = 1
                    sentence = sen[1]
                    pro_sentence = " ".join(preprocess_text(sentence))
                    tmp.append([sen[0],pro_sentence]  + [l[2]])
                    pos_ind[i] = 1
                    l[0] = sen[3] + 1

                elif l[0] != l[1] and l[0] >= sen[2] and l[0] < sen[3] and l[1] <= sen[3]:
                    sentence = (sen + l)[1]
                    pro_sentence = " ".join(preprocess_text((sen + l)[1]))
                    tmp.append([(sen + l)[0], pro_sentence , (sen + l)[6]])
                    pos_ind[i] = 1
                    fake_text = " ".join(preprocess_text(context_aug.augment(sentence)))
                    tmp.append(["aug"+(sen + l)[0], fake_text , (sen + l)[6]])  
                    fake_text_add = pro_sentence + " " +prev_major_sentence
                    tmp.append(["add"+(sen + l)[0], fake_text_add , (sen + l)[6]]) 
                    if is_rare_class((sen + l)[6]):
                        fake_text = " ".join(preprocess_text(syn_aug.augment(sentence)))
                        tmp.append(["aug"+(sen + l)[0], fake_text , (sen + l)[6]])  
        # making negative examples
        for k, sen in enumerate(text):
            if pos_ind[k] != 1:
                prev_major_sentence = " ".join(preprocess_text((sen + l)[1]))
                tmp.append([(sen + l)[0], " ".join(preprocess_text((sen + l)[1])), "O"])     
        res.extend(tmp)         
    return res

In [54]:
def make_test_dataset(directory):
    # This is mostly from the original preprocess.py file in their original code
  # However, I removed some redundant data and do preprocessing
    ids, orgtexts, labels = read_data(directory)
    texts = clean_text(orgtexts, ids)
    res = []
    prev_sentence = ""
   
    for text, label, oText in zip(texts, labels, orgtexts):
        # making positive examples
        tmp = [] 
        pos_ind = [0] * len(text)
        for l in label:
            for i, sen in enumerate(text):
                if l[0] >= sen[2] and l[0] < sen[3] and l[1] > sen[3]:
                    l[4] = 1
                    sentence = sen[1]
                    pro_sentence = " ".join(preprocess_text(sentence))
                    tmp.append([sen[0],pro_sentence]  + [l[2]])
                    pos_ind[i] = 1
                    l[0] = sen[3] + 1

                elif l[0] != l[1] and l[0] >= sen[2] and l[0] < sen[3] and l[1] <= sen[3]:
                    sentence = (sen + l)[1]
                    pro_sentence = " ".join(preprocess_text((sen + l)[1]))
                    tmp.append([(sen + l)[0], pro_sentence , (sen + l)[6]])
                    pos_ind[i] = 1
                    
        # making negative examples
        for k, sen in enumerate(text):
            if pos_ind[k] != 1:
                tmp.append([(sen + l)[0], " ".join(preprocess_text((sen + l)[1])), "O"])      
        res.extend(tmp) 
    return res


In [55]:
label2idx = {'Name_Calling,Labeling': 0,
 'Doubt': 1,
 'Whataboutism': 2,
 'Loaded_Language': 3,
 'Straw_Men': 4,
 'Causal_Oversimplification': 5,
 'Exaggeration,Minimisation': 6,
 'Repetition': 7,
 'Red_Herring': 8,
 'Bandwagon': 9,
 'Black-and-White_Fallacy': 10,
 'Slogans': 11,
 'O': 12,
 'Appeal_to_Authority': 13,
 'Obfuscation,Intentional_Vagueness,Confusion': 14,
 'Appeal_to_fear-prejudice': 15,
 'Thought-terminating_Cliches': 16,
 'Reductio_ad_hitlerum': 17,
 'Flag-Waving': 18}
label2idx

{'Name_Calling,Labeling': 0,
 'Doubt': 1,
 'Whataboutism': 2,
 'Loaded_Language': 3,
 'Straw_Men': 4,
 'Causal_Oversimplification': 5,
 'Exaggeration,Minimisation': 6,
 'Repetition': 7,
 'Red_Herring': 8,
 'Bandwagon': 9,
 'Black-and-White_Fallacy': 10,
 'Slogans': 11,
 'O': 12,
 'Appeal_to_Authority': 13,
 'Obfuscation,Intentional_Vagueness,Confusion': 14,
 'Appeal_to_fear-prejudice': 15,
 'Thought-terminating_Cliches': 16,
 'Reductio_ad_hitlerum': 17,
 'Flag-Waving': 18}

In [56]:
labelSize = np.zeros(19)

In [57]:
# Combine to multi-labels and 
def make_multilabels_dataset(data):
    index = -1
    collection = []
    prev_sentence = ""
    for ID, sentence, label in data:
        label = label2idx[label]
        labelSize[label]+=1
        if sentence == prev_sentence:
            if label not in collection[index][2]:
                collection[index][2].append(label)
            else:
                continue
        else:
            collection.append([ID, sentence, [label]])
            index+=1
        prev_sentence = sentence

    return collection

In [58]:
dev_directory = pathlib.Path(devDirectory)

dev_data= make_dev_dataset(dev_directory)
dev_df = pd.DataFrame(dev_data, columns=["Id", "Sentence", "Label"])
X_dev = dev_df[["Sentence"]]
Y_dev = dev_df[["Label"]]


In [60]:
train_directory = pathlib.Path(directory)
org_data = make_test_dataset(train_directory)
train_data = make_multilabels_dataset(org_data)
train_df = pd.DataFrame(train_data, columns=["Id", "Sentence", "Label"])

train_df2= train_df[train_df['Sentence'].str.len() > 0]

X_train = train_df2[["Sentence"]]
Y_train = train_df2[["Label"]]
Y_train_org = pd.DataFrame(org_data, columns=["Id", "Sentence", "Label"])[["Label"]]

In [None]:
# train_df2

In [61]:
train_df2

Unnamed: 0,Id,Sentence,Label
0,111111112,pamela geller robert spencer founded anti musl...,[11]
1,111111112,added condemn behaviours views run counter sha...,[10]
2,111111112,geller atlas shrugs blog spencer jihad watch f...,[11]
3,111111112,blogs pair called bans entering striking blow ...,[3]
4,111111112,stage inflammatory speakers promote hate,[18]
...,...,...,...
14200,999001621,moon alabama fundraiser week,[12]
14201,999001621,pays write blog posts,[12]
14202,999001621,appreciated consider donation,[12]
14203,999001621,posted november permalink,[12]


In [64]:
Y_train_org.groupby(["Label"]).size()

Label
Appeal_to_Authority                              129
Appeal_to_fear-prejudice                         225
Bandwagon                                         13
Black-and-White_Fallacy                          123
Causal_Oversimplification                        199
Doubt                                            527
Exaggeration,Minimisation                        401
Flag-Waving                                      216
Loaded_Language                                 1832
Name_Calling,Labeling                            934
O                                              10335
Obfuscation,Intentional_Vagueness,Confusion       11
Red_Herring                                       26
Reductio_ad_hitlerum                              48
Repetition                                       465
Slogans                                          123
Straw_Men                                         12
Thought-terminating_Cliches                       72
Whataboutism                            

In [62]:
test_directory = pathlib.Path(testDirectory)

test_data= make_multilabels_dataset(make_test_dataset(test_directory))
test_df = pd.DataFrame(test_data, columns=["Id", "Sentence", "Label"])
test_df2 = test_df[(test_df["Sentence"].str.len() > 0)]
X_test = test_df2[["Sentence"]]
Y_test = test_df2[["Label"]]

In [63]:
test_df2.head()

Unnamed: 0,Id,Sentence,Label
0,111111111,geneva world health organisation chief wednesd...,[1]
1,111111111,transmission pronounced stronger director gene...,[13]
2,111111111,tedros voiced alarm plague madagascar behaved ...,[7]
3,111111111,pointed presence pneumonic version spreads eas...,[15]
4,111111111,praised rapid response madagascar authorities ...,[15]


In [948]:
# convert text to vector
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(X_train.Sentence.values)

# X_train_tok1 = tokenizer.texts_to_sequences(X_train.Sentence.values)
# X_test_tok1 = tokenizer.texts_to_sequences(X_test.Sentence.values)

In [72]:
from keras.preprocessing.sequence import pad_sequences

# vocab_size = len(tokenizer.word_index) + 1
# maxlen = int(max([len(i) for i in X_train_tok1]))
# # maxlen = 14

# X_train_tok1 = pad_sequences(X_train_tok1, padding='post', maxlen=maxlen)
# X_test_tok1 = pad_sequences(X_test_tok1, padding='post', maxlen=maxlen)

In [73]:
for x in X_test.Sentence.values:
    if x == "":
        print("ahoy", x)

In [74]:
from transformers import BertTokenizer, BertModel
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [75]:

# Tokenize our sentence with the BERT tokenizer.
# tokenized_text = 
# X_train_tok = 
# (X_train.Sentence.values)
X_train_tok = list(filter(lambda x: len(x), [bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.tokenize(x)) for x in X_train.Sentence.values]))
X_test_tok = list(filter(lambda x: len(x), [bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.tokenize(x)) for x in X_test.Sentence.values]))


In [76]:
seq_lengths = LongTensor(list([len(i) for i in X_train_tok]))
test_seq_lengths = LongTensor(list([len(i) for i in X_test_tok]))

In [77]:
# from keras.preprocessing.sequence import pad_sequences

# vocab_size = len(tokenizer.word_index) + 1
maxlen = 14
# int(max([len(i) for i in X_train_tok]))


vocab_size = 30000
X_train_tok = pad_sequences(X_train_tok, padding='post', maxlen=maxlen)
X_test_tok = pad_sequences(X_test_tok, padding='post', maxlen=maxlen)

maxlen

14

In [78]:
seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
test_seq_lengths, test_perm_idx = test_seq_lengths.sort(0, descending=True)

In [79]:
X_train_tok = X_train_tok[perm_idx]
X_test_tok = X_test_tok[test_perm_idx]

In [80]:
X_train_tok[1]

array([10696,  2094,  4490, 17266, 15928, 13910,  6313,  7401,  7327,
        7507, 15061,  8656, 16873,  4490])

In [81]:
# y_size = Y_train.groupby(["Label"]).size()
# nSamples = [0]*19
# for key in label2idx:
#     nSamples[label2idx[key]] = y_size[key]
# nSamples

In [82]:
# sumSamples = sum(nSamples)



In [83]:
# weights = [1 / (x / sumSamples) for x in nSamples]
# weights

In [84]:
# weights = torch.FloatTensor(weights).cuda()
# weights = torch.FloatTensor(weights)

In [85]:
np.array(list(label2idx.items()))

array([['Name_Calling,Labeling', '0'],
       ['Doubt', '1'],
       ['Whataboutism', '2'],
       ['Loaded_Language', '3'],
       ['Straw_Men', '4'],
       ['Causal_Oversimplification', '5'],
       ['Exaggeration,Minimisation', '6'],
       ['Repetition', '7'],
       ['Red_Herring', '8'],
       ['Bandwagon', '9'],
       ['Black-and-White_Fallacy', '10'],
       ['Slogans', '11'],
       ['O', '12'],
       ['Appeal_to_Authority', '13'],
       ['Obfuscation,Intentional_Vagueness,Confusion', '14'],
       ['Appeal_to_fear-prejudice', '15'],
       ['Thought-terminating_Cliches', '16'],
       ['Reductio_ad_hitlerum', '17'],
       ['Flag-Waving', '18']], dtype='<U43')

In [86]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_train_bi = mlb.fit_transform(Y_train["Label"])
Y_train_df = pd.DataFrame(mlb.fit_transform(Y_train["Label"]),columns=mlb.classes_)
Y_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [91]:
Y_train_df[Y_train_df.sum(axis=1)>1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
28,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
31,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
69,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
70,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13896,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13899,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
13900,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
13901,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
np.sum()

In [88]:
Y_test_df = pd.DataFrame(mlb.fit_transform(Y_test["Label"]),columns=mlb.classes_)

In [89]:
Y_test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [967]:
# 0th output
y0_train = Y_train_df[[0]].values
y0_test =  Y_test_df[[0]].values

# First output
y1_train = Y_train_df[[1]].values
y1_test =  Y_test_df[[1]].values

# Second output
y2_train = Y_train_df[[2]].values
y2_test =  Y_test_df[[2]].values

# Third output
y3_train = Y_train_df[[3]].values
y3_test =  Y_test_df[[3]].values

# Fourth output
y4_train = Y_train_df[[4]].values
y4_test =  Y_test_df[[4]].values

# Fifth output
y5_train = Y_train_df[[5]].values
y5_test =  Y_test_df[[5]].values

# Sixth output
y6_train = Y_train_df[[6]].values
y6_test =  Y_test_df[[6]].values

y7_train = Y_train_df[[7]].values
y7_test =  Y_test_df[[7]].values

y8_train = Y_train_df[[8]].values
y8_test =  Y_test_df[[8]].values

y9_train = Y_train_df[[9]].values
y9_test =  Y_test_df[[9]].values

y10_train = Y_train_df[[10]].values
y10_test =  Y_test_df[[10]].values

y11_train = Y_train_df[[11]].values
y11_test =  Y_test_df[[11]].values

y12_train = Y_train_df[[12]].values
y12_test =  Y_test_df[[12]].values

y13_train = Y_train_df[[13]].values
y13_test =  Y_test_df[[13]].values

y14_train = Y_train_df[[14]].values
y14_test =  Y_test_df[[14]].values

y15_train = Y_train_df[[15]].values
y15_test =  Y_test_df[[15]].values

y16_train = Y_train_df[[16]].values
y16_test =  Y_test_df[[16]].values

y17_train = Y_train_df[[17]].values
y17_test =  Y_test_df[[17]].values

y18_train = Y_train_df[[18]].values
y18_test =  Y_test_df[[18]].values


In [968]:
train_output = [y0_train, y1_train, y2_train, y3_train, y4_train, y5_train, y6_train, y7_train,
               y8_train, y9_train, y10_train, y11_train, y12_train, y13_train, y14_train, 
               y15_train, y16_train, y17_train, y18_train]

# train_output = [y0_train]

In [969]:
len(X_test_tok)

3938

In [970]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, Bidirectional
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt

In [971]:
import keras
import tensorflow as tf


config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 16 } ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

In [972]:
vocab_size

30000

In [973]:
input_1 = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100,  trainable=True)(input_1)
LSTM_Layer1 = LSTM(128, dropout=0.3, recurrent_dropout=0.3)(embedding_layer)

output0 = Dense(1, activation='sigmoid')(LSTM_Layer1)

prop_not_prop_model = Model(inputs=input_1, outputs=[output0])
prop_not_prop_model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(lr=0.001),
    metrics=['mse']
)

In [974]:
X_train_tok 

array([[ 3819,  2819,  2512, ...,  6633, 24759,  2906],
       [ 6313,  7401,  7327, ..., 11060,  2204,  3114],
       [ 6313,  7401,  7327, ..., 11060,  2204,  3114],
       ...,
       [ 7864,     0,     0, ...,     0,     0,     0],
       [ 2933,     0,     0, ...,     0,     0,     0],
       [15931,     0,     0, ...,     0,     0,     0]], dtype=int32)

In [975]:
(y12_train[10000:].flatten() == np.ones(y12_train[10000:].flatten().shape)).astype(np.float).mean()

0.38884842063439506

In [976]:
# history = prop_not_prop_model.fit(x=X_train_tok[:10000], y=[y12_train[:10000]], batch_size=64, epochs=4, verbose=1)

In [977]:
prop_not_prop_model.predict(x=X_train_tok[10000:])

array([[0.49665564],
       [0.4994976 ],
       [0.50039816],
       ...,
       [0.49226165],
       [0.49221337],
       [0.49222732]], dtype=float32)

In [978]:
score = prop_not_prop_model.evaluate(x=X_train_tok[10000:], y=[y12_train[10000:]], verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.6913000707373415
Test Accuracy: 0.249076502742408


In [979]:
from keras.layers import Input, BatchNormalization, Dense
from keras.models import Model

# model1:
input1 = Input((10,))
bn1 = BatchNormalization()(input1)
out1 = Dense(2, activation='softmax')(bn1)
model1 = Model(input1, out1)

In [980]:
# bn = BatchNormalization()(input_1)
# Embedding(vocab_size, 100,  trainable=True)(bn)
sum(labelSize)

30117.0

In [981]:
total = sum(labelSize)
weights = [1 / (x / total) for x in labelSize]
weights

[10.062479117941864,
 21.666906474820145,
 125.4875,
 5.196169772256729,
 684.4772727272727,
 55.979553903345725,
 23.67688679245283,
 19.305769230769233,
 327.3586956521739,
 684.4772727272727,
 99.3960396039604,
 78.63446475195822,
 2.2581540076478968,
 88.31964809384164,
 684.4772727272727,
 43.08583690987124,
 138.15137614678898,
 233.46511627906978,
 43.39625360230547]

In [982]:
from tensorflow.keras import layers
from tensorflow.keras.metrics import Precision

from tensorflow.keras.layers import BatchNormalization


In [1014]:
input_1 = Input(shape=(maxlen,))
# bn = BatchNormalization()(input_1)

embedding_layer = Embedding(vocab_size, 100,  trainable=True)(input_1)
LSTM_Layer1 = Bidirectional(LSTM(256))(embedding_layer)
# LSTM_Layer1 = LSTM(128)(embedding_layer)
# LSTM_Layer1 = Bidirectional(LSTM_Layer1)

output0 = Dense(1, activation='sigmoid')(LSTM_Layer1)

output1 = Dense(1, activation='sigmoid')(LSTM_Layer1)
# output1 = BatchNormalization()(output1)
output2 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output3 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output4 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output5 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output6 = Dense(1, activation='sigmoid')(LSTM_Layer1)

output7 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output8 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output9 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output10 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output11 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output12 = Dense(1, activation='sigmoid')(LSTM_Layer1)

output13 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output14 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output15 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output16 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output17 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output18 = Dense(1, activation='sigmoid')(LSTM_Layer1)

model = Model(inputs=input_1, outputs=[output0, output1, output2, output3, output4, output5, output6, output7, output8, output9, output10, output11, output12, output13, output14, output15, output16, output17, output18])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
#     loss_weights=weights
)

In [1015]:
model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_70 (InputLayer)           (None, 14)           0                                            
__________________________________________________________________________________________________
embedding_62 (Embedding)        (None, 14, 100)      3000000     input_70[0][0]                   
__________________________________________________________________________________________________
bidirectional_50 (Bidirectional (None, 512)          731136      embedding_62[0][0]               
__________________________________________________________________________________________________
dense_901 (Dense)               (None, 1)            513         bidirectional_50[0][0]           
__________________________________________________________________________________________________
dense_902 

In [None]:
history = model.fit(x=X_train_tok, y=[y0_train, y1_train, y2_train, y3_train, y4_train, y5_train, y6_train, y7_train, y8_train, y9_train, y10_train, y11_train, y12_train, y13_train, y14_train, y15_train, y16_train, y17_train, y18_train], batch_size=64, epochs=10, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [None]:
score = model.evaluate(x=X_test_tok, y=[y0_test, y1_test, y2_test, y3_test, y4_test, y5_test, y6_test, y7_test, y8_test, y9_test, y10_test, y11_test, y12_test, y13_test, y14_test, y15_test, y16_test, y17_test, y18_test], verbose=1)

print("test loss, test precision:", score)

In [None]:
test_output = [y0_test, y1_test, y2_test, y3_test, y4_test, y5_test, y6_test, y7_test, y8_test, y9_test, y10_test, y11_test, y12_test, y13_test, y14_test, y15_test, y16_test, y17_test, y18_test]

In [None]:
predictions = model.predict(X_test_tok)



In [None]:
predictions_transposed = np.stack(predictions).transpose(1, 0, 2)
thresholded = np.round(predictions_transposed.reshape(predictions_transposed.shape[0], predictions_transposed.shape[1]))

In [None]:
v = Y_test_df

In [None]:
predictions[12]

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(v, thresholded, average="weighted")

In [None]:
print('fscore: {}'.format(fscore))
# The support is the number of occurrences of each class in y_true.
precision

In [None]:
rounded_predictions = np.round(predictions)

In [None]:
precision_arr = []
recall_arr = []
fscore_arr = []
for i, predict in enumerate(rounded_predictions):
    precision, recall, fscore, support = score(test_output[i], predict)
    precision_arr.append(precision)
    recall_arr.append(recall)
    fscore_arr.append(fscore)

In [None]:
fscore_arr

In [1013]:
rounded_predictions[0]

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [998]:

# ADD Rhetorical_feature
"""
Parses the argument lexicon by Somasundaran, Ruppenhofer & Wiebe (2007):
http://people.cs.pitt.edu/~wiebe/pubs/papers/sigdial07.pdf
http://mpqa.cs.pitt.edu/lexicons/arg_lexicon/
Span identification task: Encodes whether a given token is contained in a
phrase that matches a rhetorical pattern.
Technique classification: Encodes whether a given fragment contains such a
rhetorically salient phrase.
"""
import re


path = '../data/arglex/'
strategies_5 = ['authority', 'doubt', 'emphasis', 'generalization', 'priority']
strategies_full = ['assessments', 'authority', 'causation', 'conditionals',
                   'contrast', 'difficulty', 'doubt', 'emphasis',
                   'generalization', 'inconsistency',
                   'inyourshoes', 'necessity', 'possibility', 'priority',
                   'rhetoricalquestion', 'structure', 'wants']
macros = ['modals', 'spoken', 'wordclasses', 'pronoun', 'intensifiers']
# macro -> list of expansions
expansions = dict()
# strategy -> list of regexes
regexes = dict()


def init(strategies, verbose=False):
    for macro in macros:
        with open(path + macro + '.tff') as f:
            for line in f:
                if line.startswith('#'):
                    # comment
                    continue
                line = line.strip()
                if len(line) == 0:
                    continue
                fields = line.split('=')
                macro_word = fields[0]
                expansion_list = fields[1][1:-1]  # Strip away { and }
                # The lists use both ', ' and ',' as separators.
                expansion_list = expansion_list.replace(', ', ',')
                expansion_list = expansion_list.split(',')
                expansion_list = '(' + '|'.join(expansion_list) + ')'
                expansions[macro_word] = expansion_list

    if verbose:
        print('Macros and their expansions:')
        for m in expansions:
            print(m, expansions[m])
        print()

    for strategy in strategies:
        regexes[strategy] = []
        with open(path + strategy + '.tff') as f:
            for line in f:
                if line.startswith('#'):
                    # comment
                    continue
                line = line.strip()
                if len(line) == 0:
                    continue
                for macro in expansions:
                    line = line.replace('(' + macro + ' )?',
                                        '(' + expansions[macro] + ' )?')
                    line = line.replace('(' + macro + ')', expansions[macro])
                line = line.replace('\\', '')
                regexes[strategy] += ['\\b' + line + '\\b']

    if verbose:
        print('Regexes for rhetorical strategies:')
        for s in regexes:
            print(s, regexes[s])
        print()



def find_rhetorical_strategies(token_list, strategy):
    sentence = ' '.join(token_list)
    token_indices = set()
    if strategy is 'any':
        strats = [s for s in regexes]
    else:
        strats = [strategy]
    for strategy in strats:
        for regex in regexes[strategy]:
            for match in re.finditer(regex, sentence):
                # print(strategy.upper(), '--', match.group(),
                #       '--', match.span(), '--', regex)
                start_idx = match.span()[0]
                end_idx = match.span()[1]
                # idx in the token list
                token_indices.add(sentence[:start_idx].count(' '))
                token_indices.add(sentence[:end_idx].count(' '))
    return token_indices


def parse_input_file_si(infile, outfile, full=True, indiv_cols=False):
    """
    full: if True, use all strategies, if false, use the 5 most important
           strategies. Used for generating the preprocessing description.
           Should match the initialization
    indiv_cols: if True, each rhetorical strategy is represented by its own
                column. If False, matches for any strategy are represented
                in a single joint feature column.
    """
    with open(infile, encoding='utf8') as f_in:
        lines = f_in.readlines()
        lines.append('eof\teof\teof\teof\teof\teof\n')

    if indiv_cols:
        strategies = [s for s in regexes]
    else:
        strategies = ['any']

    with open(outfile, 'w', encoding='utf8') as f_out:
        rows = []
        tokens = []
        prev_article = ''
        first_line = True
        for line in lines:

            # Comments + header
            if line.startswith('#'):
                f_out.write(line)
                continue
            if first_line:
                f_out.write('# rhetorical_features: ArguingLexicon (')
                if full:
                    f_out.write('full, ')
                else:
                    f_out.write('5 main strategies, ')
                if indiv_cols:
                    f_out.write('individual feature columns')
                else:
                    f_out.write('joint feature column')
                f_out.write(')\n')
                first_line = False
                labels = line.strip().split('\t')
                try:
                    doc_idx = labels.index('document_id')
                except ValueError:
                    doc_idx = 0
                try:
                    word_idx = labels.index('token')
                except ValueError:
                    word_idx = 4
                for strategy in strategies:
                    labels.append('arglex_' + strategy)
                f_out.write('\t'.join(labels) + '\n')
                continue

            line = line[:-1]  # Remove \n
            fields = line.split('\t')
            article = fields[doc_idx]
            word = fields[word_idx]

            if article != prev_article:
                for strategy in strategies:
                    indices = find_rhetorical_strategies(tokens, strategy)
                    rows_new = []
                    for i, row in enumerate(rows):
                        if i in indices:
                            rows_new.append(row + '\t1')
                        else:
                            rows_new.append(row + '\t0')
                    rows = rows_new
                for row in rows:
                    f_out.write(row + '\n')
                tokens = []
                rows = []
            tokens.append(word)
            rows.append(line)
            prev_article = article


def annotate_tc(in_file):
    strategies = [s for s in regexes]
    strategies.sort()
    matched_strategies = {}

    with open(in_file, encoding='utf8') as f:
        lines = f.readlines()

    with open(in_file, 'w', encoding='utf8') as f:
        f.write(lines[0].strip() + '\t' + '\t'.join(strategies) + '\n')
        for line in lines[1:]:
            f.write(line.strip())
            text = line.split('\t')[4].strip().lower()
            for strategy in strategies:
                matched = 0
                for regex in regexes[strategy]:
                    for match in re.finditer(regex, text):
                        matched = 1
                        break
                f.write('\t' + str(matched))
                if matched:
                    try:
                        matched_strategies[strategy] += 1
                    except KeyError:
                        matched_strategies[strategy] = 1
            f.write('\n')
    for matched_strat in matched_strategies:
        strategies.remove(matched_strat)
    matched_strategies = sorted(matched_strategies.items(),
                                key=lambda s: s[1], reverse=True)
    for s in matched_strategies:
        print(s)
    print("Strategies without occurrences:", strategies)
    print()


if __name__ == "__main__":
    ### Task 1: Span identification
    init(strategies_full)
    parse_input_file_si('../data/train-improved-sentiwordnet.tsv',
                        '../data/train-improved-sentiwordnet-arguingfullindiv.tsv')
    parse_input_file_si('../data/dev-improved-sentiwordnet.tsv',
                        '../data/dev-improved-sentiwordnet-arguingfullindiv.tsv')
    parse_input_file_si('../data/test-improved-sentiwordnet.tsv',
                        '../data/test-improved-sentiwordnet-arguingfullindiv.tsv')

    ### Task 2: Technique identification
    init(strategies_full)
    # These strategies don't actually appear in the training data:
    regexes.pop('inyourshoes')
    regexes.pop('doubt')
    # These strategies barely appear in the training data (<10 occurrences):
    regexes.pop('difficulty')
    regexes.pop('conditionals')
    regexes.pop('assessments')
    regexes.pop('rhetoricalquestion')
    annotate_tc('../data/tc-train.tsv')
    annotate_tc('../data/tc-dev.tsv')
    annotate_tc('../data/tc-test.tsv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/arglex/modals.tff'

In [None]:
# ADD SENTIMENT FEATURES
"""
Sentiment features for the span identification task.
Encodes how positive and how negative a given token is.
Based on SentiWordNet by Esuli & Sebastiani (2006):
https://github.com/aesuli/sentiwordnet
https://github.com/aesuli/SentiWordNet/blob/master/papers/LREC06.pdf
Also creates sentiment features we briefly used in preliminary experiments:
- Span identification: single feature encoding how positive or negative a given
  token is (based on SentiWords by Gatti, Guerini & Turchi, 2016,
  https://hlt-nlp.fbk.eu/technologies/sentiwords)
- Technique classification: two features encoding the SentiWordNet scores of
  the most positive and negative tokens in a given text fragment.
"""
from spacy.lang.en import English


SENTIWORDS = '../data/sentiment/SentiWords_1.1.txt'
SENTIWORDNET = '../data/sentiment/SentiWordNet_3.0.0.txt'


def parse_sentiwordnet(lexicon_file):
    """
    Creates a dict(str -> (float, float)) from words to positive and negative
    scores. If a word contains several entries, the scores are averaged.
    """
    lex = dict()
    with open(lexicon_file, encoding='utf8') as f:
        for line in f:
            if line.startswith('#'):
                # comment
                continue
            fields = line.strip().split('\t')
            if len(fields) < 6:
                # last line
                continue
            # postag    id  score_pos   score_neg   word#sense word2#sense  def
            pos = float(fields[2])
            neg = float(fields[3])
            for word in fields[4].split():
                word = word.split('#')[0]
                try:
                    prev_pos, prev_neg, count = lex[word]
                    lex[word] = (prev_pos + pos, prev_neg + neg, count + 1)
                except KeyError:
                    lex[word] = (pos, neg, 1)

    for word in lex:
        pos, neg, count = lex[word]
        lex[word] = (pos / count, neg / count)

    return lex


def parse_sentiwords(lexicon_file):
    lex = dict()
    prev_word = ''
    score = 0
    n_entries = 0

    lines = []
    with open(lexicon_file, encoding='utf8') as f:
        lines = f.readlines()
        lines += ['end-of-file\t0']

    for line in lines:
        if line.startswith('#'):
            # comment
            continue
        fields = line.split('\t')
        # word#pos    value
        word = fields[0].split('#')[0]
        value = float(fields[1])
        if word == prev_word:
            score += value
            n_entries += 1
        else:
            if n_entries > 0:
                lex[word] = score / n_entries
            n_entries = 0
            score = 0
        prev_word = word

    return lex


def annotate_tokens(lex, infile, outfile, nlp, sentiwordnet):
    with open(infile, encoding='utf8') as f_in:
        with open(outfile, 'w', encoding='utf8') as f_out:
            first_line = True
            for line in f_in:

                # Comments + header
                if line.startswith('#'):
                    f_out.write(line)
                    continue
                if first_line:
                    f_out.write('# sentiment_lexicon=')
                    if sentiwordnet:
                        f_out.write('SentiWordNet')
                    else:
                        f_out.write('SentiWords')
                    f_out.write('\n')
                    first_line = False
                    labels = line.strip().split('\t')
                    try:
                        word_idx = labels.index('token')
                    except ValueError:
                        word_idx = 4
                    if sentiwordnet:
                        labels.append('positive')
                        labels.append('negative')
                    else:
                        labels.append('sentiment')
                    f_out.write('\t'.join(labels) + '\n')
                    continue

                line = line[:-1]  # Remove \n
                word = line.split('\t')[word_idx].lower()
                if sentiwordnet:
                    try:
                        value = lex[word]
                    except KeyError:
                        # Try looking up a lemmatized version
                        value = lex.get(nlp(word)[0].lemma_, (0.0, 0.0))
                    f_out.write(line + '\t' + str(value[0]) +
                                '\t' + str(value[1]) + '\n')
                else:
                    # SentiWords
                    try:
                        value = lex[word]
                    except KeyError:
                        # Try looking up a lemmatized version
                        value = lex.get(nlp(word)[0].lemma_, 0.0)
                    f_out.write(line + '\t' + str(value) + '\n')


def annotate_sequences(lex, in_file, nlp):
    with open(in_file, encoding='utf8') as f:
        lines = f.readlines()

    with open(in_file, 'w', encoding='utf8') as f:
        f.write(lines[0].strip() + '\thighest_pos\thighest_neg\n')
        for line in lines[1:]:
            text = line.split('\t')[4]
            highest_pos, highest_neg = 0.0, 0.0
            for word in text.strip().split():
                word_clean = ''
                if word.isalpha():
                    word_clean = word.lower()
                else:
                    for c in word.lower():
                        if c.isalpha():
                            word_clean += c
                if not word_clean:
                    continue
                try:
                    pos, neg = lex[word_clean]
                except KeyError:
                    # Try looking up a lemmatized version
                    pos, neg = lex.get(nlp(word_clean)[0].lemma_, (0.0, 0.0))
                if pos > highest_pos:
                    highest_pos = pos
                if neg > highest_neg:
                    highest_neg = neg
            f.write(line.strip() + '\t' + str(highest_pos) + '\t' +
                    str(highest_neg) + '\n')


if __name__ == '__main__':
    ### Task 1: Span identification
    lex = parse_sentiwordnet(SENTIWORDNET)
    nlp = English()
    annotate_tokens(lex, '../data/train-improved.tsv',
                    '../data/train-improved-sentiwordnet.tsv', nlp, True)
    annotate_tokens(lex, '../data/dev-improved.tsv',
                    '../data/dev-improved-sentiwordnet.tsv', nlp, True)
    annotate_tokens(lex, '../data/test-improved.tsv',
                    '../data/test-improved-sentiwordnet.tsv', nlp, True)

