# Import argumentative features dataset

In [1]:
#Import all needed libraries and packages
import pandas as pd
from keras import backend as K
import numpy as np
from nltk.tokenize import RegexpTokenizer
from tensorflow import keras
import torch
import tensorflow as tf
from sklearn.model_selection import train_test_split
import nltk
from nltk import word_tokenize
from nltk import StanfordTagger
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pickle
import os
from ast import literal_eval

In [2]:
#Importing already filtered out datasets from New York Times
print("NYT publishers and dataset size:")
nyt_edu_original= list(open("/data/output_txt/nyt-edu.txt"))
print("NYT-EDU original features:",len(nyt_edu_original ))
nyt_fin_original= list(open("/data/output_txt/nyt-fin.txt"))
print("NYT-FIN original features:",len(nyt_fin_original))
nyt_law_original= list(open("/data/output_txt/nyt-law.txt"))
print("NYT-LAW original features:",len(nyt_law_original))
nyt_med_original = list(open("/data/output_txt/nyt-med.txt"))
print("NYT-Med original features:",len(nyt_med_original))
nyt_mil_original= list(open("/data/output_txt/nyt-mil.txt"))
print("NYT-MILL original features:",len(nyt_mil_original))
nyt_pol_original = list(open("/data/output_txt/nyt-pol.txt"))
print("NYT-POL original features:",len(nyt_pol_original))

NYT publishers and dataset size:
NYT-EDU original features: 1881
NYT-FIN original features: 3100
NYT-LAW original features: 3553
NYT-Med original features: 1743
NYT-MILL original features: 2132
NYT-POL original features: 6886


In [3]:
def extract_arg_features(dataset):
    arg_features=[]
    arg_labels=[]
    arg_features_dict={}
    for s in dataset:
        text=s.split("\t")[1].split("\n")[0]
        text=text.replace('"',"")
        tokenized=nltk.sent_tokenize(text)
        if len(tokenized)<=100:
            sequence=literal_eval(s.split("\t")[2].split("\n")[0])
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            arg_features.append(sequence)
            arg_labels.append(label)
            arg_features_dict[text]=(sequence,label)
    return arg_features_dict,arg_features,arg_labels

In [4]:
#Argumentative features.3 labels
print("NYT publishers and dataset size of argumentative features with 3 labels:Claim, Premises,None")
nyt_edu_arg = list(open("/data/output_txt/nyt-edu-argfeat.txt"))
edu_arg_3,edu_arg_3_features,edu_arg_3_labels=extract_arg_features(nyt_edu_arg)
print("NYT-EDU argumentative features:",len(edu_arg_3))
nyt_fin_arg = list(open("/data/output_txt/nyt-fin-argfeat.txt"))
fin_arg_3,fin_arg_3_features,fin_arg_3_labels=extract_arg_features(nyt_fin_arg)
print("NYT-FIN argumentative features:",len(fin_arg_3))
nyt_law_arg = list(open("/data/output_txt/nyt-law-argfeat.txt"))
law_arg_3,law_arg_3_features,law_arg_3_labels=extract_arg_features(nyt_law_arg)
print("NYfT-LAW argumentative features:",len(law_arg_3))
nyt_med_arg = list(open("/data/output_txt/nyt-med-argfeat.txt"))
med_arg_3,med_arg_3_features,med_arg_3_labels=extract_arg_features(nyt_med_arg)
print("NYT-Med argumentative features:",len(med_arg_3))
nyt_mil_arg = list(open("/data/output_txt/nyt-mil-argfeat.txt"))
mil_arg_3,mil_arg_3_features,mil_arg_3_labels=extract_arg_features(nyt_mil_arg)
print("NYT-MILL argumentative features:",len(mil_arg_3))
nyt_pol_arg = list(open("/data/output_txt/nyt-pol-argfeat.txt"))
pol_arg_3,pol_arg_3_features,pol_arg_3_labels=extract_arg_features(nyt_pol_arg)
print("NYT-POL argumentative features:",len(pol_arg_3))

NYT publishers and dataset size of argumentative features with 3 labels:Claim, Premises,None
NYT-EDU argumentative features: 1833
NYT-FIN argumentative features: 3061
NYfT-LAW argumentative features: 3520
NYT-Med argumentative features: 1699
NYT-MILL argumentative features: 2113
NYT-POL argumentative features: 6826


In [5]:
#Argumentative features.6 labels
print("NYT publishers and dataset size of argumentative features with 3 labels:'assumption','anecdote','testimony','statistics','other','common-ground'")
nyt_edu_6_arg = list(open("nyt-edu-argfeat-6.txt"))
edu_arg_6,edu_arg_6_features,edu_arg_6_labels=extract_arg_features(nyt_edu_6_arg)
print("NYT-EDU arg features:",len(edu_arg_6))
nyt_fin_6_arg = list(open("/data/output_txt/nyt-fin-argfeat-6.txt"))
fin_arg_6,fin_arg_6_features,fin_arg_6_labels=extract_arg_features(nyt_fin_6_arg)
print("NYT-FIN arg features:",len(fin_arg_6))
nyt_law_6_arg = list(open("/data/output_txt/nyt-law-argfeat-6.txt"))
law_arg_6,law_arg_6_features,law_arg_6_labels=extract_arg_features(nyt_law_6_arg)
print("NYT-LAW arg features:",len(law_arg_6))
nyt_med_6_arg = list(open("/data/output_txt/nyt-med-argfeat-6.txt"))
med_arg_6,med_arg_6_features,med_arg_6_labels=extract_arg_features(nyt_med_6_arg)
print("NYT-Med arg features:",len(med_arg_6))
nyt_mil_6_arg = list(open("/data/output_txt/nyt-mil-argfeat-6.txt"))
mil_arg_6,mil_arg_6_features,mil_arg_6_labels=extract_arg_features(nyt_mil_6_arg)
print("NYT-MILL arg features:",len(mil_arg_6))
nyt_pol_6_arg = list(open("/data/output_txt/nyt-pol-argfeat-6.txt"))
pol_arg_6,pol_arg_6_features,pol_arg_6_labels=extract_arg_features(nyt_pol_6_arg)
print("NYT-POL arg features:",len(pol_arg_6))

NYT publishers and dataset size of argumentative features with 3 labels:'assumption','anecdote','testimony','statistics','other','common-ground'
NYT-EDU arg features: 1831
NYT-FIN arg features: 3059
NYT-LAW arg features: 3520
NYT-Med arg features: 1699
NYT-MILL arg features: 2113
NYT-POL arg features: 6826


# Import BERT embeddings

In [6]:
def get_bert_embeddings(name):
    directory = '/data/BertEmbeddings/'+str(name)
    # Create empty dictionary to save data
    bert_dict= {}
    # Loop over files and read pickles
    for file in os.listdir(directory):
        if file.endswith('.pickle'):
            with open(str("/data/BertEmbeddings/"+str(name)+"/"+str(file)), 'rb') as f:
                bert_f=pickle.load(f)
                article= bert_f['article']
                article=article.replace('"',"")
                tokenized=nltk.sent_tokenize(article)
                if len(tokenized)<=100:
                    label= bert_f['label']
                    if label=='editorial':
                        label=0
                    elif label=='news':
                        label=1
                    bert_emb= bert_f['bert_emb'][0]
                    bert_dict[article] =tf.convert_to_tensor(bert_emb.cpu())
    return bert_dict

In [8]:
#education
edu_bert=get_bert_embeddings("NytEduBert")
print("Length Arg. Feature=Embeddings:","Education",len(edu_arg_3)==len(edu_bert))
#finances
fin_bert=get_bert_embeddings("NytFinBert")
print("Length Arg. Feature=Embeddings:","Finances-",len(fin_arg_3)==len(fin_bert))
#law 
law_bert=get_bert_embeddings("NytLawBert")
print("Length Arg. Feature=Embeddings:","Law-",len(law_arg_3)==len(law_bert))
#medicine 
med_bert=get_bert_embeddings("NytMedBert")
print("Length Arg. Feature=Embeddings:","Medicine-",len(med_arg_3)==len(med_bert))
#military
mil_bert=get_bert_embeddings("NytMillBert")
print("Length Arg. Feature=Embeddings:","Military-",len(mil_arg_3)==len(mil_bert))
#politics
pol_bert=get_bert_embeddings("NytPolBert")
print("Length Arg. Feature=Embeddings:","Politics-",len(pol_arg_3)==len(pol_bert))

Length Arg. Feature=Embeddings: Education True
Length Arg. Feature=Embeddings: Finances- True
Length Arg. Feature=Embeddings: Law- True
Length Arg. Feature=Embeddings: Medicine- True
Length Arg. Feature=Embeddings: Military- True
Length Arg. Feature=Embeddings: Politics- True


In [13]:
def extract_features(unsorted_arg_dataset,unsorted_bert_dataset):
    sorted_keys=sorted(unsorted_arg_dataset.keys())
    sorted_bert= []
    sorted_arg = []
    labels=[]
    for key in sorted_keys:
        tensor=unsorted_bert_dataset[key]
        sorted_bert.append(tensor)
    for key in sorted_keys:
        sorted_arg.append(tf.convert_to_tensor(unsorted_arg_dataset[key][0]))
        labels.append(unsorted_arg_dataset[key][1])
    return sorted_arg,sorted_bert,labels

In [14]:
#education
edu_features=extract_features(edu_arg_3,edu_bert)[0]
edu_embedding=extract_features(edu_arg_3,edu_bert)[1]
edu_labels=extract_features(edu_arg_3,edu_bert)[2]
#edu_bert.clear()
#torch.cuda.empty_cache()
#finances
fin_features=extract_features(fin_arg_3,fin_bert)[0]
fin_embedding=extract_features(fin_arg_3,fin_bert)[1]
fin_labels=extract_features(fin_arg_3,fin_bert)[2]
#fin_bert.clear()
#torch.cuda.empty_cache()
#law
law_features=extract_features(law_arg_3,law_bert)[0]
law_embedding=extract_features(law_arg_3,law_bert)[1]
law_labels=extract_features(law_arg_3,law_bert)[2]
#law_bert.clear()
#torch.cuda.empty_cache()
#medicine
medicine_features=extract_features(med_arg_3,med_bert)[0]
medicine_embedding=extract_features(med_arg_3,med_bert)[1]
medicine_labels=extract_features(med_arg_3,med_bert)[2]
#med_bert.clear()
#torch.cuda.empty_cache()
#military
mil_features=extract_features(mil_arg_3,mil_bert)[0]
mil_embedding=extract_features(mil_arg_3,mil_bert)[1]
mil_labels=extract_features(mil_arg_3,mil_bert)[2]
#mil_bert.clear()
#torch.cuda.empty_cache()
#politics
pol_features=extract_features(pol_arg_3,pol_bert)[0]
pol_embedding=extract_features(pol_arg_3,pol_bert)[1]
pol_labels=extract_features(pol_arg_3,pol_bert)[2]
#pol_bert.clear()
#torch.cuda.empty_cache()

# Only RNN. Argumentative features RNN

In [21]:
from keras import Input, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Reshape, Concatenate, BatchNormalization, TimeDistributed, Lambda, Activation, LSTM, Flatten, Convolution1D, GRU, MaxPooling1D
from keras.layers import Bidirectional, InputLayer, SimpleRNN
from keras.constraints import maxnorm
from keras.regularizers import l2
from keras.callbacks import ModelCheckpoint
from itertools import chain, repeat, islice

In [22]:
def f1_score(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [23]:
#Argumentative features RNN  architecture
input_arg_only= Input(shape=(100,))
model_arg_only= Embedding(3, 128)(input_arg_only)
model_arg_only= SimpleRNN(128, dropout=0.2)(model_arg_only)
dense_pred_arg_only=(Dense(1, activation='sigmoid'))(model_arg_only)

model_arg_only= Model(inputs=input_arg_only, outputs=dense_pred_arg_only)
model_arg_only.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model_arg_only.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 128)          384       
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 33,409
Trainable params: 33,409
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
#Validation data extraction.Traning data: NYT politics dataset.
X_train_sent, X_test_sent, y_train_sent, y_test_sent= train_test_split(pol_arg_3_features,pol_arg_3_labels, 
    test_size=0.2, random_state= 42)
#Fit the Argumentative features RNN to the actual data.
model_arg_only.fit(np.array(X_train_sent),np.array(y_train_sent),
          batch_size=32,
          epochs=5,
          validation_data=(np.array(X_test_sent),np.array(y_test_sent)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fac00414dc0>

In [17]:
print("-----Test 3 labels model-----")
for features, labels, name in zip([np.array(edu_arg_3_features),np.array(fin_arg_3_features),np.array(law_arg_3_features),np.array(med_arg_3_features),np.array(mil_arg_3_features)],[np.array(edu_arg_3_labels),np.array(fin_arg_3_labels),np.array(law_arg_3_labels),np.array(med_arg_3_labels),np.array(mil_arg_3_labels)],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    x_test = sequence.pad_sequences(features, maxlen=100)
    score, acc,f1 = model_arg_only.evaluate(x_test, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

-----Test 3 labels model-----
nyt_edu
Test score: 0.5446839928627014
Test accuracy: 0.7261320352554321
F1 score: 0.8364138603210449
nyt_fin
Test score: 0.44299113750457764
Test accuracy: 0.8235870599746704
F1 score: 0.9012267589569092
nyt_law
Test score: 0.5422341227531433
Test accuracy: 0.7355113625526428
F1 score: 0.8446060419082642
nyt_med
Test score: 0.5280036330223083
Test accuracy: 0.7357268929481506
F1 score: 0.8434920907020569
nyt_mil
Test score: 0.455048531293869
Test accuracy: 0.7955513596534729
F1 score: 0.8845686912536621


In [18]:
#Argumentative features RNN  architecture
input_arg_6_only= Input(shape=(100,))
model_arg_6_only= Embedding(6, 128)(input_arg_6_only)
model_arg_6_only= SimpleRNN(128, dropout=0.2)(model_arg_6_only)
dense_pred_arg_6_only=(Dense(1, activation='sigmoid'))(model_arg_6_only)

model_arg_6_only= Model(inputs=input_arg_6_only, outputs=dense_pred_arg_6_only)
model_arg_6_only.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model_arg_6_only.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 100, 128)          768       
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 33,793
Trainable params: 33,793
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
#Validation data extraction.Traning data: NYT politics dataset.
X_train_sent_6, X_test_sent_6, y_train_sent_6, y_test_sent_6= train_test_split(pol_arg_6_features,pol_arg_6_labels, 
    test_size=0.2, random_state= 42)
#Fit the Argumentative features RNN to the actual data.
model_arg_6_only.fit(np.array(X_train_sent_6),np.array(y_train_sent_6),
          batch_size=32,
          epochs=5,
          validation_data=(np.array(X_test_sent_6),np.array(y_test_sent_6)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f67a4cad610>

In [20]:
print("-----Test 6 labels model-----")
for features, labels, name in zip([np.array(edu_arg_6_features),np.array(fin_arg_6_features),np.array(law_arg_6_features),np.array(med_arg_6_features),np.array(mil_arg_6_features)],[np.array(edu_arg_6_labels),np.array(fin_arg_6_labels),np.array(law_arg_6_labels),np.array(med_arg_6_labels),np.array(mil_arg_6_labels)],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    x_test = sequence.pad_sequences(features, maxlen=100)
    score, acc,f1 = model_arg_6_only.evaluate(x_test, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

-----Test 6 labels model-----
nyt_edu
Test score: 0.5812196135520935
Test accuracy: 0.7258328795433044
F1 score: 0.8359654545783997
nyt_fin
Test score: 0.4660411775112152
Test accuracy: 0.82347172498703
F1 score: 0.9011202454566956
nyt_law
Test score: 0.567742109298706
Test accuracy: 0.7355113625526428
F1 score: 0.8446060419082642
nyt_med
Test score: 0.5719174742698669
Test accuracy: 0.7357268929481506
F1 score: 0.8434920907020569
nyt_mil
Test score: 0.4870593845844269
Test accuracy: 0.7955513596534729
F1 score: 0.8845686912536621


# Only BERT

In [11]:
#BERT embeddings model architecture
from tensorflow.keras import layers
from tensorflow.keras import losses
bert_model = tf.keras.Sequential()
bert_model.add(tf.keras.Input(shape=(768)))
bert_model.add(tf.keras.layers.Dense(1))
bert_model.summary()

bert_model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.0),f1_score ])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 1)                 769       
                                                                 
Total params: 769
Trainable params: 769
Non-trainable params: 0
_________________________________________________________________


In [15]:
X_train_embedding_bert, X_test_embedding_bert, y_train_embedding_bert, y_test_embedding_bert= train_test_split(pol_embedding,pol_labels, 
    test_size=0.2, random_state= 8)
X_train_embedding_bert=tf.stack(X_train_embedding_bert, axis=0)
X_test_embedding_bert=tf.stack(X_test_embedding_bert, axis=0)

In [25]:
#Just the embedding.
import numpy as np
bert_model.fit(X_train_embedding_bert,np.array(y_train_embedding_bert),
          batch_size=16,
          epochs=5,
          validation_data=(X_test_embedding_bert,np.array(y_test_embedding_bert)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fab7829a070>

In [26]:
#education tranform into a tensor
edu_features=tf.stack(edu_features, axis=0)
edu_embedding=tf.stack(edu_embedding, axis=0)
edu_labels=np.array(edu_labels)

#finances
fin_features=tf.stack(fin_features, axis=0)
fin_embedding=tf.stack(fin_embedding, axis=0)
fin_labels=np.array(fin_labels)

#law
law_features=tf.stack(law_features, axis=0)
law_embedding=tf.stack(law_embedding, axis=0)
law_labels=np.array(law_labels)

#medicine
medicine_features=tf.stack(medicine_features, axis=0)
medicine_embedding=tf.stack(medicine_embedding, axis=0)
medicine_labels=np.array(medicine_labels)

#military
mil_features=tf.stack(mil_features, axis=0)
mil_embedding=tf.stack(mil_embedding, axis=0)
mil_labels=np.array(mil_labels)

In [27]:
print("-----Test only BERT emb. model-----")
for embeddings, labels, name in zip([edu_embedding,fin_embedding,law_embedding,medicine_embedding,mil_embedding],
                                    [np.array(edu_labels),np.array(fin_labels),np.array(law_labels),np.array(medicine_labels),np.array(mil_labels)],
                         ['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    score, acc,f1= bert_model.evaluate(embeddings, labels)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

-----Test only BERT emb. model-----
nyt_edu
Test score: 0.38919612765312195
Test accuracy: 0.8325150012969971
F1 score: 0.866670548915863
nyt_fin
Test score: 0.3642500042915344
Test accuracy: 0.8428618311882019
F1 score: 0.8861642479896545
nyt_law
Test score: 0.43543386459350586
Test accuracy: 0.8059659004211426
F1 score: 0.8558492064476013
nyt_med
Test score: 0.4120257794857025
Test accuracy: 0.8175397515296936
F1 score: 0.8588746190071106
nyt_mil
Test score: 0.34885403513908386
Test accuracy: 0.8632276654243469
F1 score: 0.8923753499984741


# RNN+BERT merged model

In [26]:
from keras import Input, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Reshape, Concatenate, BatchNormalization, TimeDistributed, Lambda, Activation, LSTM, Flatten, Convolution1D, GRU, MaxPooling1D
from keras.layers import Bidirectional, InputLayer, SimpleRNN
from keras.constraints import maxnorm
from keras.regularizers import l2
from keras.callbacks import ModelCheckpoint

input_emb = Input(shape=(768,))
dense_1 = Dense(128, activation='relu', activity_regularizer=l2(0.0001))(input_emb)
dropout_1 = Dropout(0.5)(dense_1)
# dense_2 = Dense(128, activation='sigmoid', activity_regularizer=l2(0.0001))(input_emb)
# dropout_2 = Dropout(0.5)(dense_1)

input_arg = Input(shape=(100,))
model_arg = Embedding(3, 128)(input_arg)
model_arg = SimpleRNN(128, dropout=0.2)(model_arg)

merged = Concatenate()([dropout_1, model_arg])
dense_pred = (Dense(1, activation='sigmoid'))(merged)

model_merged_3= Model(inputs=[input_emb, input_arg], outputs=dense_pred)
model_merged_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model_merged_3.summary())

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 768)]        0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 dense_3 (Dense)                (None, 128)          98432       ['input_4[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, 100, 128)     384         ['input_5[0][0]']                
                                                                                            

In [27]:
X_train_embedding, X_test_embedding, y_train_embedding, y_test_embedding= train_test_split(pol_embedding,pol_labels, 
    test_size=0.2, random_state= 8)
X_train_features, X_test_features, y_train_features, y_test_features= train_test_split(pol_features,pol_labels, 
    test_size=0.2, random_state= 8)
#We need to stack the embeddings
X_train_embedding=tf.stack(X_train_embedding, axis=0)
X_train_features=tf.stack(X_train_features, axis=0)
X_test_embedding=tf.stack(X_test_embedding, axis=0)
X_test_features=tf.stack(X_test_features, axis=0)

In [28]:
#Merged 3 labels
import numpy as np
model_merged_3.fit([X_train_embedding,X_train_features],np.array(y_train_embedding),
          batch_size=32,
          epochs=5,
          validation_data=([X_test_embedding,X_test_features ],np.array(y_test_features)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6b10574b20>

In [30]:
print("-----Test 3 labels merged model-----")
for features, labels, name in zip([[edu_embedding,edu_features],[fin_embedding,fin_features],[law_embedding,law_features],[medicine_embedding,medicine_features],[mil_embedding,mil_features]],[edu_labels,fin_labels,law_labels,medicine_labels,mil_labels],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    score, acc , f1= model_merged_3.evaluate(features, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

-----Test 3 labels merged model-----
nyt_edu
Test score: 0.3995703160762787
Test accuracy: 0.8205128312110901
F1 score: 0.8725488781929016
nyt_fin
Test score: 0.3733283281326294
Test accuracy: 0.8454753160476685
F1 score: 0.9060474038124084
nyt_law
Test score: 0.46155232191085815
Test accuracy: 0.7943181991577148
F1 score: 0.859734296798706
nyt_med
Test score: 0.4149516224861145
Test accuracy: 0.8228369355201721
F1 score: 0.8792459964752197
nyt_mil
Test score: 0.38675129413604736
Test accuracy: 0.8556554913520813
F1 score: 0.8970090746879578


# MERged 6 model

In [31]:
def extract_features_6(unsorted_arg_dataset,unsorted_bert_dataset):
    sorted_keys=sorted(unsorted_arg_dataset.keys())
    sorted_bert= []
    sorted_arg = []
    labels=[]
    for key in sorted_keys:
        if key in unsorted_bert_dataset.keys():
            tensor=unsorted_bert_dataset[key]
            sorted_bert.append(tensor)
            sorted_arg.append(tf.convert_to_tensor(unsorted_arg_dataset[key][0]))
            labels.append(unsorted_arg_dataset[key][1])
    return sorted_arg,sorted_bert,labels

In [32]:
#education
edu_features_6=extract_features_6(edu_arg_6,edu_bert)[0]
edu_embedding_6=extract_features_6(edu_arg_6,edu_bert)[1]
edu_labels_6=extract_features_6(edu_arg_6,edu_bert)[2]
#-------------------------
fin_features_6=extract_features_6(fin_arg_6,fin_bert)[0]
fin_embedding_6=extract_features_6(fin_arg_6,fin_bert)[1]
fin_labels_6=extract_features_6(fin_arg_6,fin_bert)[2]
#-------------------------
law_features_6=extract_features_6(law_arg_6,law_bert)[0]
law_embedding_6=extract_features_6(law_arg_6,law_bert)[1]
law_labels_6=extract_features_6(law_arg_6,law_bert)[2]
#-------------------------
medicine_features_6=extract_features_6(med_arg_6,med_bert)[0]
medicine_embedding_6=extract_features_6(med_arg_6,med_bert)[1]
medicine_labels_6=extract_features_6(med_arg_6,med_bert)[2]
#-------------------------
mil_features_6=extract_features_6(mil_arg_6,mil_bert)[0]
mil_embedding_6=extract_features_6(mil_arg_6,mil_bert)[1]
mil_labels_6=extract_features_6(mil_arg_6,mil_bert)[2]
#-------------------------
pol_features_6=extract_features_6(pol_arg_6,pol_bert)[0]
pol_embedding_6=extract_features_6(pol_arg_6,pol_bert)[1]
pol_labels_6=extract_features_6(pol_arg_6,pol_bert)[2]

In [33]:
#6 label merged model
input_emb = Input(shape=(768,))
dense_1 = Dense(128, activation='relu', activity_regularizer=l2(0.0001))(input_emb)
dropout_1 = Dropout(0.5)(dense_1)
# dense_2 = Dense(128, activation='sigmoid', activity_regularizer=l2(0.0001))(input_emb)
# dropout_2 = Dropout(0.5)(dense_1)

input_arg = Input(shape=(100,))
model_arg = Embedding(6, 128)(input_arg)
model_arg = SimpleRNN(128, dropout=0.2)(model_arg)

merged = Concatenate()([dropout_1, model_arg])
dense_pred = (Dense(1, activation='sigmoid'))(merged)

model_6_merged= Model(inputs=[input_emb, input_arg], outputs=dense_pred)
model_6_merged.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model_6_merged.summary())

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 768)]        0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 dense_5 (Dense)                (None, 128)          98432       ['input_6[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 100, 128)     768         ['input_7[0][0]']                
                                                                                            

In [36]:
#6 label merged model
X_train_embedding_6, X_test_embedding_6, y_train_embedding_6, y_test_embedding_6= train_test_split(pol_embedding_6,pol_labels_6, 
    test_size=0.2, random_state= 8)
X_train_features_6, X_test_features_6, y_train_features_6, y_test_features_6= train_test_split(pol_features_6,pol_labels_6, 
    test_size=0.2, random_state= 8)
#We need to stack the embeddings
X_train_embedding_6=tf.stack(X_train_embedding_6, axis=0)
X_train_features_6=tf.stack(X_train_features_6, axis=0)
X_test_embedding_6=tf.stack(X_test_embedding_6, axis=0)
X_test_features_6=tf.stack(X_test_features_6, axis=0)

In [37]:
#education tranform into a tensor
edu_features_6=tf.stack(edu_features_6, axis=0)
edu_embedding_6=tf.stack(edu_embedding_6, axis=0)
edu_labels_6=np.array(edu_labels_6)
#finances
fin_features_6=tf.stack(fin_features_6, axis=0)
fin_embedding_6=tf.stack(fin_embedding_6, axis=0)
fin_labels_6=np.array(fin_labels_6)
#law
law_features_6=tf.stack(law_features_6, axis=0)
law_embedding_6=tf.stack(law_embedding_6, axis=0)
law_labels_6=np.array(law_labels_6)
#medicine
medicine_features_6=tf.stack(medicine_features_6, axis=0)
medicine_embedding_6=tf.stack(medicine_embedding_6, axis=0)
medicine_labels_6=np.array(medicine_labels_6)
#military
mil_features_6=tf.stack(mil_features_6, axis=0)
mil_embedding_6=tf.stack(mil_embedding_6, axis=0)
mil_labels_6=np.array(mil_labels_6)

In [38]:
#Merged
import numpy as np
model_6_merged.fit([X_train_embedding_6,X_train_features_6],np.array(y_train_embedding_6),
          batch_size=32,
          epochs=5,
          validation_data=([X_test_embedding_6,X_test_features_6],np.array(y_test_features_6)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f67a46987f0>

In [None]:
print("-----Test merged  6 label model-----")
for features, labels, name in zip([[edu_embedding_6,edu_features_6],[fin_embedding_6,fin_features_6],[law_embedding_6,law_features_6],[medicine_embedding_6,medicine_features_6],[mil_embedding_6,mil_features_6]],[edu_labels_6,fin_labels_6,law_labels_6,medicine_labels_6,mil_labels_6],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    score, acc , f1= model_6_merged.evaluate(features, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

-----Test merged  6 label model-----
nyt_edu
Test score: 0.38896244764328003
Test accuracy: 0.8235936760902405
F1 score: 0.8656309843063354
nyt_fin
Test score: 0.36608582735061646
Test accuracy: 0.8450474143028259
F1 score: 0.9021396040916443
nyt_law
Test score: 0.43332797288894653
Test accuracy: 0.8039772510528564
F1 score: 0.8606530427932739
nyt_med
Test score: 0.40380895137786865
Test accuracy: 0.8175397515296936
F1 score: 0.8715118169784546
nyt_mil
13/67 [====>.........................] - ETA: 0s - loss: 0.3441 - accuracy: 0.8654 - f1_score: 0.9130