# Import argumentative features dataset

In [1]:
import tensorflow as tf
import torch
tf.keras.backend.clear_session()
torch.cuda.empty_cache()
from numba import cuda
cuda.select_device(0)
cuda.close()
import subprocess as sp
import os

def get_gpu_memory():
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
    return memory_free_values

get_gpu_memory()

[11176]

In [2]:
#Import all needed libraries and packages
import pandas as pd
from keras import backend as K
import numpy as np
from nltk.tokenize import RegexpTokenizer
from tensorflow import keras
import torch
from sklearn.model_selection import train_test_split
import nltk
from nltk import word_tokenize
from nltk import StanfordTagger
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pickle
import os
from ast import literal_eval
from itertools import chain, repeat, islice

# Import original datasets for VADER

In [3]:
#Importing already filtered out datasets from New York Times
nyt_edu_original= list(open("/data/output_txt/nyt-edu.txt"))
print("NYT-EDU original features:",len(nyt_edu_original ))
nyt_fin_original= list(open("/data/output_txt/nyt-fin.txt"))
print("NYT-FIN original features:",len(nyt_fin_original))
nyt_law_original= list(open("/data/output_txt/nyt-law.txt"))
print("NYT-LAW original features:",len(nyt_law_original))
nyt_med_original = list(open("/data/output_txt/nyt-med.txt"))
print("NYT-Med original features:",len(nyt_med_original))
nyt_mil_original= list(open("/data/output_txt/nyt-mil.txt"))
print("NYT-MILL original features:",len(nyt_mil_original))
nyt_pol_original = list(open("/data/output_txt/nyt-pol.txt"))
print("NYT-POL original features:",len(nyt_pol_original))

NYT-EDU original features: 1881
NYT-FIN original features: 3100
NYT-LAW original features: 3553
NYT-Med original features: 1743
NYT-MILL original features: 2132
NYT-POL original features: 6886


# Import BERT embeddings and Arg. Features

In [4]:
#Helper functions
def extract_arg_features(dataset):
    arg_features=[]
    arg_labels=[]
    texts_order=[]
    arg_features_dict={}
    for s in dataset:
        text=s.split("\t")[1].split("\n")[0]
        text=text.replace('"',"")
        tokenized=nltk.sent_tokenize(text)
        if len(tokenized)<=100:
            texts_order.append(text)
            sequence=literal_eval(s.split("\t")[2].split("\n")[0])
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            arg_features.append(sequence)
            arg_labels.append(label)
            arg_features_dict[text]=(sequence,label)
    return arg_features_dict,arg_features,arg_labels,texts_order


def get_bert_embeddings(name):
    directory = '/data/BertEmbeddings/'+str(name)
    # Create empty dictionary to save data
    bert_dict= {}
    # Loop over files and read pickles
    for file in os.listdir(directory):
        if file.endswith('.pickle'):
            with open(str("/data/BertEmbeddings/"+str(name)+"/"+str(file)), 'rb') as f:
                bert_f=pickle.load(f)
                article= bert_f['article']
                article=article.replace('"',"")
                tokenized=nltk.sent_tokenize(article)
                if len(tokenized)<=100:
                    bert_emb= bert_f['bert_emb'][0]
                    bert_dict[article] =tf.convert_to_tensor(bert_emb.cpu())
    return bert_dict

def extract_features(unsorted_arg_dataset,unsorted_bert_dataset,texts_order_list):
    sorted_keys=texts_order_list
    sorted_bert= []
    sorted_arg = []
    labels=[]
    h=[]
    for key in sorted_keys:
        h.append(key)
        tensor=unsorted_bert_dataset[key]
        sorted_bert.append(tensor)
        if key in unsorted_arg_dataset:
            sorted_arg.append(tf.convert_to_tensor(unsorted_arg_dataset[key][0]))
            labels.append(unsorted_arg_dataset[key][1])
    return sorted_arg,sorted_bert,labels,h

In [5]:
#Argumentative features.3 labels
nyt_edu_arg = list(open("/data/output_txt/nyt-edu-argfeat.txt"))
edu_arg_3,edu_arg_3_features,edu_arg_3_labels, edu_order=extract_arg_features(nyt_edu_arg)
print("NYT-EDU argumentative features:",len(edu_arg_3))
nyt_fin_arg = list(open("/data/output_txt/nyt-fin-argfeat.txt"))
fin_arg_3,fin_arg_3_features,fin_arg_3_labels, fin_order=extract_arg_features(nyt_fin_arg)
print("NYT-FIN argumentative features:",len(fin_arg_3))
nyt_law_arg = list(open("/data/output_txt/nyt-law-argfeat.txt"))
law_arg_3,law_arg_3_features,law_arg_3_labels, law_order=extract_arg_features(nyt_law_arg)
print("NYfT-LAW argumentative features:",len(law_arg_3))
nyt_med_arg = list(open("/data/output_txt/nyt-med-argfeat.txt"))
med_arg_3,med_arg_3_features,med_arg_3_labels,med_order=extract_arg_features(nyt_med_arg)
print("NYT-Med argumentative features:",len(med_arg_3))
nyt_mil_arg = list(open("/data/output_txt/nyt-mil-argfeat.txt"))
mil_arg_3,mil_arg_3_features,mil_arg_3_labels, mil_order=extract_arg_features(nyt_mil_arg)
print("NYT-MILL argumentative features:",len(mil_arg_3))
nyt_pol_arg = list(open("/data/output_txt/nyt-pol-argfeat.txt"))
pol_arg_3,pol_arg_3_features,pol_arg_3_labels, pol_order=extract_arg_features(nyt_pol_arg)
print("NYT-POL argumentative features:",len(pol_arg_3))

NYT-EDU argumentative features: 1833
NYT-FIN argumentative features: 3061
NYfT-LAW argumentative features: 3520
NYT-Med argumentative features: 1699
NYT-MILL argumentative features: 2113
NYT-POL argumentative features: 6826


In [6]:
#Argumentative features.6 labels
print("NYT publishers and dataset size of argumentative features with 3 labels:'assumption','anecdote','testimony','statistics','other','common-ground'")
nyt_edu_6_arg = list(open("nyt-edu-argfeat-6.txt"))
edu_arg_6,edu_arg_6_features,edu_arg_6_labels, edu_order_6=extract_arg_features(nyt_edu_6_arg)
print("NYT-EDU arg features:",len(edu_arg_6))
nyt_fin_6_arg = list(open("/data/output_txt/nyt-fin-argfeat-6.txt"))
fin_arg_6,fin_arg_6_features,fin_arg_6_labels, fin_order_6=extract_arg_features(nyt_fin_6_arg)
print("NYT-FIN arg features:",len(fin_arg_6))
nyt_law_6_arg = list(open("/data/output_txt/nyt-law-argfeat-6.txt"))
law_arg_6,law_arg_6_features,law_arg_6_labels, law_order_6=extract_arg_features(nyt_law_6_arg)
print("NYT-LAW arg features:",len(law_arg_6))
nyt_med_6_arg = list(open("/data/output_txt/nyt-med-argfeat-6.txt"))
med_arg_6,med_arg_6_features,med_arg_6_labels,med_order_6=extract_arg_features(nyt_med_6_arg)
print("NYT-Med arg features:",len(med_arg_6))
nyt_mil_6_arg = list(open("/data/output_txt/nyt-mil-argfeat-6.txt"))
mil_arg_6,mil_arg_6_features,mil_arg_6_labels, mil_order_6=extract_arg_features(nyt_mil_6_arg)
print("NYT-MILL arg features:",len(mil_arg_6))
nyt_pol_6_arg = list(open("/data/output_txt/nyt-pol-argfeat-6.txt"))
pol_arg_6,pol_arg_6_features,pol_arg_6_labels, pol_order_6=extract_arg_features(nyt_pol_6_arg)
print("NYT-POL arg features:",len(pol_arg_6))

NYT publishers and dataset size of argumentative features with 3 labels:'assumption','anecdote','testimony','statistics','other','common-ground'
NYT-EDU arg features: 1831
NYT-FIN arg features: 3059
NYT-LAW arg features: 3520
NYT-Med arg features: 1699
NYT-MILL arg features: 2113
NYT-POL arg features: 6826


In [7]:
#education
edu_bert=get_bert_embeddings("NytEduBert")
print("Length Arg. Feature=Embeddings:","Education",len(edu_arg_3)==len(edu_bert))
#finances
fin_bert=get_bert_embeddings("NytFinBert")
print("Length Arg. Feature=Embeddings:","Finances-",len(fin_arg_3)==len(fin_bert))
#law 
law_bert=get_bert_embeddings("NytLawBert")
print("Length Arg. Feature=Embeddings:","Law-",len(law_arg_3)==len(law_bert))
#medicine 
med_bert=get_bert_embeddings("NytMedBert")
print("Length Arg. Feature=Embeddings:","Medicine-",len(med_arg_3)==len(med_bert))
#military
mil_bert=get_bert_embeddings("NytMillBert")
print("Length Arg. Feature=Embeddings:","Military-",len(mil_arg_3)==len(mil_bert))
#politics
pol_bert=get_bert_embeddings("NytPolBert")
print("Length Arg. Feature=Embeddings:","Politics-",len(pol_arg_3)==len(pol_bert))

2023-03-28 08:31:05.358949: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-28 08:31:05.884610: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9745 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:83:00.0, compute capability: 6.1


Length Arg. Feature=Embeddings: Education True
Length Arg. Feature=Embeddings: Finances- True
Length Arg. Feature=Embeddings: Law- True
Length Arg. Feature=Embeddings: Medicine- True
Length Arg. Feature=Embeddings: Military- True
Length Arg. Feature=Embeddings: Politics- True


In [8]:
"NYT publishers and dataset size of argumentative features with 3 labels:Claim, Premises,None"
#education
edu_features=extract_features(edu_arg_3,edu_bert,edu_order)[0]
edu_embedding=extract_features(edu_arg_3,edu_bert,edu_order)[1]
edu_labels=extract_features(edu_arg_3,edu_bert,edu_order)[2]
edu_list=extract_features(edu_arg_3,edu_bert,edu_order)[3]
#tranform into a tensor
edu_features=tf.stack(edu_features, axis=0)
edu_embedding=tf.stack(edu_embedding, axis=0)
edu_labels=np.array(edu_labels)

#finances
fin_features=extract_features(fin_arg_3,fin_bert,fin_order)[0]
fin_embedding=extract_features(fin_arg_3,fin_bert,fin_order)[1]
fin_labels=extract_features(fin_arg_3,fin_bert,fin_order)[2]
fin_list=extract_features(fin_arg_3,fin_bert,fin_order)[3]
#tranform into a tensor
fin_features=tf.stack(fin_features, axis=0)
fin_embedding=tf.stack(fin_embedding, axis=0)
fin_labels=np.array(fin_labels)

#law
law_features=extract_features(law_arg_3,law_bert,law_order)[0]
law_embedding=extract_features(law_arg_3,law_bert,law_order)[1]
law_labels=extract_features(law_arg_3,law_bert,law_order)[2]
#tranform into a tensor
law_features=tf.stack(law_features, axis=0)
law_embedding=tf.stack(law_embedding, axis=0)
law_labels=np.array(law_labels)

#medicine
medicine_features=extract_features(med_arg_3,med_bert,med_order)[0]
medicine_embedding=extract_features(med_arg_3,med_bert,med_order)[1]
medicine_labels=extract_features(med_arg_3,med_bert,med_order)[2]
#tranform into a tensor
medicine_features=tf.stack(medicine_features, axis=0)
medicine_embedding=tf.stack(medicine_embedding, axis=0)
medicine_labels=np.array(medicine_labels)

#military
mil_features=extract_features(mil_arg_3,mil_bert,mil_order)[0]
mil_embedding=extract_features(mil_arg_3,mil_bert,mil_order)[1]
mil_labels=extract_features(mil_arg_3,mil_bert,mil_order)[2]
#tranform into a tensor
mil_features=tf.stack(mil_features, axis=0)
mil_embedding=tf.stack(mil_embedding, axis=0)
mil_labels=np.array(mil_labels)
#politics
pol_features=extract_features(pol_arg_3,pol_bert,pol_order)[0]
pol_embedding=extract_features(pol_arg_3,pol_bert,pol_order)[1]
pol_labels=extract_features(pol_arg_3,pol_bert,pol_order)[2]

In [9]:
#education
edu_features_6=extract_features(edu_arg_6,edu_bert,edu_order_6)[0]
edu_embedding_6=extract_features(edu_arg_6,edu_bert,edu_order_6)[1]
edu_labels_6=extract_features(edu_arg_6,edu_bert,edu_order_6)[2]
#tranform into a tensor
edu_features_6=tf.stack(edu_features_6, axis=0)
edu_embedding_6=tf.stack(edu_embedding_6, axis=0)
edu_labels_6=np.array(edu_labels_6)

#finances
fin_features_6=extract_features(fin_arg_6,fin_bert,fin_order_6)[0]
fin_embedding_6=extract_features(fin_arg_6,fin_bert,fin_order_6)[1]
fin_labels_6=extract_features(fin_arg_6,fin_bert,fin_order_6)[2]
#tranform into a tensor
fin_features_6=tf.stack(fin_features_6, axis=0)
fin_embedding_6=tf.stack(fin_embedding_6, axis=0)
fin_labels_6=np.array(fin_labels_6)

#law
law_features_6=extract_features(law_arg_6,law_bert,law_order_6)[0]
law_embedding_6=extract_features(law_arg_6,law_bert,law_order_6)[1]
law_labels_6=extract_features(law_arg_6,law_bert,law_order_6)[2]
#tranform into a tensor
law_features_6=tf.stack(law_features_6, axis=0)
law_embedding_6=tf.stack(law_embedding_6, axis=0)
law_labels_6=np.array(law_labels_6)

#medicine
medicine_features_6=extract_features(med_arg_6,med_bert,med_order_6)[0]
medicine_embedding_6=extract_features(med_arg_6,med_bert,med_order_6)[1]
medicine_labels_6=extract_features(med_arg_6,med_bert,med_order_6)[2]
#tranform into a tensor
medicine_features_6=tf.stack(medicine_features_6, axis=0)
medicine_embedding_6=tf.stack(medicine_embedding_6, axis=0)
medicine_labels_6=np.array(medicine_labels_6)

#military
mil_features_6=extract_features(mil_arg_6,mil_bert,mil_order_6)[0]
mil_embedding_6=extract_features(mil_arg_6,mil_bert,mil_order_6)[1]
mil_labels_6=extract_features(mil_arg_6,mil_bert,mil_order_6)[2]
#tranform into a tensor
mil_features_6=tf.stack(mil_features_6, axis=0)
mil_embedding_6=tf.stack(mil_embedding_6, axis=0)
mil_labels_6=np.array(mil_labels_6)
#politics
pol_features_6=extract_features(pol_arg_6,pol_bert,pol_order_6)[0]
pol_embedding_6=extract_features(pol_arg_6,pol_bert,pol_order_6)[1]
pol_labels_6=extract_features(pol_arg_6,pol_bert,pol_order_6)[2]

# VADER for sentiment analysis.

In [10]:
#All helper  functions needeed for implementing the Vader model
sid = SentimentIntensityAnalyzer()
#Vader gives a dictionary as output we need to transfer it to a integer
def format_output(output_dict):
    if(output_dict['compound']>= 0.05):
        polarity =1 #positive=1
    elif(output_dict['compound']<= -0.05):
        polarity =2 #negative=2
    else:
        polarity =3 #neural=3
    return polarity

def predict_sentiment(text):
    output_dict =  sid.polarity_scores(text)
    return format_output(output_dict)

#As the number of sentences are different we need to pad to 100 sentences at maximun.
def pad_infinite(iterable, padding=None):
    return chain(iterable, repeat(padding))

def pad(iterable, size, padding=None):
    return list(islice(pad_infinite(iterable, padding), size))

#The main function of sentiment extraction.
def sentiment_dataset_extraction(example_dataset):
    main_label_list=[]
    sentiment_labels_list=[]
    for s in example_dataset:
        text=s.split("\t")[1].split("\n")[0]
        split_text = nltk.sent_tokenize(text) # this gives us a list of sentences
        if len(split_text)<=100:
            text_sentiment=[]
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            for sent in split_text:
                sentence_sentiment=predict_sentiment(sent)
                text_sentiment.append(sentence_sentiment)
            padded_sentence=pad(text_sentiment,100,0)
            sentiment_labels_list.append(padded_sentence)
            main_label_list.append(label)
    return sentiment_labels_list,main_label_list

In [11]:
#Extracting the sentiment fromt the dataset
#First variable is list with sentiment tensors and the second variable is a list with labels.
edu_sentiments,edu_labels_sentiments=sentiment_dataset_extraction(nyt_edu_original)
edu_sentiments=np.array(edu_sentiments)
fin_sentiments,fin_labels_sentiments=sentiment_dataset_extraction(nyt_fin_original)
fin_sentiments=np.array(fin_sentiments)
law_sentiments,law_labels_sentiments=sentiment_dataset_extraction(nyt_law_original)
law_sentiments=np.array(law_sentiments)
med_sentiments,med_labels_sentiments=sentiment_dataset_extraction(nyt_med_original)
med_sentiments=np.array(med_sentiments)
mil_sentiments,mil_labels_sentiments=sentiment_dataset_extraction(nyt_mil_original)
mil_sentiments=np.array(mil_sentiments)
pol_sentiments,pol_labels_sentiments=sentiment_dataset_extraction(nyt_pol_original)
pol_sentiments=np.array(pol_sentiments)

In [12]:
#For 6 label 
banned_edu=set(edu_order).difference(set(edu_order_6))
banned_fin=set(fin_order).difference(set(fin_order_6))
all_banned=banned_edu.union(banned_fin)
def sentiment_dataset_extraction_6(example_dataset):
    main_label_list=[]
    sentiment_labels_list=[]
    for s in example_dataset:
        text=s.split("\t")[1].split("\n")[0]
        split_text = nltk.sent_tokenize(text) # this gives us a list of sentences
        if len(split_text)<=100:
            text=text.replace('"',"")
            if text not in all_banned:
                text_sentiment=[]
                label=s.split("\t")[0]
                if label=='editorial':
                    label=0
                elif label=='news':
                    label=1
                for sent in split_text:
                    sentence_sentiment=predict_sentiment(sent)
                    text_sentiment.append(sentence_sentiment)
                padded_sentence=pad(text_sentiment,100,0)
                sentiment_labels_list.append(padded_sentence)
                main_label_list.append(label)
    return sentiment_labels_list,main_label_list

In [13]:
edu_sentiments_6,edu_labels_sentiments_6=sentiment_dataset_extraction_6(nyt_edu_original)
edu_sentiments_6=np.array(edu_sentiments_6)
fin_sentiments_6,fin_labels_sentiments_6=sentiment_dataset_extraction_6(nyt_fin_original)
fin_sentiments_6=np.array(fin_sentiments_6)
law_sentiments_6,law_labels_sentiments_6=sentiment_dataset_extraction_6(nyt_law_original)
law_sentiments_6=np.array(law_sentiments_6)
med_sentiments_6,med_labels_sentiments_6=sentiment_dataset_extraction_6(nyt_med_original)
med_sentiments_6=np.array(med_sentiments_6)
mil_sentiments_6,mil_labels_sentiments_6=sentiment_dataset_extraction_6(nyt_mil_original)
mil_sentiments_6=np.array(mil_sentiments_6)
pol_sentiments_6,pol_labels_sentiments_6=sentiment_dataset_extraction_6(nyt_pol_original)
pol_sentiments_6=np.array(pol_sentiments_6)

# Stanford POS model

In [14]:
#The English Penn Treebank tagset is used for Stanford POS model,we need to tranfser them to numbers.
tokenizer = RegexpTokenizer(r'\w+')
pos_tags={ "CC":1,"CD":2,"DT":3,"EX":4,"FW":5,
"IN":6,"JJ":7,"JJR":8,"JJS":9,"LS":10,"MD":11,
"NN":12,"NNS":13,"NNP":14,"NNPS":15,"PDT":16,"POS":17,
"PRP":18,"PRP$":19,"RB":20,"RBR":21,"RBS":22,"RP":23,
"SYM":24,"TO":25,"UH":26,"VB":27,"VBD":28,"VBG":29,
"VBN":30,"VBP":31,"VBZ":32,"WDT":33,"WP":34,"WP$":35,"WRB":36}

#How many documents there are that contain more than 100 sentences in one document?
def check_sentence_length(example_sets):
    check_lemmas={}
    for example_set in example_sets:
        for s in example_set:
            pos_list_text=[]
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            text=s.split("\t")[1].split("\n")[0]
            split_text = nltk.sent_tokenize(text) # this gives us a list of sentences
            if len(split_text) not in check_lemmas:
                check_lemmas[len(split_text)]=0
            check_lemmas[len(split_text)]+=1
    return check_lemmas

#Imply the model
sentence_length_dict=check_sentence_length((nyt_edu_original,nyt_fin_original,nyt_law_original,nyt_med_original,nyt_mil_original,nyt_pol_original))
count_more_100=0
general_number_documents=sum(sentence_length_dict.values())
for k,v in sentence_length_dict.items():
    if k>100:
        count_more_100+=v
percentage_less_100=(count_more_100/general_number_documents)*100
print("What is the percentage of documents in all documents that have more than 100 sentences?",percentage_less_100)
#This means that more than 98% of the whole dataset contains of sentences with length less than 100.

What is the percentage of documents in all documents that have more than 100 sentences? 1.2593936252915263


In [17]:
listofzeros = [0] *100
tokenizer = RegexpTokenizer(r'\w+')

def pos_tagger(pos_example_dataset):
    pos_data=[]
    pos_label=[]
    for s in pos_example_dataset:
        pos_list_text=[]
        text=s.split("\t")[1].split("\n")[0]
        split_text = nltk.sent_tokenize(text)
        if len(split_text)<=100:
            for sent in split_text:
                pos_list_sentence=[]
                pos_analyzed=nltk.pos_tag(tokenizer.tokenize(sent))
                for word in pos_analyzed:
                    try:
                        pos_list_sentence.append(pos_tags[word[1]])
                    except:
                        pass
                pos_list_sentence=pad(pos_list_sentence,100,0)
                pos_list_text.append(pos_list_sentence)
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            if len(pos_list_text)<=100:
                pos_list_text=pad(pos_list_text,100,listofzeros)
            pos_tensor=tf.convert_to_tensor(pos_list_text)
            pos_data.append(pos_tensor)
            pos_label.append(label)
    return pos_data,pos_label
def pos_tagger_6(pos_example_dataset):
    pos_data=[]
    pos_label=[]
    for s in pos_example_dataset:
        pos_list_text=[]
        text=s.split("\t")[1].split("\n")[0]
        split_text = nltk.sent_tokenize(text)
        if len(split_text)<=100:
            for sent in split_text:
                pos_list_sentence=[]
                pos_analyzed=nltk.pos_tag(tokenizer.tokenize(sent))
                for word in pos_analyzed:
                    try:
                        pos_list_sentence.append(pos_tags[word[1]])
                    except:
                        pass
                pos_list_sentence=pad(pos_list_sentence,100,0)
                pos_list_text.append(pos_list_sentence)
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            if len(pos_list_text)<=100:
                pos_list_text=pad(pos_list_text,100,listofzeros)
            pos_tensor=tf.convert_to_tensor(pos_list_text)
            pos_data.append(pos_tensor)
            pos_label.append(label)
    return pos_data,pos_label

In [18]:
edu_pos_data,edu_pos_labels=pos_tagger(nyt_edu_original)
edu_pos_data=np.array(edu_pos_data)
fin_pos_data,fin_pos_labels=pos_tagger(nyt_fin_original)
fin_pos_data=np.array(fin_pos_data)
law_pos_data,law_pos_labels=pos_tagger(nyt_law_original)
law_pos_data=np.array(law_pos_data)
med_pos_data,med_pos_labels=pos_tagger(nyt_med_original)
med_pos_data=np.array(med_pos_data)
mil_pos_data,mil_pos_labels=pos_tagger(nyt_mil_original)
mil_pos_data=np.array(mil_pos_data)
pol_pos_data,pol_pos_labels=pos_tagger(nyt_pol_original)
pol_pos_data=np.array(pol_pos_data)

In [19]:
pol_pos_labels==pol_labels_sentiments==pol_labels

True

In [20]:
#For 6 label 
def pos_tagger_6(pos_example_dataset):
    pos_data=[]
    pos_label=[]
    for s in pos_example_dataset:
        pos_list_text=[]
        text=s.split("\t")[1].split("\n")[0]
        split_text = nltk.sent_tokenize(text)
        if len(split_text)<=100:
            text=text.replace('"',"")
            if text not in all_banned:
                for sent in split_text:
                    pos_list_sentence=[]
                    pos_analyzed=nltk.pos_tag(tokenizer.tokenize(sent))
                    for word in pos_analyzed:
                        try:
                            pos_list_sentence.append(pos_tags[word[1]])
                        except:
                            pass
                    pos_list_sentence=pad(pos_list_sentence,100,0)
                    pos_list_text.append(pos_list_sentence)
                label=s.split("\t")[0]
                if label=='editorial':
                    label=0
                elif label=='news':
                    label=1
                if len(pos_list_text)<=100:
                    pos_list_text=pad(pos_list_text,100,listofzeros)
                pos_tensor=tf.convert_to_tensor(pos_list_text)
                pos_data.append(pos_tensor)
                pos_label.append(label)
    return pos_data,pos_label

In [21]:
edu_pos_data_6,edu_pos_labels_6=pos_tagger_6(nyt_edu_original)
edu_pos_data_6=np.array(edu_pos_data_6)
fin_pos_data_6,fin_pos_labels_6=pos_tagger_6(nyt_fin_original)
fin_pos_data_6=np.array(fin_pos_data_6)
law_pos_data_6,law_pos_labels_6=pos_tagger_6(nyt_law_original)
law_pos_data_6=np.array(law_pos_data_6)
med_pos_data_6,med_pos_labels_6=pos_tagger_6(nyt_med_original)
med_pos_data_6=np.array(med_pos_data_6)
mil_pos_data_6,mil_pos_labels_6=pos_tagger_6(nyt_mil_original)
mil_pos_data_6=np.array(mil_pos_data_6)
pol_pos_data_6,pol_pos_labels_6=pos_tagger_6(nyt_pol_original)
pol_pos_data_6=np.array(pol_pos_data_6)

In [22]:
fin_pos_labels_6==fin_labels_sentiments_6==fin_labels_6

array([ True,  True,  True, ...,  True,  True,  True])

# Merging  4 models: VADER,POS,BERT,Arg. Features.3 labels

In [23]:
from keras import Input, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Reshape, Concatenate, BatchNormalization, TimeDistributed, Lambda, Activation, LSTM, Flatten, Convolution1D, GRU, MaxPooling1D
from keras.layers import Bidirectional, InputLayer, SimpleRNN
from keras.constraints import maxnorm
from keras.regularizers import l2
from keras.callbacks import ModelCheckpoint
from itertools import chain, repeat, islice

In [24]:
def f1_score(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [25]:
#The architecture of  BERT+Arg.Feature+VADER+POS merged
input_emb = Input(shape=(768,))
dense_1 = Dense(128, activation='relu', activity_regularizer=l2(0.0001))(input_emb)
dropout_1 = Dropout(0.5)(dense_1)

input_arg = Input(shape=(100,))
model_arg = Embedding(3, 128)(input_arg)
model_arg = SimpleRNN(128, dropout=0.2)(model_arg)

input_sentiment = Input(shape=(100,))
model_sentiment = Embedding(3, 128)(input_sentiment)
model_sentiment = SimpleRNN(128, dropout=0.2)(model_sentiment)

input_pos= Input(pol_pos_data[0].shape)
model_pos= SimpleRNN(128, dropout=0.1)(input_pos)


merged = Concatenate()([dropout_1, model_arg,model_sentiment,model_pos])
dense_pred = (Dense(1, activation='sigmoid'))(merged)

model_all_merged= Model(inputs=[input_emb, input_arg,input_sentiment,input_pos], outputs=dense_pred)
model_all_merged.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model_all_merged.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 768)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 dense (Dense)                  (None, 128)          98432       ['input_1[0][0]']                
                                                                                              

In [26]:
#embedding
X_train_embedding, X_test_embedding, y_train_embedding, y_test_embedding= train_test_split(pol_embedding,pol_labels, 
    test_size=0.2, random_state= 42)
X_train_embedding=tf.stack(X_train_embedding, axis=0)                          
X_test_embedding=tf.stack(X_test_embedding, axis=0)
#features
X_train_features, X_test_features, y_train_features, y_test_features= train_test_split(pol_features,pol_labels, 
    test_size=0.2, random_state= 42)
X_train_features=tf.stack(X_train_features, axis=0)
X_test_features=tf.stack(X_test_features, axis=0)

#POS
X_train_pos, X_test_pos, y_train_pos, y_test_pos= train_test_split(pol_pos_data,pol_pos_labels, 
    test_size=0.2, random_state= 42)
X_train_pos=tf.stack(X_train_pos, axis=0)
X_test_pos=tf.stack(X_test_pos, axis=0)

#Sentiment
X_train_sent, X_test_sent, y_train_sent, y_test_sent= train_test_split(pol_sentiments, pol_labels, 
    test_size=0.2, random_state= 42)
X_train_sent=tf.stack(X_train_sent, axis=0)
X_test_sent=tf.stack(X_test_sent, axis=0)

In [27]:
#Merged
import numpy as np
model_all_merged.fit([X_train_embedding,X_train_features,X_train_sent,X_train_pos],np.array(y_train_embedding),
          batch_size=32,
          epochs=5,
          validation_data=([X_test_embedding,X_test_features,X_test_sent,X_test_pos],np.array(y_test_features)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe61c8b26d0>

In [28]:
for features, labels, name in zip([[edu_embedding,edu_features,edu_sentiments,edu_pos_data],[fin_embedding,fin_features,fin_sentiments,fin_pos_data],[law_embedding,law_features,law_sentiments,law_pos_data],[medicine_embedding,medicine_features,med_sentiments,med_pos_data],[mil_embedding,mil_features,mil_sentiments,mil_pos_data]],[edu_labels,fin_labels,law_labels,medicine_labels,mil_labels],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    score, acc , f1= model_all_merged.evaluate(features, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

nyt_edu
Test score: 0.39559465646743774
Test accuracy: 0.8166939616203308
F1 score: 0.8807132840156555
nyt_fin
Test score: 0.3657127320766449
Test accuracy: 0.8497223258018494
F1 score: 0.9121111035346985
nyt_law
Test score: 0.45738667249679565
Test accuracy: 0.7940340638160706
F1 score: 0.8695864081382751
nyt_med
Test score: 0.41425877809524536
Test accuracy: 0.820482611656189
F1 score: 0.8841903209686279
nyt_mil
Test score: 0.37382471561431885
Test accuracy: 0.8551822304725647
F1 score: 0.913189709186554


#  6 labels

In [29]:
#The architecture of  BERT+Arg.Feature+VADER+POS merged
input_emb = Input(shape=(768,))
dense_1 = Dense(128, activation='relu', activity_regularizer=l2(0.0001))(input_emb)
dropout_1 = Dropout(0.5)(dense_1)

input_arg = Input(shape=(100,))
model_arg = Embedding(6, 128)(input_arg)
model_arg = SimpleRNN(128, dropout=0.2)(model_arg)

input_sentiment = Input(shape=(100,))
model_sentiment = Embedding(3, 128)(input_sentiment)
model_sentiment = SimpleRNN(128, dropout=0.2)(model_sentiment)

input_pos= Input(pol_pos_data_6[0].shape)
model_pos= SimpleRNN(128, dropout=0.1)(input_pos)


merged = Concatenate()([dropout_1, model_arg,model_sentiment,model_pos])
dense_pred = (Dense(1, activation='sigmoid'))(merged)

model_all_merged_6= Model(inputs=[input_emb, input_arg,input_sentiment,input_pos], outputs=dense_pred)
model_all_merged_6.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model_all_merged_6.summary())

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 768)]        0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 dense_2 (Dense)                (None, 128)          98432       ['input_5[0][0]']                
                                                                                            

In [30]:
#embedding
X_train_6_embedding, X_test_6_embedding, y_train_6_embedding, y_test_6_embedding= train_test_split(pol_embedding_6,pol_labels_6, 
    test_size=0.2, random_state= 42)
X_train_6_embedding=tf.stack(X_train_6_embedding, axis=0)                          
X_test_6_embedding=tf.stack(X_test_6_embedding, axis=0)
#features
X_train_6_features, X_test_6_features, y_train_6_features, y_test_6_features= train_test_split(pol_features_6,pol_labels_6, 
    test_size=0.2, random_state= 42)
X_train_6_features=tf.stack(X_train_6_features, axis=0)
X_test_6_features=tf.stack(X_test_6_features, axis=0)

#POS
X_train_6_pos, X_test_6_pos, y_train_6_pos, y_test_6_pos= train_test_split(pol_pos_data_6,pol_pos_labels_6, 
    test_size=0.2, random_state= 42)
X_train_6_pos=tf.stack(X_train_6_pos, axis=0)
X_test_6_pos=tf.stack(X_test_6_pos, axis=0)

#Sentiment
X_train_6_sent, X_test_6_sent, y_train_6_sent, y_test_6_sent= train_test_split(pol_sentiments_6, pol_labels_6, 
    test_size=0.2, random_state= 42)
X_train_6_sent=tf.stack(X_train_6_sent, axis=0)
X_test_6_sent=tf.stack(X_test_6_sent, axis=0)
#Merged
import numpy as np
model_all_merged_6.fit([X_train_6_embedding,X_train_6_features,X_train_6_sent,X_train_6_pos],np.array(y_train_6_embedding),
          batch_size=32,
          epochs=5,
          validation_data=([X_test_6_embedding,X_test_6_features,X_test_6_sent,X_test_6_pos],np.array(y_test_6_features)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe338273fa0>

In [31]:
print("Test 6 labels merged model")
for features, labels, name in zip([[edu_embedding_6,edu_features_6,edu_sentiments_6,edu_pos_data_6],[fin_embedding_6,fin_features_6,fin_sentiments_6,fin_pos_data_6],[law_embedding_6,law_features_6,law_sentiments_6,law_pos_data_6],[medicine_embedding_6,medicine_features_6,med_sentiments_6,med_pos_data_6],[mil_embedding_6,mil_features_6,mil_sentiments_6,mil_pos_data_6]],[edu_labels_6,fin_labels_6,law_labels_6,medicine_labels_6,mil_labels_6],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    score, acc , f1= model_all_merged_6.evaluate(features, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

Test 6 labels merged model
nyt_edu
Test score: 0.39236950874328613
Test accuracy: 0.8246859908103943
F1 score: 0.8850213885307312
nyt_fin
Test score: 0.36195164918899536
Test accuracy: 0.850277841091156
F1 score: 0.9116807579994202
nyt_law
Test score: 0.45051947236061096
Test accuracy: 0.8011363744735718
F1 score: 0.8729512095451355
nyt_med
Test score: 0.41269487142562866
Test accuracy: 0.8193054795265198
F1 score: 0.8817641139030457
nyt_mil
Test score: 0.35945335030555725
Test accuracy: 0.8632276654243469
F1 score: 0.9173408150672913
