# Importing the data

In [1]:
#Import all needed libraries and packages
import pandas as pd
from keras import backend as K
import numpy as np
from nltk.tokenize import RegexpTokenizer
from tensorflow import keras
import torch
import tensorflow as tf
from sklearn.model_selection import train_test_split
import nltk
from nltk import word_tokenize
from nltk import StanfordTagger
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras import Input, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Reshape, Concatenate, BatchNormalization, TimeDistributed, Lambda, Activation, LSTM, Flatten, Convolution1D, GRU, MaxPooling1D
from keras.layers import Bidirectional, InputLayer, SimpleRNN
from keras.constraints import maxnorm
from keras.regularizers import l2
from keras.callbacks import ModelCheckpoint
from itertools import chain, repeat, islice

In [2]:
#Importing already filtered out datasets from New York Times
nyt_edu_original= list(open("/data/output_txt/nyt-edu.txt"))
print("NYT-EDU original features:",len(nyt_edu_original ))
nyt_fin_original= list(open("/data/output_txt/nyt-fin.txt"))
print("NYT-FIN original features:",len(nyt_fin_original))
nyt_law_original= list(open("/data/output_txt/nyt-law.txt"))
print("NYT-LAW original features:",len(nyt_law_original))
nyt_med_original = list(open("/data/output_txt/nyt-med.txt"))
print("NYT-Med original features:",len(nyt_med_original))
nyt_mil_original= list(open("/data/output_txt/nyt-mil.txt"))
print("NYT-MILL original features:",len(nyt_mil_original))
nyt_pol_original = list(open("/data/output_txt/nyt-pol.txt"))
print("NYT-POL original features:",len(nyt_pol_original))

NYT-EDU original features: 1881
NYT-FIN original features: 3100
NYT-LAW original features: 3553
NYT-Med original features: 1743
NYT-MILL original features: 2132
NYT-POL original features: 6886


# VADER for sentiment analysis.Testing model itself

In [11]:
def f1_score(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [8]:
#All helper  functions needeed for implementing the Vader model
sid = SentimentIntensityAnalyzer()
#Vader gives a dictionary as output we need to transfer it to a integer
def format_output(output_dict):
    if(output_dict['compound']>= 0.05):
        polarity =1 #positive=1
    elif(output_dict['compound']<= -0.05):
        polarity =2 #negative=2
    else:
        polarity =3 #neural=3
    return polarity

def predict_sentiment(text):
    output_dict =  sid.polarity_scores(text)
    return format_output(output_dict)

#As the number of sentences are different we need to pad to 100 sentences at maximun.
def pad_infinite(iterable, padding=None):
    return chain(iterable, repeat(padding))

def pad(iterable, size, padding=None):
    return list(islice(pad_infinite(iterable, padding), size))

#The main function of sentiment extraction.
def sentiment_dataset_extraction(example_dataset):
    main_label_list=[]
    sentiment_labels_list=[]
    for s in example_dataset:
        text=s.split("\t")[1].split("\n")[0]
        split_text = nltk.sent_tokenize(text) # this gives us a list of sentences
        if len(split_text)<=100:
            text_sentiment=[]
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            for sent in split_text:
                sentence_sentiment=predict_sentiment(sent)
                text_sentiment.append(sentence_sentiment)
            padded_sentence=pad(text_sentiment,100,0)
            sentiment_labels_list.append(padded_sentence)
            main_label_list.append(label)
    return sentiment_labels_list,main_label_list

In [16]:
#Extracting the sentiment fromt the dataset
#First variable is list with sentiment tensors and the second variable is a list with labels.
edu_sentiments,edu_labels=sentiment_dataset_extraction(nyt_edu_original)
fin_sentiments,fin_labels=sentiment_dataset_extraction(nyt_fin_original)
law_sentiments,law_labels=sentiment_dataset_extraction(nyt_law_original)
med_sentiments,med_labels=sentiment_dataset_extraction(nyt_med_original)
mil_sentiments,mil_labels=sentiment_dataset_extraction(nyt_mil_original)
pol_sentiments,pol_labels=sentiment_dataset_extraction(nyt_pol_original)

In [32]:
#The architecture of Sentiment analysis model.Binary classification.
input_arg = Input(shape=(100,))
model_arg = Embedding(3, 128)(input_arg)
model_arg = SimpleRNN(128, dropout=0.5)(model_arg)
dense_pred = (Dense(1, activation='sigmoid'))(model_arg)
model_sentiment= Model(inputs=input_arg, outputs=dense_pred)
model_sentiment.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model_sentiment.summary())

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_3 (Embedding)     (None, 100, 128)          384       
                                                                 
 simple_rnn_7 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_5 (Dense)             (None, 1)                 129       
                                                                 
Total params: 33,409
Trainable params: 33,409
Non-trainable params: 0
_________________________________________________________________
None


In [33]:
#Validation data extraction.Traning data: NYT politics dataset.
X_train_sent, X_test_sent, y_train_sent, y_test_sent= train_test_split(pol_sentiments, pol_labels, 
    test_size=0.2, random_state= 42)
#Fit the model to the actual data.
model_sentiment.fit(np.array(X_train_sent),np.array(y_train_sent),
          batch_size=32,
          epochs=5,
          validation_data=(np.array(X_test_sent),np.array(y_test_sent)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5730fc9790>

In [34]:
#Test on NYT education, NYT finances ,NYT law,NYT medicine ,NYT military
for features, labels, name in zip([np.array(edu_sentiments),np.array(fin_sentiments),np.array(law_sentiments),np.array(med_sentiments),np.array(mil_sentiments)],[np.array(edu_labels),np.array(fin_labels),np.array(law_labels),np.array(med_labels),np.array(mil_labels)],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    score, acc,f1= model_sentiment.evaluate(features, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

nyt_edu
Test score: 0.5881454348564148
Test accuracy: 0.7261320352554321
F1 score: 0.8364138603210449
nyt_fin
Test score: 0.5047732591629028
Test accuracy: 0.8235870599746704
F1 score: 0.9012267589569092
nyt_law
Test score: 0.5802714228630066
Test accuracy: 0.7355113625526428
F1 score: 0.8446060419082642
nyt_med
Test score: 0.5796002149581909
Test accuracy: 0.7357268929481506
F1 score: 0.8434920907020569
nyt_mil
Test score: 0.5287598967552185
Test accuracy: 0.7955513596534729
F1 score: 0.8845686912536621


# Stanford POS model

In [3]:
#The English Penn Treebank tagset is used for Stanford POS model,we need to tranfser them to numbers.
tokenizer = RegexpTokenizer(r'\w+')
pos_tags={ "CC":1,"CD":2,"DT":3,"EX":4,"FW":5,
"IN":6,"JJ":7,"JJR":8,"JJS":9,"LS":10,"MD":11,
"NN":12,"NNS":13,"NNP":14,"NNPS":15,"PDT":16,"POS":17,
"PRP":18,"PRP$":19,"RB":20,"RBR":21,"RBS":22,"RP":23,
"SYM":24,"TO":25,"UH":26,"VB":27,"VBD":28,"VBG":29,
"VBN":30,"VBP":31,"VBZ":32,"WDT":33,"WP":34,"WP$":35,"WRB":36}

In [4]:
#How many documents there are that contain more than 100 sentences in one document?
def check_sentence_length(example_sets):
    check_lemmas={}
    for example_set in example_sets:
        for s in example_set:
            pos_list_text=[]
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            text=s.split("\t")[1].split("\n")[0]
            split_text = nltk.sent_tokenize(text) # this gives us a list of sentences
            if len(split_text) not in check_lemmas:
                check_lemmas[len(split_text)]=0
            check_lemmas[len(split_text)]+=1
    return check_lemmas

#Imply the model
sentence_length_dict=check_sentence_length((nyt_edu_original,nyt_fin_original,nyt_law_original,nyt_med_original,nyt_mil_original,nyt_pol_original))
count_more_100=0
general_number_documents=sum(sentence_length_dict.values())
for k,v in sentence_length_dict.items():
    if k>100:
        count_more_100+=v
percentage_less_100=(count_more_100/general_number_documents)*100
print("What is the percentage of documents in all documents that have more than 100 sentences?",percentage_less_100)
#This means that more than 98% of the whole dataset contains of sentences with length less than 100.

What is the percentage of documents in all documents that have more than 100 sentences? 1.2593936252915263


In [5]:
listofzeros = [0] *100
tokenizer = RegexpTokenizer(r'\w+')
def pos_tagger(pos_example_dataset):
    pos_data=[]
    pos_label=[]
    for s in pos_example_dataset:
        pos_list_text=[]
        text=s.split("\t")[1].split("\n")[0]
        split_text = nltk.sent_tokenize(text)
        if len(split_text)<=100:
            for sent in split_text:
                pos_list_sentence=[]
                pos_analyzed=nltk.pos_tag(tokenizer.tokenize(sent))
                for word in pos_analyzed:
                    try:
                        pos_list_sentence.append(pos_tags[word[1]])
                    except:
                        pass
                pos_list_sentence=pad(pos_list_sentence,100,0)
                pos_list_text.append(pos_list_sentence)
            label=s.split("\t")[0]
            if label=='editorial':
                label=0
            elif label=='news':
                label=1
            if len(pos_list_text)<100:
                pos_list_text=pad(pos_list_text,100,listofzeros)
            pos_tensor=tf.convert_to_tensor(pos_list_text)
            pos_data.append(pos_tensor)
            pos_label.append(label)
    return pos_data,pos_label

In [9]:
edu_pos_data,edu_pos_labels=pos_tagger(nyt_edu_original)
fin_pos_data,fin_pos_labels=pos_tagger(nyt_fin_original)
law_pos_data,law_pos_labels=pos_tagger(nyt_law_original)
med_pos_data,med_pos_labels=pos_tagger(nyt_med_original)
mil_pos_data,mil_pos_labels=pos_tagger(nyt_mil_original)
pol_pos_data,pol_pos_labels=pos_tagger(nyt_pol_original)

2023-03-28 19:17:58.891155: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-28 19:17:59.816127: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10413 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:83:00.0, compute capability: 6.1


In [12]:
#Model of POS.
input_arg = Input(shape=pol_pos_data[0].shape)
model_arg= SimpleRNN(128, dropout=0.1)(input_arg)
dense_pred = (Dense(1, activation='sigmoid'))(model_arg)
model_pos= Model(inputs=input_arg, outputs=dense_pred)
opt = keras.optimizers.Adam(learning_rate=0.01)
model_pos.compile(loss='binary_crossentropy', optimizer=opt , metrics=['accuracy',f1_score])
print(model_pos.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100, 100)]        0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               29312     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 29,441
Trainable params: 29,441
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
X_train_pos, X_test_pos, y_train_pos, y_test_pos= train_test_split(pol_pos_data,pol_pos_labels, 
    test_size=0.2, random_state= 42)
model_pos.fit(np.array(X_train_pos),np.array(y_train_pos),
          batch_size=32,
          epochs=5,
          validation_data=(np.array(X_test_pos),np.array(y_test_pos)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f57f00f1700>

In [15]:
for features, labels, name in zip([np.array(edu_pos_data),np.array(fin_pos_data),np.array(law_pos_data),np.array(med_pos_data),np.array(mil_pos_data)],[np.array(edu_pos_labels),np.array(fin_pos_labels),np.array(law_pos_labels),np.array(med_pos_labels),np.array(mil_pos_labels)],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    score, acc,f1= model_pos.evaluate(features, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

nyt_edu
Test score: 0.6248546838760376
Test accuracy: 0.7261320352554321
F1 score: 0.8364138603210449
nyt_fin
Test score: 0.4789609909057617
Test accuracy: 0.8235870599746704
F1 score: 0.9012267589569092
nyt_law
Test score: 0.603351891040802
Test accuracy: 0.7355113625526428
F1 score: 0.8446060419082642
nyt_med
Test score: 0.6050265431404114
Test accuracy: 0.7357268929481506
F1 score: 0.8434920907020569
nyt_mil
Test score: 0.5162953734397888
Test accuracy: 0.7955513596534729
F1 score: 0.8845686912536621


# VADER+POS merged

In [23]:
#The architecture of  VADER+POS merged
input_sentiment = Input(shape=(100,))
model_sentiment = Embedding(3, 128)(input_sentiment)
model_sentiment = SimpleRNN(128, dropout=0.2)(model_sentiment)

input_pos= Input(pol_pos_data[0].shape)
model_pos= SimpleRNN(128, dropout=0.1)(input_pos)

merged = Concatenate()([model_sentiment, model_pos])
dense_pred = (Dense(1, activation='sigmoid'))(merged)

model_merged= Model(inputs=[input_sentiment,input_pos], outputs=dense_pred)
model_merged.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model_merged.summary())

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 100, 128)     384         ['input_6[0][0]']                
                                                                                                  
 input_7 (InputLayer)           [(None, 100, 100)]   0           []                               
                                                                                                  
 simple_rnn_5 (SimpleRNN)       (None, 128)          32896       ['embedding_2[0][0]']            
                                                                                            

In [26]:
#Need to transfer the list to a tensort so we can combine with POS data.
X_train_sent=tf.stack(X_train_sent, axis=0)
X_test_sent=tf.stack(X_test_sent, axis=0)
X_train_pos=tf.stack(X_train_pos, axis=0)
X_test_pos=tf.stack(X_test_pos, axis=0)

In [27]:
#education tranform into a tensor
edu_sentiments=tf.stack(edu_sentiments, axis=0)
edu_labels=np.array(edu_labels)
#finances tranform into a tensor
fin_sentiments=tf.stack(fin_sentiments, axis=0)
fin_labels=np.array(fin_labels)
#law tranform into a tensor
law_sentiments=tf.stack(law_sentiments, axis=0)
law_labels=np.array(law_labels)
#medicine tranform into a tensor
med_sentiments=tf.stack(med_sentiments, axis=0)
med_labels=np.array(med_labels)
#military tranform into a tensor
mil_sentiments=tf.stack(mil_sentiments, axis=0)
mil_labels=np.array(mil_labels)

In [28]:
#Test
model_merged.fit([X_train_sent,X_train_pos],np.array(y_train_sent),
          batch_size=32,
          epochs=5,
          validation_data=([X_test_sent, X_test_pos ],np.array(y_test_sent)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f574c23e370>

In [29]:
for features, labels, name in zip([[edu_sentiments,np.array(edu_pos_data)],[fin_sentiments,np.array(fin_pos_data)],[law_sentiments,np.array(law_pos_data)],[med_sentiments,np.array(med_pos_data)],[mil_sentiments,np.array(mil_pos_data)]],[edu_labels,fin_labels,law_labels,med_labels,mil_labels],['nyt_edu', 'nyt_fin', 'nyt_law', 'nyt_med',"nyt_mil"]):
    print(name)
    score, acc,f1= model_merged.evaluate(features, labels, batch_size=32)
    print('Test score:', score)
    print('Test accuracy:', acc)
    print('F1 score:', f1)

nyt_edu
Test score: 0.6002344489097595
Test accuracy: 0.7261320352554321
F1 score: 0.8364138603210449
nyt_fin
Test score: 0.46895769238471985
Test accuracy: 0.8235870599746704
F1 score: 0.9012267589569092
nyt_law
Test score: 0.587472677230835
Test accuracy: 0.7355113625526428
F1 score: 0.8446060419082642
nyt_med
Test score: 0.5870500802993774
Test accuracy: 0.7357268929481506
F1 score: 0.8434920907020569
nyt_mil
Test score: 0.5065819621086121
Test accuracy: 0.7955513596534729
F1 score: 0.8845686912536621
