In [42]:
from collections import Counter
import numpy as np
from csv import DictReader
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential, Model, load_model
from keras.layers import concatenate, Embedding, Dense, Dropout, Lambda, Activation, LSTM, Flatten, Input, RepeatVector, TimeDistributed, Bidirectional
from keras.optimizers import Adam, RMSprop
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, TensorBoard
import keras.backend as K
import codecs
import pickle

In [108]:
# Stance label -> class index, in the fixed order used by every mapping below.
label_ref = dict(zip(('agree', 'disagree', 'discuss', 'unrelated'), range(4)))
# Stance label -> one-hot list with a 1 at that label's class index.
label_vec = {
    name: [int(pos == idx) for pos in range(len(label_ref))]
    for name, idx in label_ref.items()
}
# Class index -> stance label (inverse of label_ref).
label_ref_rev = {idx: name for name, idx in label_ref.items()}
# Words ignored when building features: load_train passes this list as
# `stop_words` to both its CountVectorizer and its TfidfVectorizer.
stop_words = [
        "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along",
        "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
        "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
        "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "co",
        "con", "could", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight",
        "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
        "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for",
        "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had",
        "has", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
        "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest",
        "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made",
        "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much",
        "must", "my", "myself", "name", "namely", "neither", "nevertheless", "next", "nine", "nobody", "now", "nowhere",
        "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours",
        "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see",
        "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some",
        "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take",
        "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though",
        "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve",
        "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what",
        "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon",
        "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will",
        "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
        ]

In [2]:
# Notebook-wide configuration constants.
# NOTE(review): MAX_LEN_HEAD, MAX_LEN_BODY and EMBEDDING_DIM are not referenced
# by any cell visible here — presumably left over from a sequence/embedding
# model; confirm before relying on them.
MAX_LEN_HEAD = 100
MAX_LEN_BODY = 500
# Default for load_train's `lim_unigram`, i.e. the max_features of its vectorizers.
VOCAB_SIZE = 15000
EMBEDDING_DIM = 300

In [109]:
def load_train(file_instances, file_bodies, lim_unigram=VOCAB_SIZE):
    """
    Build the bag-of-words training matrix from a stances CSV and a bodies CSV.

    Every row of the stances file is paired with its article body and turned
    into one feature vector: the headline's term-frequency vector, the body's
    term-frequency vector and the cosine similarity between their TF-IDF
    vectors, concatenated in that order.

    Parameters
    ----------
    file_instances : str
        Path to a CSV with 'Headline', 'Body ID' and 'Stance' columns.
    file_bodies : str
        Path to a CSV with 'Body ID' and 'articleBody' columns.
    lim_unigram : int
        ``max_features`` given to both the count and the TF-IDF vectorizer.

    Returns
    -------
    tuple
        ``(train_set, train_stances, bow_vectorizer, tfreq_vectorizer,
        tfidf_vectorizer)``: the feature rows and the ``label_vec`` one-hot
        labels as numpy arrays, followed by the three fitted vectorizers.

    Raises
    ------
    KeyError
        If a stance row references a 'Body ID' that is missing from
        ``file_bodies`` or has a 'Stance' that is not a key of ``label_vec``.
    ValueError
        If a 'Body ID' cannot be converted to ``int``.
    """
    # newline='' hands line-ending handling to the csv module, which it needs
    # in order to read quoted fields containing line breaks correctly.
    with open(file_instances, "r", encoding='utf-8', newline='') as table:
        instance_lst = list(DictReader(table))
    with open(file_bodies, "r", encoding='utf-8', newline='') as table:
        body_lst = list(DictReader(table))

    # Body IDs arrive as strings; use ints on both sides so lookups match.
    for instance in instance_lst:
        instance['Body ID'] = int(instance['Body ID'])
    bodyData = {int(body['Body ID']): body['articleBody'] for body in body_lst}

    # Unique headlines and bodies, kept in first-seen order
    heads = []
    bodies = []
    body_ids = []
    seen_heads = set()
    seen_body_ids = set()
    for instance in instance_lst:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in seen_heads:
            seen_heads.add(head)
            heads.append(head)
        if body_id not in seen_body_ids:
            bodies.append(bodyData[body_id])
            seen_body_ids.add(body_id)
            body_ids.append(body_id)

    # Row index of each headline (str key) / body (int key) in the TF matrix,
    # which is built from heads + bodies in exactly this order.
    id_ref = {elem: i for i, elem in enumerate(heads + body_ids)}

    # Term-frequency features: raw counts, then normalised without IDF
    bow_vectorizer = CountVectorizer(max_features=lim_unigram, stop_words=stop_words)
    bow = bow_vectorizer.fit_transform(heads + bodies)

    tfreq_vectorizer = TfidfTransformer(use_idf=False).fit(bow)
    tfreq = tfreq_vectorizer.transform(bow).toarray()

    # TF-IDF is used only for the cosine-similarity feature; it is fitted on
    # the same training headlines and bodies as the count vectorizer.
    tfidf_vectorizer = TfidfVectorizer(max_features=lim_unigram, stop_words=stop_words).\
        fit(heads + bodies)

    # Headlines, bodies and (headline, body) pairs repeat across rows, so each
    # TF-IDF vector and each similarity is computed once and then reused.
    head_tfidf_track = {}
    body_tfidf_track = {}
    cos_track = {}
    train_set = []
    train_stances = []

    for instance in instance_lst:
        head = instance['Headline']
        body_id = instance['Body ID']
        head_tf = tfreq[id_ref[head]].reshape(1, -1)
        body_tf = tfreq[id_ref[body_id]].reshape(1, -1)

        if head not in head_tfidf_track:
            head_tfidf_track[head] = tfidf_vectorizer.transform([head]).toarray()
        if body_id not in body_tfidf_track:
            body_tfidf_track[body_id] = tfidf_vectorizer.transform([bodyData[body_id]]).toarray()

        pair = (head, body_id)
        if pair not in cos_track:
            cos_track[pair] = cosine_similarity(
                head_tfidf_track[head], body_tfidf_track[body_id])[0].reshape(1, 1)

        # [headline TF | body TF | cosine] flattened to one 1-D row
        feat_vec = np.squeeze(np.c_[head_tf, body_tf, cos_track[pair]])
        train_set.append(feat_vec)
        train_stances.append(label_vec[instance['Stance']])

    return np.array(train_set), np.array(train_stances), bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer

In [98]:
def load_test_data(list_headlines, body, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer):
    """
    Featurise each headline in ``list_headlines`` against a single article ``body``.

    For every headline the result holds one 1-D array: the headline's
    term-frequency vector, the body's term-frequency vector and the cosine
    similarity of their TF-IDF vectors, concatenated in that order.

    Parameters
    ----------
    list_headlines : iterable of str
        Headlines to pair with ``body``.
    body : str
        Article text shared by all headlines.
    bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer
        Already-fitted objects exposing ``transform`` (as returned by load_train).

    Returns
    -------
    list
        One feature array per headline, in input order.
    """
    def _tf_row(text):
        # counts -> normalised term frequencies, as a (1, n_features) row
        counts = bow_vectorizer.transform([text]).toarray()
        return tfreq_vectorizer.transform(counts).toarray()[0].reshape(1, -1)

    def _tfidf_row(text):
        return tfidf_vectorizer.transform([text]).toarray().reshape(1, -1)

    # The body is the same for every headline, so vectorise it once up front.
    body_tf = _tf_row(body)
    body_tfidf = _tfidf_row(body)

    features = []
    for headline in list_headlines:
        head_tf = _tf_row(headline)
        head_tfidf = _tfidf_row(headline)
        similarity = cosine_similarity(head_tfidf, body_tfidf)[0].reshape(1, 1)
        features.append(np.squeeze(np.c_[head_tf, body_tf, similarity]))
    return features
    

In [126]:
# Hand-written demo input: a list of headlines and one body string, in the
# shape load_test_data expects for its first two arguments.
fake_head = ["Trump is a good president"]
fake_body = 'Trump is fake'


In [127]:
try_set = load_test_data(fake_head, fake_body, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer)

In [110]:
train_set, train_stances, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer = load_train("train_stances.csv", "train_bodies.csv")

In [116]:
# Persist the three fitted vectorizers so features can be rebuilt without
# refitting. Each file is opened in a `with` block so the handle is flushed
# and closed even if pickling fails.
# NOTE(review): these go to 'models/BOW_NLP/' while the model checkpoints use
# './models/BOW_MLP/' — confirm the differing directory name is intended.
with open('models/BOW_NLP/bow_vectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(bow_vectorizer, pkl_file)
with open('models/BOW_NLP/tfreq_vectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(tfreq_vectorizer, pkl_file)
with open('models/BOW_NLP/tfidf_vectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(tfidf_vectorizer, pkl_file)

In [112]:
# Hold out the last 1000 rows (features and labels alike) for validation and
# keep the rest for training.
# Caution: train_set / train_stances are rebound here, so running this cell a
# second time carves another 1000 rows off the training data.
train_set, val_set = train_set[:-1000], train_set[-1000:]
train_stances, val_stances = train_stances[:-1000, :], train_stances[-1000:, :]

In [68]:
# Scratch sample: copies of the first five training rows and their labels.
# NOTE(review): this rebinds try_set, which the load_test_data demo cell
# earlier in the notebook also assigns; which value later cells see depends on
# the order the cells were run in.
try_set = np.array(train_set[:5])
try_stances = np.array(train_stances[:5])

In [114]:
# Two-hidden-layer MLP over the bag-of-words features, ending in a 4-way
# softmax (one unit per stance label).
# The input width is read from the feature matrix rather than hard-coded:
# load_train emits headline TF + body TF + one cosine value per row, so the
# width follows the fitted vocabulary size and a literal 30001 only matches
# when exactly 15000 terms were kept.
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=train_set.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_37 (Dense)             (None, 100)               3000200   
_________________________________________________________________
dropout_25 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_26 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 4)                 404       
Total params: 3,010,704
Trainable params: 3,010,704
Non-trainable params: 0
_________________________________________________________________


In [70]:
# Display the sample labels.
# NOTE(review): the recorded output below is a 1-D integer array, whereas
# load_train as defined in this notebook returns one-hot rows — this output
# appears to come from an earlier kernel state.
try_stances

array([3, 0, 3, 3, 1])

In [72]:
# One-hot encode the sample labels by reshaping them into a single column.
# NOTE(review): this only yields one row per sample if try_stances is a 1-D
# array of class indices at this point; on already one-hot rows the reshape
# would flatten every 0/1 entry into its own sample — confirm the input.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# verify against the installed version.
try_stances = OneHotEncoder(sparse=False).fit_transform(try_stances.reshape(-1, 1))

In [71]:
# Scratch edit of the third sample label.
# NOTE(review): the execution count ([71]) is lower than that of the encoding
# cell above ([72]), so this appears to have run while try_stances was still a
# 1-D integer array (setting label index 2). Run in file order on a 2-D array
# it would overwrite the whole third row with 2s — confirm it is still wanted.
try_stances[2] = 2

In [67]:
# Display the sample labels again; the recorded output below shows five
# one-hot rows of width four.
try_stances

array([[ 0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.]])

In [118]:
# Train the MLP, writing a full model file (architecture + weights) after
# every epoch; Keras fills the '{epoch:03d}' placeholder with the epoch number.
wt_dir = "./models/BOW_MLP/"
model_path = ''.join([wt_dir, 'BOW_MLP', '{epoch:03d}', '.h5'])
model_checkpoint = ModelCheckpoint(
    model_path,
    save_best_only=False,
    save_weights_only=False,
)
model.fit(
    train_set,
    train_stances,
    epochs=30,
    batch_size=128,
    validation_data=(val_set, val_stances),
    callbacks=[model_checkpoint],
)

Train on 48972 samples, validate on 1000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
 1664/48972 [>.............................] - ETA: 29s - loss: 0.3128 - acc: 0.8882

KeyboardInterrupt: 

In [92]:
# Save the final model next to the per-epoch checkpoints. The previous
# concatenation lacked a path separator and produced
# './models/BOW_MLPBOW_MLP.h5' instead of a file inside './models/BOW_MLP/'.
model.save("./models/BOW_MLP/BOW_MLP.h5")

In [129]:
# Rebuild the demo features here so the prediction does not depend on which
# cell last assigned try_set: it is also rebound to a five-row training slice
# elsewhere in the notebook, while the recorded result is a single prediction
# for the hand-written headline/body pair.
try_set = load_test_data(fake_head, fake_body, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer)
pred = model.predict(np.array(try_set))

In [130]:
# Index of the highest-scoring class for each predicted row (indices follow
# the label_ref / label_ref_rev ordering).
pred.argmax(axis=1)

array([2])