In [1]:
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from keras.utils.np_utils import to_categorical
import _pickle as cPickle 
import pandas as pd

from keras.models import load_model
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Load the two training tables from CSV (paths are relative to the working directory).
# `bodies` supplies the 'Body ID' and 'articleBody' columns and `stances` the
# 'Body ID', 'Headline' and 'Stance' columns that the following cells read.
bodies = pd.read_csv('data/train_bodies.csv')
stances = pd.read_csv('data/train_stances.csv')

In [3]:
# Associate titles with bodies by ID.
# Build the id -> article text lookup once instead of re-scanning `bodies`
# with a boolean mask for every stance row. drop_duplicates keeps the first
# row per 'Body ID', matching the `.values[0]` choice of a per-row scan.
body_by_id = (
    bodies.drop_duplicates(subset='Body ID')
          .set_index('Body ID')['articleBody']
          .to_dict()
)
# A 'Body ID' that is absent from `bodies` raises KeyError here rather than
# silently producing a missing value.
bd = [body_by_id[body_id] for body_id in stances['Body ID']]

In [4]:
# Attach the looked-up article bodies positionally (via .values, so index
# labels play no part) and keep the four working columns in a fixed order.
se = pd.Series(bd)
stances = stances.assign(articleBody=se.values)[
    ['Body ID', 'Headline', 'articleBody', 'Stance']
]

In [5]:
# Display ten randomly drawn rows of the joined table as a quick visual check.
# No random_state is passed, so a different sample is shown on every run.
stances.sample(10)

Unnamed: 0,Body ID,Headline,articleBody,Stance
32252,520,Continuing Violence Puts Boko Haram Ceasefire ...,The Islamist militant group Boko Haram has agr...,discuss
14055,1819,Bali spider burrows under Aussie’s chest,A holidaymaker was left horrified after discov...,agree
45207,1519,KC hospital: Patient does not have symptom pro...,"First, there was #BendGate. Reports flooded th...",unrelated
29305,1660,11 jetliners 'missing' after Islamist takeover...,"Years ago, I had a job in China where I evalua...",unrelated
46505,527,IRAQI AND KURDISH MEDIA REPORTS: ISIS FIGHTERS...,A hallucinogenic fungi has been found growing ...,unrelated
47963,1122,"No, there aren’t any Ebola cases in Iraq","Forget targeted US airstrikes, ISIS faces a ne...",discuss
16553,139,Isis militants claim to have killed US journal...,Judd Nelson rebuffs Internet rumors that he di...,unrelated
22944,2463,Ebola Outbreak 2015 Update: ISIS Fighters May ...,"ABUJA, Nigeria — The leader of Nigeria's Islam...",unrelated
12808,2372,Low-level marijuana possession could soon land...,Mayor de Blasio and NYPD Commissioner Bill Bra...,discuss
28897,2067,BREAKING NEWS: ISIS beheads missing American j...,President Barack Obama denounced Islamic State...,agree


In [6]:
# Create a dataframe holding the texts and labels: the article body is the
# model input ('text') and the stance is the target ('label').
trainDF = pandas.DataFrame({
    'text': stances['articleBody'],
    'label': stances['Stance'],
})

In [7]:
# Split the dataset into training and validation datasets.
# Row 27670 is excluded before splitting (NOTE(review): the reason is not
# recorded in this notebook -- confirm and document it). The frame is dropped
# once and reused so text, labels and stratification all see the same rows.
split_df = trainDF.drop([27670])
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    split_df['text'],
    split_df['label'],
    stratify=split_df['label'],
    shuffle=True,
    random_state=1410,
)

# Label encode the target variable. The encoder is fitted on the training
# labels only; the held-out labels are mapped with the same fitted encoder so
# both splits are guaranteed to share one label -> integer mapping.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)

In [8]:
# !wget -O ./data/embedding https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip # download embedding
# !unzip -o ./data/embedding -d ./data/  #Unzip embedding
# !rm ./data/embedding #Remove zip file

In [9]:
#https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip
#Embedding the words
def load_vectors(fname, nb):
    """Load at most ``nb`` word vectors from a text-format vector file.

    The first line of the file is a header made of two integers (presumably
    the vector count and dimensionality; they are parsed but not used). Every
    following line holds a token followed by its space-separated components.

    Args:
        fname: Path of the vector file to read.
        nb: Maximum number of vector lines to read, counted from the top of
            the file (after the header).

    Returns:
        dict mapping each token to a float32 numpy array of its components.
        If a token occurs more than once, the last occurrence read wins.

    Raises:
        ValueError: if the header line is not exactly two integers.
    """
    data = {}
    # The context manager guarantees the handle is closed even when parsing fails.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Consume (and validate) the header so the loop only sees vector lines.
        _n_words, _dim = map(int, fin.readline().split())
        for count, line in enumerate(fin):
            # Stop before reading line nb+1 so that exactly `nb` lines are loaded.
            if count >= nb:
                break
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = numpy.asarray(tokens[1:], dtype='float32')
    return data

# Cap the embedding table at 150000 vectors taken from the top of the file
# (described by the original author as the most frequent words).
embeddings_index = load_vectors('data/wiki-news-300d-1M.vec', 150000)

# Build the vocabulary from every article text in trainDF.
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# Encode each split as integer token ids, then bring every sequence to a
# fixed length of 70 so the model receives equal-length vectors.
train_token_ids = token.texts_to_sequences(train_x)
test_token_ids = token.texts_to_sequences(test_x)
train_seq_x = sequence.pad_sequences(train_token_ids, maxlen=70)
test_seq_x = sequence.pad_sequences(test_token_ids, maxlen=70)

# Token -> embedding mapping: row i holds the 300-d vector of the word whose
# tokenizer index is i; words without a loaded vector keep an all-zero row.
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

In [None]:
#Text classification models

In [10]:
# Surface-level text statistics, one new column per feature.
article_text = trainDF['text']

# Length in characters and in whitespace-separated words.
trainDF['char_count'] = article_text.map(len)
trainDF['word_count'] = article_text.map(lambda x: len(x.split()))
# The +1 in the denominator avoids dividing by zero for texts with no words.
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count'] + 1)

# Number of characters that appear in string.punctuation.
trainDF['punctuation_count'] = article_text.map(
    lambda x: sum(1 for ch in x if ch in string.punctuation)
)
# Number of words in title case / in all upper case.
trainDF['title_word_count'] = article_text.map(
    lambda x: sum(1 for wrd in x.split() if wrd.istitle())
)
trainDF['upper_case_word_count'] = article_text.map(
    lambda x: sum(1 for wrd in x.split() if wrd.isupper())
)

In [None]:
# Part-of-speech tag groups: each family name maps to the tagger labels that
# are counted as belonging to that family by check_pos_tag.
pos_family = dict([
    ('noun', ['NN', 'NNS', 'NNP', 'NNPS']),
    ('pron', ['PRP', 'PRP$', 'WP', 'WP$']),
    ('verb', ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']),
    ('adj', ['JJ', 'JJR', 'JJS']),
    ('adv', ['RB', 'RBR', 'RBS', 'WRB']),
])

# Function to count the words of a given sentence whose part-of-speech tag
# belongs to the requested family.
def check_pos_tag(x, flag):
    """Count the words in ``x`` tagged with a part of speech from family ``flag``.

    Args:
        x: Text to tag with textblob.
        flag: Key of ``pos_family`` selecting which tags are counted.

    Returns:
        int: number of matching words. Tagging is best-effort: if tagging (or
        the family lookup) raises an ordinary exception, the count reached so
        far is returned, which is 0 when nothing was counted yet.
    """
    cnt = 0
    try:
        family_tags = pos_family[flag]
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            # Each entry is a (word, tag) pair; only the tag is needed.
            if tup[1] in family_tags:
                cnt += 1
    except Exception:
        # Deliberately tolerant of per-text failures, but unlike a bare
        # `except:` this lets KeyboardInterrupt/SystemExit propagate so the
        # long-running tagging cell can still be interrupted.
        pass
    return cnt

# One count column per part-of-speech family, created in this order.
for column, family in (
    ('noun_count', 'noun'),
    ('verb_count', 'verb'),
    ('adj_count', 'adj'),
    ('adv_count', 'adv'),
    ('pron_count', 'pron'),
):
    # Bind `family` as a default argument so each lambda keeps its own value.
    trainDF[column] = trainDF['text'].apply(lambda x, family=family: check_pos_tag(x, family))

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: Training features passed to ``fit``.
        label: Training labels passed to ``fit``.
        feature_vector_valid: Validation features passed to ``predict``.
        is_neural_net: When True, ``predict`` is taken to return one score per
            class and the predicted label is the argmax over the last axis.
        valid_label: True labels for ``feature_vector_valid``. Defaults to the
            notebook-level ``test_y`` produced by the train/test split cell.

    Returns:
        Accuracy of the predictions against ``valid_label``.
    """
    # Resolve the reference labels before training so that a missing global
    # fails immediately instead of after an expensive fit.
    if valid_label is None:
        valid_label = test_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score expects (y_true, y_pred)
    return metrics.accuracy_score(valid_label, predictions)

In [None]:
def create_rnn_lstm(num_classes=None):
    """Build and compile an LSTM classifier over the frozen word embeddings.

    Args:
        num_classes: Number of output classes. Defaults to the number of
            classes known to the notebook-level label ``encoder``.

    Returns:
        A compiled Keras model taking length-70 token-id sequences and
        producing one softmax probability per class. It is compiled with
        sparse categorical cross-entropy, so it trains directly on the integer
        labels produced by the label encoder.
    """
    if num_classes is None:
        num_classes = len(encoder.classes_)

    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer (pre-trained weights, not updated in training)
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    # One unit per class: a single sigmoid unit cannot represent more than two
    # stances, and argmax over a single column would always yield class 0.
    output_layer2 = layers.Dense(num_classes, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy')

    return model

# Train the LSTM model and report accuracy on the held-out sequences.
# `test_seq_x` is the padded held-out split built in the tokenizer cell.
classifier = create_rnn_lstm()
accuracy = train_model(classifier, train_seq_x, train_y, test_seq_x, is_neural_net=True)
print("RNN-LSTM, Word Embeddings", accuracy)

In [None]:
def create_rnn_gru(num_classes=None):
    """Build and compile a GRU classifier over the frozen word embeddings.

    Args:
        num_classes: Number of output classes. Defaults to the number of
            classes known to the notebook-level label ``encoder``.

    Returns:
        A compiled Keras model taking length-70 token-id sequences and
        producing one softmax probability per class. It is compiled with
        sparse categorical cross-entropy, so it trains directly on the integer
        labels produced by the label encoder.
    """
    if num_classes is None:
        num_classes = len(encoder.classes_)

    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer (pre-trained weights, not updated in training)
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the GRU Layer
    gru_layer = layers.GRU(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(gru_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    # One unit per class: a single sigmoid unit cannot represent more than two
    # stances, and argmax over a single column would always yield class 0.
    output_layer2 = layers.Dense(num_classes, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy')

    return model

# Train the GRU model and report accuracy on the held-out sequences.
# `test_seq_x` is the padded held-out split built in the tokenizer cell.
classifier = create_rnn_gru()
accuracy = train_model(classifier, train_seq_x, train_y, test_seq_x, is_neural_net=True)
print("RNN-GRU, Word Embeddings", accuracy)