In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, regexp_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import sys
import os
import string
import re

import torch
import torch.nn as nn

from utils import dataset, generate_test_splits


### Using the included data importer

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
train_data = dataset.DataSet(name='train')

Reading dataset
Total stances: 49972
Total bodies: 1683


### Converting imported data into pandas dataframes

In [4]:
train_data_articles_df = pd.DataFrame({'Article': train_data.articles.values(), 'Body ID':train_data.articles.keys()} , index=train_data.articles.keys())
train_data_stances_df = pd.DataFrame(train_data.stances)

# checking out example output
print(train_data_articles_df['Article'][158] + '\n')
print(train_data_stances_df['Headline'][158] + '\n')
print(train_data_stances_df['Stance'][158])

Hundreds of Palestinians were evacuated from their homes Sunday morning after Israeli authorities opened a number of dams near the border, flooding the Gaza Valley in the wake of a recent severe winter storm.

The Gaza Ministry of Interior said in a statement that civil defense services and teams from the Ministry of Public Works had evacuated more than 80 families from both sides of the Gaza Valley (Wadi Gaza) after their homes flooded as water levels reached more than three meters.

Gaza has experienced flooding in recent days amid a major storm that saw temperatures drop and frigid rain pour down.

The storm displaced dozens and caused hardship for tens of thousands, including many of the approximately 110,000 Palestinians left homeless by Israel's assault over summer.

The suffering is compounded by the fact that Israel has maintained a complete siege over Gaza for the last eight years, severely limiting electricity and the availability of fuel for generators. It has also prevented

### Preprocessing: Tokenize and remove stopwords. 

Can do more, obvious ones: remove news agency header, remove website URL, remove twitter usernames, stemming, lemmatizating

In [5]:
tokenizer = RegexpTokenizer(r'\b[^\d\W]+\b') # regex can be improved but idk
train_data_articles_df['article_cleaned'] = train_data_articles_df['Article'].apply(lambda x: tokenizer.tokenize(x.lower()))
train_data_stances_df['headline_cleaned'] = train_data_stances_df['Headline'].apply(lambda x: tokenizer.tokenize(x.lower()))

# remove_whitespace = r'\s+'
# train_data_articles_df['article_cleaned'] = train_data_articles_df['Article'].apply(lambda x: re.split(remove_whitespace, x))
# train_data_stances_df['headline_cleaned'] = train_data_stances_df['Headline'].apply(lambda x: re.split(remove_whitespace, x))

# exclude = r'[^/d/W]+'
# train_data_articles_df['article_cleaned'] = train_data_articles_df['article_cleaned'].apply(lambda x: re.findall(exclude, x))
# train_data_stances_df['headline_cleaned'] = train_data_stances_df['headline_cleaned'].apply(lambda x: re.findall(exclude, x))




In [6]:
print(train_data_articles_df['Article'][154])
print(train_data_articles_df['article_cleaned'][154])
print(train_data_stances_df['headline_cleaned'][154])
print(train_data_stances_df['Stance'][154])

Thousands of people have been duped by a fake news story claiming that Nasa has forecast a total blackout of earth for six days in December.

The story, entitled "Nasa Confirms Earth Will Experience 6 Days of Total Darkness in December 2014!" originated from Huzlers.com, a website well known for publishing fake stories with sensational headlines.

The bogus report read: "Nasa has confirmed that the Earth will experience 6 days of almost complete darkness and will happen from the dates Tuesday the 16 – Monday the 22 in December. The world will remain, during these three days, without sunlight due to a solar storm, which will cause dust and space debris to become plentiful and thus, block 90% sunlight.

"The head of Nasa Charles Bolden who made the announcement and asked everyone to remain calm. This will be the product of a solar storm, the largest in the last 250 years for a period of 216 hours total.

"Despite the six days of darkness soon to come, officials say that the earth will no

In [7]:
website_words = ['http', 'twitter', 'com', 'pic', 'co']
month_words = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'novemeber', 'december']

my_banned_words = stopwords.words('english') + website_words + month_words


In [8]:
def remove_stopwords(string):
    output = []
    for word in string:
        if word not in my_banned_words:
            output.append(word)
    return output

train_data_articles_df['article_cleaned'] =  train_data_articles_df['article_cleaned'].apply(lambda x: remove_stopwords(x))
train_data_stances_df['headline_cleaned'] =  train_data_stances_df['headline_cleaned'].apply(lambda x: remove_stopwords(x))


In [9]:
print(train_data_articles_df['article_cleaned'][154])


['thousands', 'people', 'duped', 'fake', 'news', 'story', 'claiming', 'nasa', 'forecast', 'total', 'blackout', 'earth', 'six', 'days', 'story', 'entitled', 'nasa', 'confirms', 'earth', 'experience', 'days', 'total', 'darkness', 'originated', 'huzlers', 'website', 'well', 'known', 'publishing', 'fake', 'stories', 'sensational', 'headlines', 'bogus', 'report', 'read', 'nasa', 'confirmed', 'earth', 'experience', 'days', 'almost', 'complete', 'darkness', 'happen', 'dates', 'tuesday', 'monday', 'world', 'remain', 'three', 'days', 'without', 'sunlight', 'due', 'solar', 'storm', 'cause', 'dust', 'space', 'debris', 'become', 'plentiful', 'thus', 'block', 'sunlight', 'head', 'nasa', 'charles', 'bolden', 'made', 'announcement', 'asked', 'everyone', 'remain', 'calm', 'product', 'solar', 'storm', 'largest', 'last', 'years', 'period', 'hours', 'total', 'despite', 'six', 'days', 'darkness', 'soon', 'come', 'officials', 'say', 'earth', 'experience', 'major', 'problems', 'since', 'six', 'days', 'darkn

In [10]:
if not os.path.exists('processed'):
    os.mkdir('processed')

train_data_articles_df.to_csv("processed/processed_train_articles.csv")
train_data_stances_df.to_csv("processed/processed_train_stances.csv")

In [11]:
stance_df = train_data_stances_df[train_data_stances_df['Body ID'] == 712]
print(stance_df.index.values)

[    0  1787  3974  4936  5210  5863  6199  6756  7526  9003 10036 10780
 11687 11864 15746 21620 21712 21928 22100 25006 25492 25616 26260 26398
 27200 29988 33683 37095 38326 41035 42776 43897 44978 45222 45579 46530
 47712 47850 48228]



 ### Strategy: concatenate headline with article, encode as bag of words, tfidf score
 
TODO: more to try: only tf, tf of headline and article + tfidf cosine similarity (ranked 3rd group's strat), process headline and article separately (can then run an algorithm separately on each part, then concat and run fully connected) 

In [12]:
stance_to_number = {
    "agree": 0,
    "disagree": 1,
    "discuss": 2,
    "unrelated": 3
}

        
def concat_data(articles_df, headlines_df, body_ids):
    features = []
    labels = []
    
    for body_id in body_ids:
        article = articles_df[articles_df['Body ID'] == body_id]['article_cleaned'].values[0]
    #     print(article)
        headlines = headlines_df[headlines_df['Body ID'] == body_id]['headline_cleaned'].values
    #     print(headlines)
        stances = headlines_df[headlines_df['Body ID'] == body_id]['Stance'].values
    #     print(stances)
        for headline, stance in zip(headlines, stances):
            features.append(headline+article)
            labels.append(stance_to_number[stance])
    return features, labels

body_ids = train_data_articles_df['Body ID'].copy().values
print(body_ids[0:10])
np.random.seed(42) # set your seed
np.random.shuffle(body_ids) # randomise it here and then do the train/test split later directly. 
print(body_ids[0:10])
features, labels = concat_data(train_data_articles_df, train_data_stances_df, body_ids) # article, headline and stances are retrieved at the same time so splitting later is fine


[ 0  4  5  6  7  8  9 10 11 13]
[2102 1120 2414   75  298  720 1340  931 2448 2032]


In [14]:
print(len(features))

49972


In [None]:
# if not os.path.exists('processed'):
#     os.mkdir('processed')

# features.to_csv("processed/processed_train_articles.csv")
# labels.to_csv("processed/processed_train_stances.csv")
# TODO: save the bag of words word count here

In [15]:
# dummy preprocessor and tokenizer because already did that above. not going to change it up to fit this format
def dummy(doc):
    return doc
vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy)
features_counts = vectorizer.fit_transform(features)


print(features_counts.shape)

(49972, 22328)


In [16]:
# NOW do the train test split. 
# Vectorise and get bag of words for both train + test first so that the features are the same length

train_percentage = 0.8
num_samples = features_counts.shape[0]
split_index = int(train_percentage*num_samples)
train_features_counts = features_counts[:split_index]
test_features_counts = features_counts[split_index:]
train_labels = labels[:split_index]
test_labels = labels[split_index:]

In [17]:
train_tfidf_transformer = TfidfTransformer()
train_features_tfidf = train_tfidf_transformer.fit_transform(train_features_counts)

test_tfidf_transformer = TfidfTransformer()
test_features_tfidf = test_tfidf_transformer.fit_transform(test_features_counts)

print(train_features_tfidf.shape)

(39977, 22328)


In [18]:
model_naive_bayes = MultinomialNB()
model_naive_bayes.fit(train_features_tfidf, train_labels)


MultinomialNB()

In [19]:
model_naive_bayes_predicted = model_naive_bayes.predict(test_features_tfidf)
accuracy = np.mean(model_naive_bayes_predicted == test_labels)
print(accuracy)

0.6499249624812407
