In [2]:
import sys
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier

In [3]:
from csv import DictReader


class DataSet():
    """One split of the stance dataset, loaded fully into memory.

    The constructor reads ``<name>_stances.csv`` and ``<name>_bodies.csv``
    from the directory ``path``.

    Attributes set by the constructor:
        path: directory the CSV files are read from.
        stances: list of row dicts from the stances file, with the
            ``'Body ID'`` value converted to ``int``.
        articles: dict mapping integer body ID to the ``'articleBody'`` text.
    """

    def __init__(self, name="train", path="fnc-1-baseline/fnc-1"):
        self.path = path

        print("Reading dataset")
        # Stances are read before bodies, as in the original load order.
        self.stances = self.read(name + "_stances.csv")
        body_rows = self.read(name + "_bodies.csv")

        # Make the body ID an integer so it can be used as a key into `articles`.
        for s in self.stances:
            s['Body ID'] = int(s['Body ID'])

        # Copy all bodies into a dictionary keyed by integer body ID.
        self.articles = {
            int(row['Body ID']): row['articleBody'] for row in body_rows
        }

        print("Total stances: " + str(len(self.stances)))
        print("Total bodies: " + str(len(self.articles)))

    def read(self, filename):
        """Return every row of ``<self.path>/<filename>`` as a list of dicts.

        The file is opened with ``newline=''`` as the csv module requires;
        without it, line breaks embedded in quoted fields (article bodies)
        are rewritten by universal-newline translation before the csv
        parser sees them.
        """
        with open(self.path + "/" + filename, "r", encoding='utf-8', newline='') as table:
            return list(DictReader(table))


In [4]:
# Load the default split: name="train" read from the default dataset path.
d = DataSet()

Reading dataset
Total stances: 49972
Total bodies: 1683


In [5]:
import os
import re
import nltk
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm


# Module-level WordNet lemmatizer, created once and reused by normalize_word().
_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    """Lemmatize ``w`` with the shared lemmatizer, then lowercase the result."""
    lemma = _wnl.lemmatize(w)
    return lemma.lower()


def get_tokenized_lemmas(s):
    """Tokenize ``s`` with NLTK and return the normalized form of each token."""
    tokens = nltk.word_tokenize(s)
    return list(map(normalize_word, tokens))


def clean(s):
    """Return the ``\\w+`` runs of ``s`` joined by single spaces, lowercased.

    Anything that is not a word character (punctuation, whitespace) acts as
    a separator and is dropped; underscores count as word characters.
    """
    words = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(words)
    # Lowercase after joining, exactly as the matching was done on the raw text.
    return joined.lower()


def remove_stopwords(l):
    """Return the tokens of ``l`` that are not in scikit-learn's English stop-word set.

    Order and duplicates of the remaining tokens are preserved.
    """
    stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
    kept = []
    for token in l:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def gen_or_load_feats(feat_fn, headlines, bodies, feature_file):
    """Return cached features from ``feature_file``, computing them on a miss.

    Args:
        feat_fn: callable ``feat_fn(headlines, bodies)`` producing the features.
        headlines: passed through to ``feat_fn``.
        bodies: passed through to ``feat_fn``.
        feature_file: path of the cache file.

    Returns:
        The array obtained by loading the cache file with ``np.load``.

    ``np.save`` appends ``.npy`` to a file name that lacks it, so the cache
    may live at ``feature_file + ".npy"`` rather than at ``feature_file``.
    Both locations are checked; previously a name without the extension was
    never found again and the final load raised ``FileNotFoundError``.
    """
    requested = os.fspath(feature_file)
    saved_path = requested if requested.endswith(".npy") else requested + ".npy"

    # Prefer the exact name given, then the name np.save would have written.
    for candidate in (requested, saved_path):
        if os.path.isfile(candidate):
            return np.load(candidate)

    feats = feat_fn(headlines, bodies)
    np.save(saved_path, feats)
    # Load back from disk so fresh and cached results have the same form.
    return np.load(saved_path)




def word_overlap_features(headlines, bodies):
    """Compute the lemma-overlap ratio for each headline/body pair.

    For every pair, both texts are cleaned and converted to sets of
    normalized lemmas; the feature is ``|H & B| / |H | B|``.

    Args:
        headlines: iterable of headline strings.
        bodies: iterable of body strings, paired positionally with headlines.

    Returns:
        A list with one single-element list ``[ratio]`` per pair. The ratio
        is ``0.0`` when neither text yields any token (this case previously
        raised ``ZeroDivisionError``).
    """
    X = []
    for headline, body in tqdm(zip(headlines, bodies)):
        headline_lemmas = set(get_tokenized_lemmas(clean(headline)))
        body_lemmas = set(get_tokenized_lemmas(clean(body)))
        union = headline_lemmas | body_lemmas
        if union:
            ratio = len(headline_lemmas & body_lemmas) / float(len(union))
        else:
            # Both texts are empty after cleaning: define the overlap as zero.
            ratio = 0.0
        X.append([ratio])
    return X


def refuting_features(headlines, bodies):
    """Build one 0/1 indicator per refuting word for every headline.

    Each headline is cleaned and lemmatized; the feature vector holds, in
    the order of the word list below, 1 when that word is among the
    headline's lemmas and 0 otherwise. ``bodies`` only determines how many
    pairs are processed; the body text itself is not inspected.

    Returns:
        A list of lists of ints, one inner list per headline/body pair.
    """
    _refuting_words = [
        'fake', 'fraud', 'hoax', 'false',
        'deny', 'denies',
        # 'refute',
        'not', 'despite', 'nope',
        'doubt', 'doubts',
        'bogus', 'debunk', 'pranks', 'retract',
    ]
    X = []
    for headline, _body in tqdm(zip(headlines, bodies)):
        headline_lemmas = get_tokenized_lemmas(clean(headline))
        indicators = [int(word in headline_lemmas) for word in _refuting_words]
        X.append(indicators)
    return X


def polarity_features(headlines, bodies):
    """Compute a two-value polarity feature for every headline/body pair.

    The polarity of a text is the parity (count modulo 2) of its lemmas
    that appear in the refuting-word list below.

    Returns:
        A NumPy array with one row per pair:
        ``[headline_polarity, body_polarity]``.
    """
    _refuting_words = [
        'fake', 'fraud', 'hoax', 'false',
        'deny', 'denies',
        'not', 'despite', 'nope',
        'doubt', 'doubts',
        'bogus', 'debunk', 'pranks', 'retract',
    ]

    def calculate_polarity(text):
        # Parity of the number of refuting lemmas found in `text`.
        hits = 0
        for token in get_tokenized_lemmas(text):
            if token in _refuting_words:
                hits += 1
        return hits % 2

    X = []
    for headline, body in tqdm(zip(headlines, bodies)):
        headline_polarity = calculate_polarity(clean(headline))
        body_polarity = calculate_polarity(clean(body))
        X.append([headline_polarity, body_polarity])
    return np.array(X)


def ngrams(input, n):
    """Return all runs of ``n`` consecutive tokens of ``input``.

    ``input`` is split on single spaces; each n-gram is a list of tokens.
    The result is empty when there are fewer than ``n`` tokens.
    """
    tokens = input.split(' ')
    last_start = len(tokens) - n
    return [tokens[start:start + n] for start in range(last_start + 1)]


def chargrams(input, n):
    """Return every length-``n`` slice of ``input``, taken at each start offset.

    The result is empty when ``input`` is shorter than ``n``.
    """
    window_count = len(input) - n + 1
    return [input[start:start + n] for start in range(window_count)]


def append_chargrams(features, text_headline, text_body, size):
    """Append character n-gram hit counts for a headline/body pair.

    Stop words are removed from the headline, the remaining words are
    re-joined with single spaces, and every character n-gram of length
    ``size`` of that string is looked up in the body.

    Args:
        features: list to extend; it is mutated and also returned.
        text_headline: headline text (callers in this file pass cleaned text).
        text_body: body text.
        size: character n-gram length.

    Returns:
        ``features`` with three counts appended, in this order: n-grams
        found anywhere in the body, in its first 255 characters, and in its
        first 100 characters.

    Fix: the n-grams were previously passed through ``' '.join(...)``, which
    on a string inserts a space between every character ("ab" -> "a b"), so
    the patterns searched for were not the headline's character n-grams.
    """
    headline_no_stops = " ".join(remove_stopwords(text_headline.split()))
    grams = chargrams(headline_no_stops, size)

    # Slice the two body prefixes once instead of once per n-gram.
    body_early = text_body[:255]
    body_first = text_body[:100]

    grams_hits = 0
    grams_early_hits = 0
    grams_first_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in body_early:
            grams_early_hits += 1
        if gram in body_first:
            grams_first_hits += 1

    features.append(grams_hits)
    features.append(grams_early_hits)
    features.append(grams_first_hits)
    return features


def append_ngrams(features, text_headline, text_body, size):
    """Append word n-gram hit counts for a headline/body pair.

    Every run of ``size`` consecutive headline tokens is joined with single
    spaces and searched for as a substring of the body.

    Returns:
        ``features`` (mutated in place) with two counts appended: n-grams
        found anywhere in the body, then those found in its first 255
        characters.
    """
    body_intro = text_body[:255]
    phrases = [' '.join(words) for words in ngrams(text_headline, size)]
    features.append(sum(1 for phrase in phrases if phrase in text_body))
    features.append(sum(1 for phrase in phrases if phrase in body_intro))
    return features


def hand_features(headlines, bodies):
    """Build the hand-crafted co-occurrence and n-gram features.

    For every headline/body pair the feature vector is the concatenation of:
      * two counts from ``binary_co_occurence``,
      * two counts from ``binary_co_occurence_stops``,
      * the counts appended by ``append_chargrams`` for sizes 2, 8, 4, 16,
      * the counts appended by ``append_ngrams`` for sizes 2 through 6.

    Returns:
        A list with one feature list per pair.
    """
    # Number of leading body characters treated as the "early" part.
    intro_chars = 255

    def binary_co_occurence(headline, body):
        # Count the headline tokens that occur as substrings of the cleaned
        # body, and of its early part.
        clean_body = clean(body)  # hoisted: previously recomputed per token
        body_intro = clean_body[:intro_chars]
        bin_count = 0
        bin_count_early = 0
        for headline_token in clean(headline).split(" "):
            if headline_token in clean_body:
                bin_count += 1
            if headline_token in body_intro:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def binary_co_occurence_stops(headline, body):
        # Same as binary_co_occurence, but stop words in the headline are
        # ignored.
        clean_body = clean(body)
        body_intro = clean_body[:intro_chars]
        bin_count = 0
        bin_count_early = 0
        for headline_token in remove_stopwords(clean(headline).split(" ")):
            if headline_token in clean_body:
                bin_count += 1
            # Fix: the early counter used to be incremented together with
            # bin_count, making both features identical.
            if headline_token in body_intro:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def count_grams(headline, body):
        # Count how many n-grams of the headline appear in the body and in
        # its leading characters.
        clean_body = clean(body)
        clean_headline = clean(headline)
        features = []
        # The size order fixes the feature-column order; keep it as is.
        for size in (2, 8, 4, 16):
            features = append_chargrams(features, clean_headline, clean_body, size)
        for size in (2, 3, 4, 5, 6):
            features = append_ngrams(features, clean_headline, clean_body, size)
        return features

    X = []
    for headline, body in tqdm(zip(headlines, bodies)):
        X.append(binary_co_occurence(headline, body)
                 + binary_co_occurence_stops(headline, body)
                 + count_grams(headline, body))

    return X


In [19]:
# Collapse the stance labels in place to two classes: 'unrelated' and
# 'disagree' become 'unrelated'; every other label becomes 'related'.
# Re-running the cell leaves already-collapsed labels unchanged.
# NOTE(review): folding 'disagree' into 'unrelated' is unusual -- confirm that
# this is the intended target definition.
for stance in d.stances:
    if stance['Stance'] in ('unrelated', 'disagree'):
        stance['Stance'] = 'unrelated'
    else:
        stance['Stance'] = 'related'

In [20]:
# Group the stance rows by the body they refer to: body ID -> list of rows.
stanceDict = {}
for stance in d.stances:
    stanceDict.setdefault(stance['Body ID'], []).append(stance)

In [28]:
# Parallel lists, one entry per stance row: the headline, the text of the body
# it points to, and a boolean target that is True for 'related' rows.
headlines = [stance['Headline'] for stance in d.stances]
bodies = [d.articles[stance['Body ID']] for stance in d.stances]
y_out = [stance['Stance'] == 'related' for stance in d.stances]

In [25]:
# Show the first headline, its body text and its boolean label.
headlines[0], bodies[0],y_out[0]

("Police find mass graves with at least '15 bodies' near Mexico town where 43 students disappeared after police clash",
 'Danny Boyle is directing the untitled film\n\nSeth Rogen is being eyed to play Apple co-founder Steve Wozniak in Sony’s Steve Jobs biopic.\n\nDanny Boyle is directing the untitled film, based on Walter Isaacson\'s book and adapted by Aaron Sorkin, which is one of the most anticipated biopics in recent years.\n\nNegotiations have not yet begun, and it’s not even clear if Rogen has an official offer, but the producers — Scott Rudin, Guymon Casady and Mark Gordon — have set their sights on the talent and are in talks.\n\nOf course, this may all be for naught as Christian Bale, the actor who is to play Jobs, is still in the midst of closing his deal. Sources say that dealmaking process is in a sensitive stage.\n\nInsiders say Boyle will is flying to Los Angeles to meet with actress to play one of the female leads, an assistant to Jobs. Insiders say that Jessica Chastain

In [27]:
# Polarity features for every headline/body pair (one row of two values each).
X = polarity_features(headlines,bodies)

49972it [04:05, 203.85it/s]


In [33]:
# Feature row of the first pair: [headline polarity, body polarity].
X[0]

array([0, 0])

In [36]:
# Split features and labels at their midpoints: the first half is used for
# training and the second half for validation (no shuffling).
half_X = len(X) // 2
half_y = len(y_out) // 2
train_X, val_X = X[:half_X], X[half_X:]
y_out_train, y_out_val = y_out[:half_y], y_out[half_y:]

In [40]:
# Sanity check: sizes of the train/validation feature and label splits.
len(train_X), len(val_X), len(y_out_train), len(y_out_val)

(24986, 24986, 24986, 24986)

In [41]:
# Fit a support-vector classifier, constructed with no arguments, on the
# training half. fit() is the last expression, so the cell displays its result.
from sklearn import svm
clf = svm.SVC()
clf.fit(train_X, y_out_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [42]:
# Predicted labels for the validation half.
val_pred = clf.predict(val_X)

In [43]:
# Per-sample flag: does the validation prediction equal the true label?
correct = [predicted == actual for predicted, actual in zip(val_pred, y_out_val)]

In [44]:
# Validation accuracy: fraction of predictions that match the labels.
sum(correct)*1.0/len(correct)

0.74705835267749943

In [60]:
# Number of validation samples predicted True ('related'); a result of 0 means
# the classifier never predicts the positive class.
sum(val_pred)

0

In [65]:
# Training-set size and the number of True ('related') labels in it.
len(y_out_train),sum(y_out_train)

(24986, 6267)

In [None]:
# Testing: evaluate the trained classifier on the competition test set

In [46]:
# Load the test split (competition_test_stances.csv / competition_test_bodies.csv)
# from the default dataset path.
testD = DataSet(name="competition_test")

Reading dataset
Total stances: 25413
Total bodies: 904


In [47]:
# Collapse the test stance labels in place to two classes: 'unrelated' and
# 'disagree' become 'unrelated'; every other label becomes 'related'.
# Re-running the cell leaves already-collapsed labels unchanged.
# NOTE(review): folding 'disagree' into 'unrelated' is unusual -- confirm that
# this is the intended target definition.
for stance in testD.stances:
    if stance['Stance'] in ('unrelated', 'disagree'):
        stance['Stance'] = 'unrelated'
    else:
        stance['Stance'] = 'related'

In [49]:
# Parallel lists for the test split, one entry per stance row: the headline,
# the text of the body it points to, and a boolean target (True for 'related').
headlines_test = [stance['Headline'] for stance in testD.stances]
bodies_test = [testD.articles[stance['Body ID']] for stance in testD.stances]
y_out_test = [stance['Stance'] == 'related' for stance in testD.stances]

In [51]:
# Sanity check: the three test lists must have the same length.
len(headlines_test),len(bodies_test), len(y_out_test)

(25413, 25413, 25413)

In [52]:
# Polarity features for every test headline/body pair.
X_test = polarity_features(headlines_test,bodies_test)

25413it [01:51, 82.47it/s] 


In [53]:
# Predicted labels for the test split, from the classifier fitted on the training half.
test_pred = clf.predict(X_test)

In [54]:
# Test accuracy: fraction of test predictions that equal the true label.
correct = [predicted == actual for predicted, actual in zip(test_pred, y_out_test)]
sum(correct) / float(len(correct))

0.74945893833864563

In [58]:
# Fraction of True ('related') test labels plus the hard-coded constant 0.749.
# NOTE(review): 0.749 looks like the rounded test accuracy from the cell above;
# a sum near 1 would mean the accuracy equals the share of False labels -- confirm.
sum(y_out_test)*1.0/len(y_out_test) + 0.749

0.9995410616613545

In [59]:
# Number of test samples predicted True ('related'); a result of 0 means the
# classifier never predicts the positive class.
sum(test_pred)

0