In [1]:
import csv

In [2]:
# create a class to read the dataset
class Dataset():
  def __init__(self, train_stance, test_stance, train_body, test_body):
    self.train_stance = train_stance
    self.test_stance = test_stance
    self.train_body = train_body
    self.test_body = test_body

    print("Dataset length:")

    self.train_stances = self.read_stance(self.train_stance)
    self.test_stances = self.read_stance(self.test_stance)
    self.train_bodies = self.read_body(self.train_body)
    self.test_bodies = self.read_body(self.test_body)

    print("Total train stances: " + str(len(self.train_stances)))
    print("Total test stances: " + str(len(self.test_stances)))
    print("Total train bodies: " + str(len(self.train_bodies)))
    print("Total test bodies: " + str(len(self.test_bodies)))

  def read_stance(self, path):
    rows = []
    with open(path, encoding='utf-8', errors='ignore') as csvfile:
      r = csv.DictReader(csvfile)
      for row in r:
        rows.append([row['Body ID'], row['Headline'], row['Stance']])
    return rows

  def read_body(self, path):
    rows = []
    #with open(path, encoding='utf-8') as csvfile:
    with open(path, encoding="utf8", errors='ignore') as csvfile:
      r = csv.DictReader(csvfile)
      for row in r:
        rows.append([row['Body ID'], row['articleBody']])
        #rows[row['Body ID']] = row['articleBody']
    return rows

In [3]:
# load the data
data = Dataset('./fnc1/train_stances.csv', './fnc1/competition_test_stances.csv', './fnc1/train_bodies.csv', './fnc1/competition_test_bodies.csv')

Dataset length:
Total train stances: 49972
Total test stances: 25413
Total train bodies: 1683
Total test bodies: 904


In [4]:
data.train_stances[0]

data.test_stances[0]

['2008',
 'Ferguson riots: Pregnant woman loses eye after cops fire BEAN BAG round through car window',
 'unrelated']

In [5]:
data.train_bodies[0]
data.test_bodies[0]

['1',
 'Al-Sisi has denied Israeli reports stating that he offered to extend the Gaza Strip.']

In [6]:
# pre-process function (lowercase, stopwords, lemmatization)

from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

porter = PorterStemmer()
stopwords = stopwords.words("english")

def preprocess(processed_dataset):
  for w in processed_dataset:
    words = []
    w[1] = word_tokenize(w[1])
    for token in w[1]:
      lower = token.lower()
      if lower not in stopwords and lower.isalpha():
        word = porter.stem(lower)
        words.append(word)
    w[1] = words
  return processed_dataset

In [7]:
# call pre-process functions for train and test
train_headline = preprocess(data.train_stances)
test_headline = preprocess(data.test_stances)
train_content = preprocess(data.train_bodies)
test_content = preprocess(data.test_bodies)


In [8]:
print(train_headline[0])
print(test_headline[0])
print(train_content[0])
print(test_content[0])

['0', ['soldier', 'shot', 'parliament', 'lock', 'gunfir', 'erupt', 'war', 'memori'], 'unrelated']
['2008', ['ferguson', 'riot', 'pregnant', 'woman', 'lose', 'eye', 'cop', 'fire', 'bean', 'bag', 'round', 'car', 'window'], 'unrelated']
['0', ['small', 'meteorit', 'crash', 'wood', 'area', 'nicaragua', 'capit', 'managua', 'overnight', 'govern', 'said', 'sunday', 'resid', 'report', 'hear', 'mysteri', 'boom', 'left', 'deep', 'crater', 'near', 'citi', 'airport', 'associ', 'press', 'report', 'govern', 'spokeswoman', 'rosario', 'murillo', 'said', 'committe', 'form', 'govern', 'studi', 'event', 'determin', 'rel', 'small', 'meteorit', 'appear', 'come', 'asteroid', 'pass', 'close', 'earth', 'asteroid', 'rc', 'measur', 'feet', 'diamet', 'skim', 'earth', 'weekend', 'abc', 'news', 'report', 'murillo', 'said', 'nicaragua', 'ask', 'intern', 'expert', 'help', 'local', 'scientist', 'understand', 'happen', 'crater', 'left', 'meteorit', 'radiu', 'feet', 'depth', 'feet', 'said', 'humberto', 'saballo', 'volc

In [9]:
# put all sentences into one collection for word2vec training.
# It takes word tokenized sentences, which looks like ["hello", "world", ... ]

sent_collection = []

def sent_list(s_list, t_list):
  for sent in s_list:
    t_list.append(sent[1])
  return

sent_list(train_headline, sent_collection)
sent_list(test_headline, sent_collection)
sent_list(train_content, sent_collection)
sent_list(test_content, sent_collection)

In [10]:
print(sent_collection[0])

['soldier', 'shot', 'parliament', 'lock', 'gunfir', 'erupt', 'war', 'memori']


In [11]:
# to see how many unique words in the collection
bag = []
for s in sent_collection:
  for w in s:
    bag.append(w)

p = set(bag)
print(len(p))



18084


In [12]:
# train the word2vec with customized words because the stemming makes some words not recognizable, such as "polic" and "strang"
# details  https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

#import multiprocessing
from gensim.models import Word2Vec

#cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=1,
                    window=2,
                    size=100,
                    sample=6e-5, 
                    alpha=0.03, 
                    min_alpha=0.0007, 
                    negative=20)


In [13]:
w2v_model.build_vocab(sent_collection)

In [15]:
w2v_model.train(sent_collection, total_examples=w2v_model.corpus_count, epochs=30)

(17339019, 33276930)

In [16]:
# this makes the memory more efficient since we do not plan tot train any further
w2v_model.init_sims(replace=True)

In [18]:
len(w2v_model.wv.vocab.keys())

18084

In [27]:
print(model.wv.vector_size)

100


In [20]:
w2v_model.wv.get_vector('polic')

array([-0.03170666,  0.05994361, -0.01594953, -0.01158441, -0.2553048 ,
        0.15896626, -0.05707419,  0.06622464,  0.1167105 ,  0.02071714,
        0.01473159,  0.08760121,  0.02629115,  0.06877466,  0.14730984,
        0.00831118, -0.00957257,  0.0364738 ,  0.01016295, -0.00858984,
        0.01968961, -0.12182599, -0.00251548, -0.10242035,  0.01302129,
       -0.03282655, -0.11320949,  0.11239958,  0.10937969, -0.10672441,
       -0.00179643, -0.04826476,  0.01923433,  0.18598905,  0.15126215,
       -0.17327411, -0.02258573, -0.0134278 ,  0.0591633 ,  0.15780906,
        0.09686403, -0.03460663, -0.20300788, -0.06884312,  0.10651762,
       -0.02109622,  0.09848445,  0.09285858, -0.01213906, -0.0651632 ,
        0.10628242, -0.02083327, -0.03462162, -0.08759736,  0.21147734,
        0.04124995,  0.00487584,  0.03875668,  0.0316501 ,  0.00573582,
        0.13343246,  0.10616208,  0.11232797, -0.03080473, -0.1765491 ,
        0.01725558, -0.21733336,  0.0200403 ,  0.1301482 , -0.19

In [22]:
w2v_model.wv.similarity('polic', 'strang')

0.053465083

In [23]:
# combine headline and body
import nltk
def comb_list (stance, body, target):
  for i in body:
    for j in stance:
      if j[0] == i[0]:
        i[1] = nltk.FreqDist(i[1])
        target.append([j[0], j[1], i[1], j[2]])
  return


In [24]:
# train set
train_set = []
test_set = []

comb_list(train_headline, train_content, train_set)
comb_list(test_headline, test_content, test_set)

In [25]:
print(len(train_set))

49972


In [26]:
print(len(test_set))

25413
