
# This notebook contains the code to obtain the Word Vectors for the FNC Dataset

# Mounting the Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pwd
%cd ./drive/MyDrive/SMAI\ project\ dataset
!pwd

/content
/content/drive/MyDrive/SMAI project dataset
/content/drive/MyDrive/SMAI project dataset


In [4]:
!pwd
!cd ./fnc-1-master/

/content/drive/MyDrive/SMAI project dataset


In [5]:
!ls
%cd ./fnc-1-master/

 checkpoint.pth		     main.ipynb		        NetModel.pth
 clickbait-17		     metrics.py		        PPr_Vector.ipynb
 clickbait17-train-170331    model.py		        pre-processing.ipynb
 clickbait_detection.ipynb  'Model Visualization'       __pycache__
 fnc-1-master		    'Model Visualization.png'   traditional_models.py
 input_functions.py	     NetModel2.pth
/content/drive/MyDrive/SMAI project dataset/fnc-1-master


# Loading the Dataset

In [6]:
import pandas as pd
import numpy as np

data = pd.read_csv("./train_bodies.csv")
labels = pd.read_csv("./train_stances.csv")

data = data.sort_values(by=['Body ID'])
labels = labels.sort_values(by=['Body ID'])

data = data.reset_index()
labels = labels.reset_index()

display(data.iloc[0:5])
display(labels.iloc[0:5])

Unnamed: 0,index,Body ID,articleBody
0,0,0,A small meteorite crashed into a wooded area i...
1,1,4,Last week we hinted at what was to come as Ebo...
2,2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,3,6,"Posting photos of a gun-toting child online, I..."
4,4,7,At least 25 suspected Boko Haram insurgents we...


Unnamed: 0,index,Headline,Body ID,Stance
0,27879,Soldier shot near Canadian parliament building,0,unrelated
1,21704,Caught a catfish record in Po: 127 kg and 2.67...,0,unrelated
2,7110,Enormous 20-stone catfish caught with fishing ...,0,unrelated
3,12573,Soldier shot at war memorial in Canada,0,unrelated
4,16307,A soldier has been shot at Canada’s war memori...,0,unrelated


In [7]:
body_ids = labels.iloc[:, 2]

In [8]:
print(body_ids.shape)
duplicated_bodies = []
for i in body_ids:
  temp = np.where(data.iloc[:, 1] == i)[0][0]
  duplicated_bodies.append(data.iloc[temp, 2])

print(len(duplicated_bodies))

(49972,)
49972


In [9]:
labels['Body Text'] = duplicated_bodies
display(labels.iloc[0:10])

Unnamed: 0,index,Headline,Body ID,Stance,Body Text
0,27879,Soldier shot near Canadian parliament building,0,unrelated,A small meteorite crashed into a wooded area i...
1,21704,Caught a catfish record in Po: 127 kg and 2.67...,0,unrelated,A small meteorite crashed into a wooded area i...
2,7110,Enormous 20-stone catfish caught with fishing ...,0,unrelated,A small meteorite crashed into a wooded area i...
3,12573,Soldier shot at war memorial in Canada,0,unrelated,A small meteorite crashed into a wooded area i...
4,16307,A soldier has been shot at Canada’s war memori...,0,unrelated,A small meteorite crashed into a wooded area i...
5,37891,Canadian Soldier Shot At Ottawa War Memorial: ...,0,unrelated,A small meteorite crashed into a wooded area i...
6,37896,Iraqi social-media rumors claim IS leader slain,0,unrelated,A small meteorite crashed into a wooded area i...
7,35767,Breaking: Soldier shot at National War Memoria...,0,unrelated,A small meteorite crashed into a wooded area i...
8,44961,Kurds fear Isis use of chemical weapon in Kobani,0,unrelated,A small meteorite crashed into a wooded area i...
9,4740,Giant 8ft 9in catfish weighing 19 stone caught...,0,unrelated,A small meteorite crashed into a wooded area i...


In [10]:
labels.iloc[:, 3] = np.where(labels.iloc[:, 3] == 'unrelated', 1, 0)
display(labels.iloc[0:10])

Unnamed: 0,index,Headline,Body ID,Stance,Body Text
0,27879,Soldier shot near Canadian parliament building,0,1,A small meteorite crashed into a wooded area i...
1,21704,Caught a catfish record in Po: 127 kg and 2.67...,0,1,A small meteorite crashed into a wooded area i...
2,7110,Enormous 20-stone catfish caught with fishing ...,0,1,A small meteorite crashed into a wooded area i...
3,12573,Soldier shot at war memorial in Canada,0,1,A small meteorite crashed into a wooded area i...
4,16307,A soldier has been shot at Canada’s war memori...,0,1,A small meteorite crashed into a wooded area i...
5,37891,Canadian Soldier Shot At Ottawa War Memorial: ...,0,1,A small meteorite crashed into a wooded area i...
6,37896,Iraqi social-media rumors claim IS leader slain,0,1,A small meteorite crashed into a wooded area i...
7,35767,Breaking: Soldier shot at National War Memoria...,0,1,A small meteorite crashed into a wooded area i...
8,44961,Kurds fear Isis use of chemical weapon in Kobani,0,1,A small meteorite crashed into a wooded area i...
9,4740,Giant 8ft 9in catfish weighing 19 stone caught...,0,1,A small meteorite crashed into a wooded area i...


In [11]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# Performing Preprocessing

In [12]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
stopword = stopwords.words('english')

In [13]:
test = "This is a test sentence. The best SENTENCES there could ever exist!! A an and the are stopwords?"

lower_text = test.lower()

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
word_tokens = tokenizer.tokenize(lower_text)
text_removed_stopwords = [word for word in word_tokens if word not in stopword]
lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in text_removed_stopwords]

print(lower_text)
print(text_removed_stopwords)
print(lemmatized_words)

this is a test sentence. the best sentences there could ever exist!! a an and the are stopwords?
['test', 'sentence', 'best', 'sentences', 'could', 'ever', 'exist', 'stopwords']
['test', 'sentence', 'best', 'sentence', 'could', 'ever', 'exist', 'stopwords']


In [14]:
regexp = nltk.tokenize.RegexpTokenizer('\w+')
word_lem= nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words("english")

In [15]:
columns=np.array(['Headline','Body Text'])
train_hd_bd=labels[columns]
df2 = train_hd_bd.apply(lambda x: x.astype(str).str.lower())
keys=df2.keys()
# df2["index"]=pd.to_numeric(df2["index"])

for i in range(0,2):
    df2[f"{keys[i]}_proc"]=df2[keys[i]].apply(word_lem.lemmatize)
    df2[f"{keys[i]}_proc"]=df2[f"{keys[i]}_proc"].apply(regexp.tokenize)
    df2[f"{keys[i]}_proc"]=df2[f"{keys[i]}_proc"].apply(lambda x: [word for word in x if word not in stopwords])
# df_w2vTrain=df2.apply(lambda )
display(df2.iloc[0:5])

Unnamed: 0,Headline,Body Text,Headline_proc,Body Text_proc
0,soldier shot near canadian parliament building,a small meteorite crashed into a wooded area i...,"[soldier, shot, near, canadian, parliament, bu...","[small, meteorite, crashed, wooded, area, nica..."
1,caught a catfish record in po: 127 kg and 2.67...,a small meteorite crashed into a wooded area i...,"[caught, catfish, record, po, 127, kg, 2, 67, ...","[small, meteorite, crashed, wooded, area, nica..."
2,enormous 20-stone catfish caught with fishing ...,a small meteorite crashed into a wooded area i...,"[enormous, 20, stone, catfish, caught, fishing...","[small, meteorite, crashed, wooded, area, nica..."
3,soldier shot at war memorial in canada,a small meteorite crashed into a wooded area i...,"[soldier, shot, war, memorial, canada]","[small, meteorite, crashed, wooded, area, nica..."
4,a soldier has been shot at canada’s war memori...,a small meteorite crashed into a wooded area i...,"[soldier, shot, canada, war, memorial, steps, ...","[small, meteorite, crashed, wooded, area, nica..."


In [16]:
def extractData(df2, labels):
  label_values = labels.iloc[:, 3]
  heading = df2.iloc[: ,2]
  body = df2.iloc[:, 3]
  return heading, body, label_values

In [17]:
heading, body, label_values = extractData(df2, labels)
print(heading.shape)
print(body.shape)
print(label_values.shape)

(49972,)
(49972,)
(49972,)


# Vectorizing the Keys

In [18]:
keys1=df2.keys()

df2_w2vTrain=df2[keys1[len(keys)]]

print(range(len(keys)+1,len(keys1)))
for i in range(len(keys)+1,len(keys1)):
    df2_w2vTrain=df2_w2vTrain+df2[keys1[i]]

print(df2_w2vTrain.iloc[0:5])

range(3, 4)
0    [soldier, shot, near, canadian, parliament, bu...
1    [caught, catfish, record, po, 127, kg, 2, 67, ...
2    [enormous, 20, stone, catfish, caught, fishing...
3    [soldier, shot, war, memorial, canada, small, ...
4    [soldier, shot, canada, war, memorial, steps, ...
dtype: object


In [19]:
import gensim
from gensim.models import Word2Vec

In [20]:
model = Word2Vec(window=10, min_count=1, workers=4)
model.build_vocab(df2_w2vTrain, progress_per=1000)

In [21]:
model.train(df2_w2vTrain, total_examples=model.corpus_count, epochs=model.epochs)

(55454859, 56245705)

In [22]:
word_vec = {}
for idx, key in enumerate(model.wv.vocab):
  word_vec[key] = model.wv[key]


In [23]:
def getMaxSentLength(a):
  max_sentence_length = 0
  for i in range(len(a)):
    if (max_sentence_length < len(a[i])):
      max_sentence_length = len(a[i])

  return max_sentence_length

In [24]:
heading_sentLength = getMaxSentLength(heading)
body_sentLength = getMaxSentLength(body)
NormLength = max(heading_sentLength, body_sentLength)
print(heading_sentLength, body_sentLength)
print(NormLength)

29 2927
2927


In [25]:
import torch

In [26]:
def getVecForm(sentence, word2vecLength, word_vec, max_sentLength):
  sentence_vec = torch.zeros((max_sentLength, word2vecLength))
  for i in range(len(sentence)):
    sentence_vec[i] = torch.FloatTensor(word_vec[sentence[i]])

  return sentence_vec

In [27]:
head_body_df = pd.DataFrame(list(zip(heading, body)), columns =['Heading', 'Body'])
display(head_body_df.iloc[0:5])

Unnamed: 0,Heading,Body
0,"[soldier, shot, near, canadian, parliament, bu...","[small, meteorite, crashed, wooded, area, nica..."
1,"[caught, catfish, record, po, 127, kg, 2, 67, ...","[small, meteorite, crashed, wooded, area, nica..."
2,"[enormous, 20, stone, catfish, caught, fishing...","[small, meteorite, crashed, wooded, area, nica..."
3,"[soldier, shot, war, memorial, canada]","[small, meteorite, crashed, wooded, area, nica..."
4,"[soldier, shot, canada, war, memorial, steps, ...","[small, meteorite, crashed, wooded, area, nica..."


In [28]:
def getVecDataFrame(heading, body, label_values, NormLength, word2vecLength, word_vec):
  head_vec = torch.zeros((len(heading), NormLength, word2vecLength))
  for i in range(len(heading)):
    head_vec[i] = getVecForm(heading[i], word2vecLength, word_vec, NormLength)

  body_vec = torch.zeros((len(body), NormLength, word2vecLength))
  for i in range(len(body)):
    body_vec[i] = getVecForm(body[i], word2vecLength, word_vec, NormLength)  

  headBody_vecDF = pd.DataFrame(list(zip(head_vec, body_vec, label_values)), columns =['Heading', 'Body', 'Labels'])
  return headBody_vecDF, head_vec, body_vec

In [None]:
input_df, head_vec, body_vec = getVecDataFrame(heading, body, label_values, NormLength, 100, word_vec)
print(head_vec.shape, body_vec.shape)