In [13]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, regexp_tokenize, word_tokenize
import sys
import os
import string
import re

import torch
import torch.nn as nn

from utils import dataset


### Using the included data importer

In [2]:
train_data = dataset.DataSet(name='train')

Reading dataset
Total stances: 49972
Total bodies: 1683


In [3]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Converting imported data into pandas dataframes

In [52]:
train_data_articles_df = pd.DataFrame({'Article': train_data.articles.values(), 'Body ID':train_data.articles.keys()} , index=train_data.articles.keys())
train_data_stances_df = pd.DataFrame(train_data.stances)

# checking out example output
print(train_data_articles_df['Article'][158] + '\n')
print(train_data_stances_df['Headline'][158] + '\n')
print(train_data_stances_df['Stance'][158])

Hundreds of Palestinians were evacuated from their homes Sunday morning after Israeli authorities opened a number of dams near the border, flooding the Gaza Valley in the wake of a recent severe winter storm.

The Gaza Ministry of Interior said in a statement that civil defense services and teams from the Ministry of Public Works had evacuated more than 80 families from both sides of the Gaza Valley (Wadi Gaza) after their homes flooded as water levels reached more than three meters.

Gaza has experienced flooding in recent days amid a major storm that saw temperatures drop and frigid rain pour down.

The storm displaced dozens and caused hardship for tens of thousands, including many of the approximately 110,000 Palestinians left homeless by Israel's assault over summer.

The suffering is compounded by the fact that Israel has maintained a complete siege over Gaza for the last eight years, severely limiting electricity and the availability of fuel for generators. It has also prevented

### Preprocessing: Tokenize and remove stopwords. 

Can do more, obvious ones: remove news agency header, remove website URL, remove twitter usernames, remove dates, stemming, lemmatizating

In [97]:
pattern = r'\b[^\d\W_/-]+\b'
tokenizer = RegexpTokenizer(r'\b[^\d\W]+\b')
train_data_articles_df['article_cleaned'] = train_data_articles_df['Article'].apply(lambda x: tokenizer.tokenize(x.lower()))
train_data_stances_df['headline_cleaned'] = train_data_stances_df['Headline'].apply(lambda x: tokenizer.tokenize(x.lower()))

# remove_whitespace = r'\s+'
# train_data_articles_df['article_cleaned'] = train_data_articles_df['Article'].apply(lambda x: re.split(remove_whitespace, x))
# train_data_stances_df['headline_cleaned'] = train_data_stances_df['Headline'].apply(lambda x: re.split(remove_whitespace, x))

# exclude = r'[^/d/W]+'
# train_data_articles_df['article_cleaned'] = train_data_articles_df['article_cleaned'].apply(lambda x: re.findall(exclude, x))
# train_data_stances_df['headline_cleaned'] = train_data_stances_df['headline_cleaned'].apply(lambda x: re.findall(exclude, x))




In [98]:
print(train_data_articles_df['Article'][154])
print(train_data_articles_df['article_cleaned'][154])
print(train_data_stances_df['headline_cleaned'][154])
print(train_data_stances_df['Stance'][154])

Thousands of people have been duped by a fake news story claiming that Nasa has forecast a total blackout of earth for six days in December.

The story, entitled "Nasa Confirms Earth Will Experience 6 Days of Total Darkness in December 2014!" originated from Huzlers.com, a website well known for publishing fake stories with sensational headlines.

The bogus report read: "Nasa has confirmed that the Earth will experience 6 days of almost complete darkness and will happen from the dates Tuesday the 16 – Monday the 22 in December. The world will remain, during these three days, without sunlight due to a solar storm, which will cause dust and space debris to become plentiful and thus, block 90% sunlight.

"The head of Nasa Charles Bolden who made the announcement and asked everyone to remain calm. This will be the product of a solar storm, the largest in the last 250 years for a period of 216 hours total.

"Despite the six days of darkness soon to come, officials say that the earth will no

In [101]:
my_banned_words = stopwords.words('english') + ['twitter', 'com', 'pic', 'co']


In [102]:
def remove_stopwords(string):
    output = []
    for word in string:
        if word not in stopwords.words('english'):
            output.append(word)
    return output

train_data_articles_df['article_cleaned'] =  train_data_articles_df['article_cleaned'].apply(lambda x: remove_stopwords(x))
train_data_stances_df['headline_cleaned'] =  train_data_stances_df['headline_cleaned'].apply(lambda x: remove_stopwords(x))


In [103]:
print(train_data_articles_df['article_cleaned'][15])


['fucking', 'cheat', 'cassy', 'aka', 'really', 'likes', 'trey', 'songz', 'finds', 'everything', 'dont', 'fucking', 'cheat', 'find', 'everything', 'pic', 'twitter', 'com', 'cassy', 'december', 'cassy', 'claims', 'via', 'series', 'photos', 'discovered', 'incriminating', 'twitter', 'dms', 'boyfriend', 'another', 'woman', 'proving', 'unfaithful', 'instead', 'confronting', 'immediately', 'planned', 'little', 'christmas', 'surprise', 'printed', 'evidence', 'wrapped', 'gave', 'present', 'thought', 'getting', 'gifts', 'nah', 'appears', 'still', 'video', 'moment', 'trap', 'sprung', 'although', 'shamed', 'cheating', 'dude', 'snapchat', 'twitter', 'posted', 'video', 'evidence', 'could', 'faked', 'pretty', 'easily', 'hand', 'cassy', 'tweets', 'prove', 'real', 'person', 'guy', 'photo', 'seems', 'fessed', 'could', 'mention', 'would', 'gladly', 'retweeted', 'al', 'ighty', 'bugatti_boii__', 'december', 'even', 'real', 'satisfying', 'tale', 'revenge', 'people', 'retweeted', 'well', 'ends', 'well', 'gue

In [104]:
if not os.path.exists('processed'):
    os.mkdir('processed')

train_data_articles_df.to_csv("processed/processed_train_articles.csv")
train_data_stances_df.to_csv("processed/processed_train_stances.csv")