In [1]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords

In [2]:
!ls

Fake_News_Stanford_Baseline.ipynb
Independent LSTM Encoding.ipynb
data_combined.csv
fnc-1-baseline-master
fnc-1-baseline-master.zip
fnc-1-master
fnc-1-master.zip
model_plot.png
updated_combined.csv


In [3]:
DATASET_PATH = os.path.join(os.getcwd(),"fnc-1-master","")

In [4]:
print(os.listdir(DATASET_PATH))

['competition_test_bodies.csv', 'competition_test_stances.csv', 'competition_test_stances_unlabeled.csv', 'README.md', 'scorer.py', 'test_bodies.csv', 'test_stances_unlabeled.csv', 'train_bodies.csv', 'train_stances.csv', 'train_stances.random.csv']


# Problem Statement

1. We are given a dataset consisting of two csv files train_bodies.csv which contains the set of news articles bodies,while train-stances.csv resembles the articles for each of these bodies being identified using the body id.

2. After training from these samples we need to detect whether the given headline agrees,disagrees,discusses,unrelated with the body id


## Train Bodies

In [5]:
train_bodies = pd.read_csv(DATASET_PATH+'train_bodies.csv')
train_bodies.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [6]:
print('The number of rows ',train_bodies.shape[0])
print('The number of columns',train_bodies.shape[1])

The number of rows  1683
The number of columns 2


1. There are 1683 rows with two columns BodyId and article Body

In [7]:
train_stance = pd.read_csv(DATASET_PATH+'train_stances.csv')
train_stance.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [8]:
train_stance.shape

(49972, 3)

## Dataset understanding

1. The train_bodies contain the entries for the body id and associated article Body
2. The train_stances contain the entries for the headlines associated with the particular body id and its labelled stance
3. One body present in train_bodies can have multiple associated headlines present in train_stances and it's corresponding stance label
4. 1683 :- Number of article Body present
5. 49972 number of total headlines present for the 1683 different article body

In [9]:
train_bodies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1683 entries, 0 to 1682
Data columns (total 2 columns):
Body ID        1683 non-null int64
articleBody    1683 non-null object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


In [10]:
train_stance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49972 entries, 0 to 49971
Data columns (total 3 columns):
Headline    49972 non-null object
Body ID     49972 non-null int64
Stance      49972 non-null object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [11]:
train_stance['Body ID'].value_counts()

1921    187
1948    175
40      172
524     171
1549    166
       ... 
376       1
140       1
307       1
1066      1
59        1
Name: Body ID, Length: 1683, dtype: int64

**Observation**:
1. No null entries found in the dataset
2. Number of headlines present per article observed above 

In [12]:
##Run commented code to combine the two csv file{train_bodies.csv,train_stances.csv} into data_combined.csv file

# count=0
# for i in range(train_stance.shape[0]):
#     for j in range(train_bodies.shape[0]):
#         if train_bodies.loc[j,'Body ID']==train_stance.loc[i,'Body ID']:
#             train_stance.loc[i,'articleBody'] = train_bodies.loc[j,'articleBody']
#     if i%100==0:
#         count+=1
#         print(count,end=' ')

# train_stance.to_csv('data_combined.csv',index=False)

In [13]:
data = pd.read_csv('data_combined.csv')
data.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


In [14]:
data['stance_cat'] = data['Stance'].map({'agree':0,'disagree':1,'discuss':2,'unrelated':3}).astype(int)
data['Stance'].value_counts()

unrelated    36545
discuss       8909
agree         3678
disagree       840
Name: Stance, dtype: int64

## Data Preprocessing

In [15]:
stopwords_english = set(stopwords.words('english'))
data['Headline'] = data.Headline.apply(lambda x:str(x))
data.loc[:,'Headline'] = data['Headline'].apply(lambda x : str.lower(x))
data.loc[:,'Headline'] = data['Headline'].apply(lambda x:' '.join(re.findall('[\w]+',x)))
data.loc[:,'articleBody'] = data['articleBody'].apply(lambda x : str.lower(x))
data.loc[:,'articleBody'] = data['articleBody'].apply(lambda x:' '.join(re.findall('[\w]+',x)))


def remove_stopwords(s):
    return ' '.join(word for word in s.split() if word not in stopwords_english)

data['Headline'] = data['Headline'].apply(lambda x:remove_stopwords(x))
data['articleBody'] = data['articleBody'].apply(lambda x:str(x))
data['articleBody'] = data['articleBody'].apply(lambda x:remove_stopwords(x))

## Baseline Model

### Creating the two class dataset of related/unrelated

In [16]:
data['stance_base'] = data.loc[data.loc[:,'Stance']=='unrelated','Stance']
data['stance_base'] = data['stance_base'].fillna("related")
print(data['stance_base'].value_counts())

unrelated    36545
related      13427
Name: stance_base, dtype: int64


In [17]:
data.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,stance_cat,stance_base
0,police find mass graves least 15 bodies near m...,712,unrelated,danny boyle directing untitled film seth rogen...,3,unrelated
1,hundreds palestinians flee floods gaza israel ...,158,agree,hundreds palestinians evacuated homes sunday m...,0,related
2,christian bale passes role steve jobs actor re...,137,unrelated,30 year old moscow resident hospitalized wound...,3,unrelated
3,hbo apple talks 15 month apple tv streaming se...,1034,unrelated,reuters canadian soldier shot canadian war mem...,3,unrelated
4,spider burrowed tourist stomach chest,1923,disagree,fear arachnophobes story bunbury spiderman mig...,1,related


In [18]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))

In [19]:
def add_jaccard_similarity(data):
    count=0
    for i in range(data.shape[0]):
        jaccard_lis=[];eps=0.001
        sentence = data.loc[i,'articleBody'].split('.') #per sentence scorer
        for j in range(len(sentence)):
            jaccard_lis.append(jaccard_similarity(data.loc[i,'Headline'].split(' '),sentence[j].split(' ')))
        max_jaccard_similarity = max(jaccard_lis)
        avg_jaccard_similarity = sum(jaccard_lis)/len(jaccard_lis)
        min_jaccard_similarity = min(jaccard_lis)
        data.loc[i,'jaccard_similarity'] = (max_jaccard_similarity+min_jaccard_similarity)/(max_jaccard_similarity-min_jaccard_similarity+eps)
        if i%1000==0:
            count+=1
            print("Processed {0} Headlines".format(count*1000))
add_jaccard_similarity(data)    

Processed 1000 Headlines
Processed 2000 Headlines
Processed 3000 Headlines
Processed 4000 Headlines
Processed 5000 Headlines
Processed 6000 Headlines
Processed 7000 Headlines
Processed 8000 Headlines
Processed 9000 Headlines
Processed 10000 Headlines
Processed 11000 Headlines
Processed 12000 Headlines
Processed 13000 Headlines
Processed 14000 Headlines
Processed 15000 Headlines
Processed 16000 Headlines
Processed 17000 Headlines
Processed 18000 Headlines
Processed 19000 Headlines
Processed 20000 Headlines
Processed 21000 Headlines
Processed 22000 Headlines
Processed 23000 Headlines
Processed 24000 Headlines
Processed 25000 Headlines
Processed 26000 Headlines
Processed 27000 Headlines
Processed 28000 Headlines
Processed 29000 Headlines
Processed 30000 Headlines
Processed 31000 Headlines
Processed 32000 Headlines
Processed 33000 Headlines
Processed 34000 Headlines
Processed 35000 Headlines
Processed 36000 Headlines
Processed 37000 Headlines
Processed 38000 Headlines
Processed 39000 Headl

In [20]:
data.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,stance_cat,stance_base,jaccard_similarity
0,police find mass graves least 15 bodies near m...,712,unrelated,danny boyle directing untitled film seth rogen...,3,unrelated,0.0
1,hundreds palestinians flee floods gaza israel ...,158,agree,hundreds palestinians evacuated homes sunday m...,0,related,79.545455
2,christian bale passes role steve jobs actor re...,137,unrelated,30 year old moscow resident hospitalized wound...,3,unrelated,0.0
3,hbo apple talks 15 month apple tv streaming se...,1034,unrelated,reuters canadian soldier shot canadian war mem...,3,unrelated,0.0
4,spider burrowed tourist stomach chest,1923,disagree,fear arachnophobes story bunbury spiderman mig...,1,related,28.301887


In [21]:
data.to_csv('updated_combined.csv',index=False)

## Two Class related/unrelated

In [22]:
data['stance_base'].value_counts()

unrelated    36545
related      13427
Name: stance_base, dtype: int64

In [23]:
x = data.iloc[:,-1]
y = data['stance_base']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1)
rg = RandomForestClassifier(n_estimators=100,n_jobs=-1)

In [24]:
print('X Training shape',xtrain.shape)
print('Y Training shape',ytrain.shape)
xtrain = xtrain.values.reshape(-1,1)
xtest = xtest.values.reshape(-1,1)

X Training shape (44974,)
Y Training shape (44974,)


In [25]:
rg.fit(xtrain,ytrain)
ypred = rg.predict(xtest)
print('Accuracy score on two class agree and disagree ',accuracy_score(ypred,ytest))

Accuracy score on two class agree and disagree  0.9419767907162865


## Four Class 

In [26]:
data['Stance'].value_counts()

unrelated    36545
discuss       8909
agree         3678
disagree       840
Name: Stance, dtype: int64

In [27]:
x = data.iloc[:,-1]
y = data['Stance']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1)
rg = RandomForestClassifier(n_estimators=100,n_jobs=-1)

In [28]:
print('X Training shape',xtrain.shape)
print('Y Training shape',ytrain.shape)
xtrain = xtrain.values.reshape(-1,1)
xtest = xtest.values.reshape(-1,1)

X Training shape (44974,)
Y Training shape (44974,)


In [29]:
rg.fit(xtrain,ytrain)
ypred = rg.predict(xtest)
print('Accuracy score on Four class {agree,disagree,discuss,unrelated}',accuracy_score(ypred,ytest))

Accuracy score on Four class {agree,disagree,discuss,unrelated} 0.8507402961184474
