# Alternus Vera - Identify Fake News

## Datasets:
Original Kaggle fake news dataset: 
'https://github.com/synle/machine-learning-sample-dataset/raw/master/liar_dataset/kaggle/kaggle-fake.csv'

#### This dataset is heavily skewed to fake news. I moved forward to try to find other dataset that enriches non-fake news.

Enriched Kaggle news dataset (50,000 verified non-fake news):
https://dock2.hyunwookshin.com/public/cmpe257_a1/articles1.csv

In [1]:
# dependencies
import pandas as pd
import nltk
import numpy as np
import io
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
# from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2
from string import punctuation
from nltk import PorterStemmer
import copy 
import re, math
from sklearn.model_selection import train_test_split
from nltk import WordNetLemmatizer
from nltk import bigrams
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim.models.ldamodel import LdaModel
from gensim.models import Word2Vec, Doc2Vec
from gensim.corpora import Dictionary
import pickle
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier

nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yuxu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yuxu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yuxu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yuxu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def get_parsed_data2(url):
    return pd.read_csv(io.StringIO(requests.get(url, verify=False).content.decode('utf-8')), sep=',', header='infer')

# download and parse the dataset...
data_kg_fake_news = get_parsed_data2('https://github.com/synle/machine-learning-sample-dataset/raw/master/liar_dataset/kaggle/kaggle-fake.csv')



In [3]:
data_kg_nonfake_news = get_parsed_data2('https://dock2.hyunwookshin.com/public/cmpe257_a1/articles1.csv')



# Preprocessing

In [4]:
def tokenize2(text):
    cachedStopWords = set(stopwords.words('english') + list(punctuation))
    min_length = 3
    # tokenize
    # convert to lower case
    words = map(lambda word: word.lower(), word_tokenize(text))
    # remove stop words
    words = [word for word in words if word not in cachedStopWords]
    # steming
    tokens = list(map(lambda token: PorterStemmer().stem(token), words))
    # lemmatize
    lemmas = [WordNetLemmatizer().lemmatize(word) for word in tokens]
    # only focus on alphabetic words
    p = re.compile('[a-zA-Z]+')
    
    filtered_lemmas = list(filter(lambda lemma: p.match(lemma) and len(lemma) >= min_length, lemmas))
    return filtered_lemmas

In [5]:
data_kg_nonfake_news.rename(columns={"content": "text"}, inplace=True)
data_kg_nonfake_news['type'] = 0
data_kg_fake_news.loc[data_kg_fake_news['type']!='bs', 'type'] = 0
data_kg_fake_news.loc[data_kg_fake_news['type']=='bs', 'type'] = 1
all_data = pd.concat([data_kg_fake_news[['title','text','type']], data_kg_nonfake_news[['title','text','type']]])

In [6]:
all_data['text_clean']=all_data['text'].astype('U').apply(tokenize2)
all_data['title_clean']=all_data['title'].astype('U').apply(tokenize2)

In [7]:
all_data.type.value_counts()

0    51507
1    11492
Name: type, dtype: int64

# Word2Vec

In [8]:
model = Word2Vec(all_data.text_clean, size=50)
w2v_trained = dict(zip(model.wv.index2word, model.wv.vectors))

In [11]:
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec

    def transform(self, X):
        return X.apply(self.line_to_vec)
    
    def line_to_vec(self, line):
        row = []
        for w in line:
            if w not in self.word2vec:
                row+=[0]
            else:
                row+=[np.mean(self.word2vec[w])]
        return row  

In [12]:
embedding = EmbeddingVectorizer(w2v_trained)

all_data['text_w2v_mean'] = embedding.transform(all_data['text_clean']).apply(np.mean)
all_data['title_w2v_mean'] = embedding.transform(all_data['title_clean']).apply(np.mean)

  out=out, **kwargs)


In [13]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62999 entries, 0 to 49999
Data columns (total 7 columns):
title             62319 non-null object
text              62953 non-null object
type              62999 non-null int64
text_clean        62999 non-null object
title_clean       62999 non-null object
text_w2v_mean     62755 non-null float64
title_w2v_mean    62784 non-null float64
dtypes: float64(2), int64(1), object(4)
memory usage: 3.8+ MB


# Doc2Vec

In [14]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from gensim.models.doc2vec import TaggedDocument

def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v, [label]))
    return labeled

In [15]:
doc2vec_text = label_sentences(all_data.text_clean, 'Text')
doc2vec_title = label_sentences(all_data.title_clean, 'Title')
doc2vec_all_data = doc2vec_text + doc2vec_title

In [16]:
model_dbow = Doc2Vec(dm=0, vector_size=50, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(doc2vec_all_data)])

100%|██████████| 125998/125998 [00:00<00:00, 2947181.86it/s]


In [17]:
from sklearn import utils

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(doc2vec_all_data)]), total_examples=len(doc2vec_all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 125998/125998 [00:00<00:00, 2715302.60it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2733151.19it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2722576.69it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2556533.95it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2867232.27it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2945457.11it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2892925.89it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2933798.45it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2938039.15it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2908016.92it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2752554.33it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2953391.20it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2944915.47it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2915621.64it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2968004.15it/s]
100%|██████████| 125998/125998 [00:00<00:00, 2906657.40it/s]
100%|██████████| 125998/

In [18]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [19]:
all_data['text_d2v_mean'] = np.mean(get_vectors(model_dbow, len(all_data.text_clean), 50, 'Text'),axis=1)
all_data['title_d2v_mean'] = np.mean(get_vectors(model_dbow, len(all_data.title_clean), 50, 'Title'),axis=1)

In [24]:
from copy import deepcopy
all_data_ = deepcopy(all_data)

In [27]:
data = all_data_[['text_w2v_mean','title_w2v_mean', 'text_d2v_mean', 'title_d2v_mean']].fillna(0)

In [None]:
# all_data.to_csv('fake_news_w2v_d2v.csv')
all_data[['text_w2v_mean','title_w2v_mean', 'text_d2v_mean', 'title_d2v_mean']].to_csv('fake_news_w2v_d2v_only.csv')

In [None]:
pd.read_csv('fake_news_w2v_d2v_only.csv')