Datasets:
Original Kaggle fake news dataset: 'https://github.com/synle/machine-learning-sample-dataset/raw/master/liar_dataset/kaggle/kaggle-fake.csv'

This dataset is heavily skewed to fake news. I moved forward to try to find other dataset that enriches non-fake news.
Enriched Kaggle news dataset (50,000 verified non-fake news): https://dock2.hyunwookshin.com/public/cmpe257_a1/articles1.csv

In [0]:
!pip install gensim



In [0]:
# dependencies
import pandas as pd
import nltk
import numpy as np
import io
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
# from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2
from string import punctuation
from nltk import PorterStemmer
import copy 
import re, math
from sklearn.model_selection import train_test_split
from nltk import WordNetLemmatizer
from nltk import bigrams
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
import pickle
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier

nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
def get_parsed_data2(url):
    return pd.read_csv(io.StringIO(requests.get(url, verify=False).content.decode('utf-8')), sep=',', header='infer')

# download and parse the dataset...
data_kg_fake_news = get_parsed_data2('https://github.com/synle/machine-learning-sample-dataset/raw/master/liar_dataset/kaggle/kaggle-fake.csv')



In [0]:
data_kg_nonfake_news = get_parsed_data2('https://dock2.hyunwookshin.com/public/cmpe257_a1/articles1.csv')



In [0]:
def tokenize2(text):
    cachedStopWords = set(stopwords.words('english') + list(punctuation))
    min_length = 3
    # tokenize
    # convert to lower case
    words = map(lambda word: word.lower(), word_tokenize(text))
    # remove stop words
    words = [word for word in words if word not in cachedStopWords]
    # steming
    tokens = list(map(lambda token: PorterStemmer().stem(token), words))
    # lemmatize
    lemmas = [WordNetLemmatizer().lemmatize(word) for word in tokens]
    # only focus on alphabetic words
    p = re.compile('[a-zA-Z]+')
    
    filtered_lemmas = list(filter(lambda lemma: p.match(lemma) and len(lemma) >= min_length, lemmas))
    return filtered_lemmas

In [0]:
data_kg_nonfake_news.rename(columns={"content": "text"}, inplace=True)
data_kg_nonfake_news['type'] = 0
data_kg_fake_news.loc[data_kg_fake_news['type']!='bs', 'type'] = 0
data_kg_fake_news.loc[data_kg_fake_news['type']=='bs', 'type'] = 1
all_data = pd.concat([data_kg_fake_news[['title','text','type']], data_kg_nonfake_news[['title','text','type']]])

In [0]:
all_data['text_clean']=all_data['text'].astype('U').apply(tokenize2)
all_data['title_clean']=all_data['title'].astype('U').apply(tokenize2)

In [0]:
all_data.type.value_counts()

0    51507
1    11492
Name: type, dtype: int64

In [0]:
all_data

Unnamed: 0,title,text,type,text_clean,title_clean
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,0,"[print, pay, back, money, plu, interest, entir...","[muslim, bust, stole, million, gov, benefit]"
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,0,"[attorney, gener, loretta, lynch, plead, fifth...","[attorney, gener, loretta, lynch, plead, fifth]"
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,0,"[red, state, fox, news, sunday, report, morn, ...","[break, weiner, cooper, fbi, hillari, email, i..."
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,0,"[email, kayla, mueller, prison, tortur, isi, c...","[pin, drop, speech, father, daughter, kidnap, ..."
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0,"[email, healthcar, reform, make, america, grea...","[fantast, trump, point, plan, reform, healthca..."
5,Hillary Goes Absolutely Berserk On Protester A...,Print Hillary goes absolutely berserk! She exp...,0,"[print, hillari, goe, absolut, berserk, explod...","[hillari, goe, absolut, berserk, protest, rall..."
6,BREAKING! NYPD Ready To Make Arrests In Weiner...,BREAKING! NYPD Ready To Make Arrests In Weiner...,0,"[break, nypd, readi, make, arrest, weiner, cas...","[break, nypd, readi, make, arrest, weiner, cas..."
7,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,BREAKING! NYPD Ready To Make Arrests In Weiner...,0,"[break, nypd, readi, make, arrest, weiner, cas...","[wow, whistleblow, tell, chill, stori, massiv,..."
8,BREAKING: CLINTON CLEARED...Was This A Coordin...,\nLimbaugh said that the revelations in the Wi...,0,"[limbaugh, said, revel, wikileak, materi, star...","[break, clinton, clear, coordin, last, minut, ..."
9,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",Email \nThese people are sick and evil. They w...,0,"[email, peopl, sick, evil, stop, noth, get, wa...","[evil, hillari, support, yell, f*ck, trump, tr..."


In [0]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
sid = SentimentIntensityAnalyzer()

def sentiment_analyzer(topics):
    score = sid.polarity_scores(" ".join(topics))
    return pd.Series(score)

all_data[['title_senti_neg','title_senti_neu','title_senti_pos','title_senti_cmpd' ] ]= all_data['title_clean'].apply(sentiment_analyzer)
all_data[['text_senti_neg','text_senti_neu','text_senti_pos','text_senti_cmp']]=all_data['text_clean'].apply(sentiment_analyzer)

In [0]:
all_data

Unnamed: 0,title,text,type,text_clean,title_clean,title_senti_neg,title_senti_neu,title_senti_pos,title_senti_cmpd,text_senti_neg,text_senti_neu,text_senti_pos,text_senti_cmp
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,0,"[print, pay, back, money, plu, interest, entir...","[muslim, bust, stole, million, gov, benefit]",0.4588,0.000,0.625,0.375,-0.3400,0.209,0.606,0.185
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,0,"[attorney, gener, loretta, lynch, plead, fifth...","[attorney, gener, loretta, lynch, plead, fifth]",0.0000,0.000,1.000,0.000,-0.2960,0.063,0.887,0.050
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,0,"[red, state, fox, news, sunday, report, morn, ...","[break, weiner, cooper, fbi, hillari, email, i...",0.0000,0.000,1.000,0.000,0.8957,0.021,0.871,0.108
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,0,"[email, kayla, mueller, prison, tortur, isi, c...","[pin, drop, speech, father, daughter, kidnap, ...",-0.7783,0.430,0.570,0.000,0.8316,0.133,0.517,0.350
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0,"[email, healthcar, reform, make, america, grea...","[fantast, trump, point, plan, reform, healthca...",0.0000,0.000,1.000,0.000,0.9517,0.066,0.765,0.170
5,Hillary Goes Absolutely Berserk On Protester A...,Print Hillary goes absolutely berserk! She exp...,0,"[print, hillari, goe, absolut, berserk, explod...","[hillari, goe, absolut, berserk, protest, rall...",-0.2500,0.250,0.750,0.000,-0.9936,0.352,0.618,0.030
6,BREAKING! NYPD Ready To Make Arrests In Weiner...,BREAKING! NYPD Ready To Make Arrests In Weiner...,0,"[break, nypd, readi, make, arrest, weiner, cas...","[break, nypd, readi, make, arrest, weiner, cas...",-0.3400,0.107,0.893,0.000,-0.9559,0.103,0.813,0.084
7,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,BREAKING! NYPD Ready To Make Arrests In Weiner...,0,"[break, nypd, readi, make, arrest, weiner, cas...","[wow, whistleblow, tell, chill, stori, massiv,...",-0.4588,0.317,0.528,0.154,-0.9836,0.138,0.844,0.019
8,BREAKING: CLINTON CLEARED...Was This A Coordin...,\nLimbaugh said that the revelations in the Wi...,0,"[limbaugh, said, revel, wikileak, materi, star...","[break, clinton, clear, coordin, last, minut, ...",0.3400,0.102,0.678,0.220,0.1027,0.044,0.902,0.054
9,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",Email \nThese people are sick and evil. They w...,0,"[email, peopl, sick, evil, stop, noth, get, wa...","[evil, hillari, support, yell, f*ck, trump, tr...",-0.4019,0.243,0.608,0.149,-0.8402,0.151,0.809,0.040


In [0]:
all_data[['title','text','text_clean','title_clean','title_senti_neg','title_senti_neu', 'title_senti_pos', 'title_senti_cmpd','text_senti_neg','text_senti_neu','text_senti_pos','text_senti_cmp']].to_csv('all_news_sentiment.csv')

In [0]:
from google.colab import files
files.download('all_news_sentiment.csv')

----------------------------------------
Exception happened during processing of request from ('::ffff:127.0.0.1', 46666, 0, 0)
Traceback (most recent call last):
  File "/usr/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.6/socketserver.py", line 721, in __init__
    self.handle()
  File "/usr/lib/python3.6/http/server.py", line 418, in handle
    self.handle_one_request()
  File "/usr/lib/python3.6/http/server.py", line 406, in handle_one_request
    method()
  File "/usr/lib/python3.6/http/server.py", line 639, in do_GET
    self.copyfile(f, self.wfile)
  File "/usr/lib/python3.6/http/server.py", line 800, in copyfile
    shutil.copyfil

In [0]:
dictionary = Dictionary(all_data['title_clean']+ all_data['text_clean'])

This is running slow and did not include in csv as no success even waiting long time 

In [0]:
def get_topics (words_tokenized):
  doc_bow = [dictionary.doc2bow(words_tokenized)]
  model_ = LdaModel(doc_bow , num_topics=1 , id2word=dictionary , passes=10)
  return[topic.replace('"','' )for topic in re.findall(r'"\w+"', list(*model_.print_topics(num_words=5))[1])]

In [0]:
all_data['text_topics']=all_data['text_clean'].apply(get_topics)
all_data['title_topics']=all_data['title_clean'].apply(get_topics)

In [0]:
all_data

In [0]:
import re, math
from collections import Counter
#inner product of 2 none zero vectors (dot product)
def get_cosine(words1, words2):
     vec1= Counter(words1)
     vec2= Counter(words2)
     intersection = vec1,vec2
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

all_data['title_text_cosine_similiar']=all_data.apply(lambda x: get_cosine(x['title_topics'],x['text_topics']),axis=1)

In [0]:
all_data


Unnamed: 0,title,text,type,text_clean,title_clean,title_senti_neg,title_senti_neu,title_senti_pos,title_senti_cmpd,text_senti_neg,text_senti_neu,text_senti_pos,text_senti_cmp
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,0,"[print, pay, back, money, plu, interest, entir...","[muslim, bust, stole, million, gov, benefit]",0.4588,0.000,0.625,0.375,-0.3400,0.209,0.606,0.185
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,0,"[attorney, gener, loretta, lynch, plead, fifth...","[attorney, gener, loretta, lynch, plead, fifth]",0.0000,0.000,1.000,0.000,-0.2960,0.063,0.887,0.050
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,0,"[red, state, fox, news, sunday, report, morn, ...","[break, weiner, cooper, fbi, hillari, email, i...",0.0000,0.000,1.000,0.000,0.8957,0.021,0.871,0.108
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,0,"[email, kayla, mueller, prison, tortur, isi, c...","[pin, drop, speech, father, daughter, kidnap, ...",-0.7783,0.430,0.570,0.000,0.8316,0.133,0.517,0.350
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0,"[email, healthcar, reform, make, america, grea...","[fantast, trump, point, plan, reform, healthca...",0.0000,0.000,1.000,0.000,0.9517,0.066,0.765,0.170
5,Hillary Goes Absolutely Berserk On Protester A...,Print Hillary goes absolutely berserk! She exp...,0,"[print, hillari, goe, absolut, berserk, explod...","[hillari, goe, absolut, berserk, protest, rall...",-0.2500,0.250,0.750,0.000,-0.9936,0.352,0.618,0.030
6,BREAKING! NYPD Ready To Make Arrests In Weiner...,BREAKING! NYPD Ready To Make Arrests In Weiner...,0,"[break, nypd, readi, make, arrest, weiner, cas...","[break, nypd, readi, make, arrest, weiner, cas...",-0.3400,0.107,0.893,0.000,-0.9559,0.103,0.813,0.084
7,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,BREAKING! NYPD Ready To Make Arrests In Weiner...,0,"[break, nypd, readi, make, arrest, weiner, cas...","[wow, whistleblow, tell, chill, stori, massiv,...",-0.4588,0.317,0.528,0.154,-0.9836,0.138,0.844,0.019
8,BREAKING: CLINTON CLEARED...Was This A Coordin...,\nLimbaugh said that the revelations in the Wi...,0,"[limbaugh, said, revel, wikileak, materi, star...","[break, clinton, clear, coordin, last, minut, ...",0.3400,0.102,0.678,0.220,0.1027,0.044,0.902,0.054
9,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",Email \nThese people are sick and evil. They w...,0,"[email, peopl, sick, evil, stop, noth, get, wa...","[evil, hillari, support, yell, f*ck, trump, tr...",-0.4019,0.243,0.608,0.149,-0.8402,0.151,0.809,0.040
