In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Read the labelled articles and promote the CSV's unnamed first column to the row index.
data = pd.read_csv('fake_or_real_news.csv').set_index("Unnamed: 0")
data.head()

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(6335, 3)

In [5]:
# Work on a copy so the cleaning steps below leave the frame read from disk (`data`) unchanged.
y = data.copy()

In [6]:
# `DataFrame.drop` returns a new frame and leaves `y` untouched unless the result
# is assigned, so the bare `y.drop("label", axis=1)` that used to sit here did
# nothing. It is removed rather than assigned because later cells read
# `y['label']` as the classification target.
y.head()

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [7]:
def _letters_and_spaces_only(raw):
    """Return `raw` with every character other than A-Z, a-z or a space replaced by a space."""
    return ''.join(
        ch if (ch == ' ' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z') else " "
        for ch in raw
    )


# Filter out numbers and special characters from both free-text columns.
for _col in ('text', 'title'):
    y[_col] = y[_col].apply(_letters_and_spaces_only)

In [8]:
# Lower-case the two text columns and the label so later comparisons ignore capitalisation.
for _col in ('text', 'title', 'label'):
    y[_col] = y[_col].str.lower()
y.head()

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,you can smell hillary s fear,daniel greenfield a shillman journalism fello...,fake
10294,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...,fake
3608,kerry to go to paris in gesture of sympathy,u s secretary of state john f kerry said mon...,real
10142,bernie supporters on twitter erupt in anger ag...,kaydee king kaydeeking november t...,fake
875,the battle of new york why this primary matters,it s primary day in new york and front runners...,real


In [13]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords') # You may need to run this first
stop_array = stopwords.words('english')
# Set copy for constant-time membership tests; the list `stop_array` is kept
# because another cell refers to it by that name.
stop_set = set(stop_array)


def _drop_stop_words(row):
    """Return `row` without stop words, remaining tokens joined by single spaces."""
    return ' '.join(word for word in row.split() if word not in stop_set)


# Remove stop words. They are dropped outright: substituting " " for each one
# (as before) left runs of blanks where the stop words used to be.
y['text'] = y['text'].apply(_drop_stop_words)
y['title'] = y['title'].apply(_drop_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mazhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Delete all non-English words. This is a problem because it also deletes names not in the corpus
# Could keep as an option to see if it increases accuracy
from nltk.corpus import words
nltk.download('words')

opt = y.copy()
# Bind the vocabulary to its own name. Assigning it to `words` replaced the
# imported `words` corpus reader with a set, so running this cell a second time
# failed on `words.words()`.
english_vocab = set(words.words())


def _keep_known_words(row):
    """Return `row` restricted to tokens present in `english_vocab`, joined by single spaces."""
    return ' '.join(w for w in row.split() if w in english_vocab)


# Unknown tokens are dropped rather than replaced by " " so no blank runs are left behind.
opt['text'] = opt['text'].apply(_keep_known_words)
opt['title'] = opt['title'].apply(_keep_known_words)
opt.head()

[nltk_data] Downloading package words to /Users/mazhang/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Preview the optional frame (`opt`).
opt.head()
# TODO: Might need to lemmatize words before vectorizing them

In [None]:
# Both splits use the same size and seed. The "target" passed in is the whole
# frame; later cells pick the `label` column out of y_train / y_test.
_split_opts = dict(test_size=0.33, random_state=53)
y2 = y.copy()

# With text
X_train, X_test, y_train, y_test = train_test_split(y['text'], y2, **_split_opts)

# With headlines
X_train2, X_test2, y_train2, y_test2 = train_test_split(y['title'], y2, **_split_opts)

In [None]:
# Build two bag-of-words representations of the article text:
#   * raw token counts (CountVectorizer)
#   * tf-idf weights (TfidfVectorizer) with max_df=0.7, i.e. ignoring terms
#     that appear in more than 70% of the articles
# Each vectorizer is fitted on the training split and then applied to the test split.

# Vectorizers have an ngram range! Check if bigrams will improve accuracy
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english') # already gets rid of stop words
count_train = count_vectorizer.fit_transform(X_train) 
count_test = count_vectorizer.transform(X_test)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)  
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Peek at the learned vocabularies: last ten tf-idf features, first ten count features.
# NOTE(review): newer scikit-learn releases expose this as `get_feature_names_out()` —
# confirm against the installed version.
print(tfidf_vectorizer.get_feature_names()[-10:])
print(count_vectorizer.get_feature_names()[:10])

In [None]:
# Lay the count matrix out as a DataFrame with one column per vocabulary term.
# NOTE(review): `.A` presumably densifies the sparse matrix, so memory scales with
# rows x vocabulary size — confirm this fits for the full corpus.
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())
count_df.head()

In [None]:
# Per-term summary statistics of the count features.
count_df.describe()

In [None]:
# Same layout for the tf-idf matrix, followed by per-term summary statistics.
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
tfidf_df.describe()

In [None]:
# check if the vectorizers extracted different tokens
# NOTE(review): `equals` compares the cell values as well as the columns, so it can be
# False even when both vocabularies match — compare the column sets to check tokens only.
print(count_df.equals(tfidf_df))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# MultinomialNB trained on the tf-idf features
np.shape(tfidf_train)
np.shape(y_train)
multi_nb = MultinomialNB()
# `fit` returns the estimator itself, so prediction can be chained onto it.
pred = multi_nb.fit(tfidf_train, y_train['label']).predict(tfidf_test)
score = accuracy_score(y_test['label'], pred)
print("accuracy:   %0.3f" % score) # Multinomial Naive Bayes accuracy: 0.857

In [None]:
# Use a confusion matrix to see how the errors split between the two classes.
# `labels` fixes the row/column order to ['fake', 'real'].
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test['label'], pred, labels=['fake', 'real'])

In [None]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Print a one-line banner and draw the confusion matrix on the current
    matplotlib figure.

    cm        -- 2-D array of counts, rows = true label, columns = predicted label.
    classes   -- tick labels, in the same order as the rows/columns of `cm`.
    normalize -- when True, each row is divided by its sum before plotting, so
                 both the colours and the cell text show per-true-class rates.
    title     -- figure title.
    cmap      -- matplotlib colormap used for the cells.

    Returns None; call `plt.show()` afterwards to render.
    """
    # Normalise first so the heat map and the numbers written into the cells
    # describe the same matrix (previously the image was drawn from the raw counts).
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Rates are shown with two decimals; raw counts are passed through unchanged.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Draw the confusion matrix held in `cm`.
plot_confusion_matrix(cm, classes=['fake', 'real'])
plt.show()

In [None]:
# Same Naive Bayes model, trained on raw token counts instead of tf-idf weights.
count_multi_nb = MultinomialNB()
count_multi_nb.fit(count_train,y_train['label'])
c_pred = count_multi_nb.predict(count_test)
# Score this model's own predictions (`c_pred`). The score used to be computed
# from `pred`, which holds another model's predictions, so the number printed
# here never reflected the count-based classifier.
c_score = accuracy_score(y_test['label'], c_pred)
print("accuracy:   %0.3f" % c_score)

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
# Linear passive-aggressive classifier on the tf-idf features: fit, score, plot.
# NOTE(review): `n_iter` is the spelling used by older scikit-learn; newer releases
# call this `max_iter` — confirm against the installed version.
linear_clf = PassiveAggressiveClassifier(n_iter=50)
linear_clf.fit(tfidf_train, y_train['label'])
pred = linear_clf.predict(tfidf_test)
linear_score = accuracy_score(y_test['label'], pred)
print("accuracy:   %0.3f" % linear_score)
linear_cm = confusion_matrix(y_test['label'], pred, labels=['fake', 'real'])
plot_confusion_matrix(linear_cm, classes=['fake', 'real'])
plt.show()

In [None]:
# Sweep the Naive Bayes smoothing parameter and keep the best-scoring fitted
# classifier in `clf`.
# NOTE(review): the score is measured on the test split, so the chosen alpha is
# tuned on the same data used for reporting accuracy.
clf = MultinomialNB(alpha=0.1)
last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train['label'])
    pred = nb_classifier.predict(tfidf_test)
    score = accuracy_score(y_test['label'], pred)
    if score > last_score:
        clf = nb_classifier
        # Remember the best score so far; without this every iteration beat the
        # initial 0 and `clf` always ended up as the last classifier tried.
        last_score = score
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

In [None]:
def most_informative(vectorizer, classifier, n=100):
    """Print the n lowest- and n highest-weighted features of a classifier.

    vectorizer -- object whose `get_feature_names()` returns the feature names.
    classifier -- object exposing `classes_` and `coef_`; only the first row,
                  `coef_[0]`, is used.
    n          -- number of features to print at each end of the ranking;
                  0 prints only the blank separator line.

    Output: n lines "<classes_[0]> <coef> <feature>" in ascending coefficient
    order, a blank line, then n lines "<classes_[1]> <coef> <feature>" in
    descending coefficient order. Returns None.
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    # Rank once and slice both ends from the same list.
    ranked = sorted(zip(classifier.coef_[0], feature_names))
    topn_class1 = ranked[:n]
    # `ranked[-0:]` is the whole list, so n == 0 needs an explicit empty result.
    topn_class2 = ranked[-n:] if n > 0 else []
    for coef,feat in topn_class1:
        print(class_labels[0],coef,feat)
    print()
    for coef,feat in reversed(topn_class2):
        print(class_labels[1],coef,feat)

In [None]:
# Print 30 features from each end of the linear classifier's coefficient ranking.
most_informative(tfidf_vectorizer,linear_clf,n=30)

In [None]:
# Most real
# Twenty features with the largest values in `clf.coef_[0]`.
# NOTE(review): `clf` is reassigned in more than one cell, so which model is ranked
# here depends on execution order — confirm it is the intended classifier.
feature_names = tfidf_vectorizer.get_feature_names()
sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20] # zip coefficients and sort them

In [None]:
# Most fake
# Twenty features with the smallest values in `clf.coef_[0]`.
sorted(zip(clf.coef_[0], feature_names))[:20]

In [None]:
# (feature, coefficient) pairs ordered alphabetically by feature name.
tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))

In [None]:
# Third representation of the article text: hashed token features.
# NOTE(review): `non_negative=True` is an older scikit-learn argument; newer releases
# use `alternate_sign=False` instead — confirm against the installed version.
from sklearn.feature_extraction.text import HashingVectorizer
hash_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)

In [None]:
# Naive Bayes on the hashed features: fit, predict, score, then plot the confusion matrix.
_label_order = ['fake', 'real']
clf = MultinomialNB(alpha=.01)
pred = clf.fit(hash_train, y_train['label']).predict(hash_test)
score = accuracy_score(y_test['label'], pred)
print("accuracy:   %0.3f" % score)
cm = confusion_matrix(y_test['label'], pred, labels=_label_order)
plot_confusion_matrix(cm, classes=_label_order)
plt.show()

In [None]:
# defining fake news with simple bag-of-words or TF-IDF vectors is an oversimplified approach

In [None]:
# look up how to make an ROC curve

In [None]:
# TODO: Other types of Naive Bayes, Logistic Regression, and KNN

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the tf-idf features: fit, predict, score, then plot the confusion matrix.
_label_order = ['fake', 'real']
logistic_clf = LogisticRegression()
pred = logistic_clf.fit(tfidf_train, y_train['label']).predict(tfidf_test)
logistic_score = accuracy_score(y_test['label'], pred)
print("accuracy:   %0.3f" % logistic_score)
logistic_cm = confusion_matrix(y_test['label'], pred, labels=_label_order)
plot_confusion_matrix(logistic_cm, classes=_label_order)
plt.show()

In [None]:
# Visualizations of fake news categories
# Features most associated with the different fake news categories
# Domain names, sentiment analysis

In [None]:
# Dump sample vocab for topic modeling: every article's text concatenated with single spaces.
# NOTE(review): with a space separator the whole corpus lands on one line, while the
# readers used further down treat each line as a separate document — confirm that
# per-article documents are not what the topic model needs.
dict_sample = y['text'].str.cat(sep=' ')

In [None]:
# Write the concatenated corpus to disk. The context manager closes (and
# flushes) the file even if the write raises.
with open('articles.txt','w',encoding='utf-8') as f:
    f.write(dict_sample)

In [None]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim

# Switch pyLDAvis to its notebook display mode.
pyLDAvis.enable_notebook()

In [None]:
# Stream the dumped corpus from disk, one whitespace-tokenised line at a time.
sent = LineSentence('articles.txt')

# learn the dictionary 
article_dict = Dictionary(sent)
    
# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
article_dict.filter_extremes(no_below=5, no_above=0.2)
article_dict.compactify()

# persist the pruned dictionary so it can be reloaded without rebuilding
article_dict.save('articles.dict')
    
# load the finished dictionary from disk
article_dict = Dictionary.load('articles.dict')

In [None]:
def bow(filepath,d): # output bag of words representation
    """Lazily yield `d.doc2bow(tokens)` for every line read from `filepath` via LineSentence."""
    yield from map(d.doc2bow, LineSentence(filepath))

In [None]:
# generate bag-of-words representations for every line of articles.txt and
# save them to disk in Matrix Market format
MmCorpus.serialize('articles.mm',
                       bow('articles.txt',article_dict))
    
# load the finished bag-of-words corpus from disk
corpus = MmCorpus('articles.mm')

In [None]:
# Create LDA model (Runs outside of Jupyter for some reason)
# lda = LdaMulticore(corpus,num_topics=10,
#                    id2word=article_dict, 
#                    workers=2)
    
# lda.save('./lda_model')

In [None]:
# Load the previously trained LDA model from disk (the training code lives in the
# commented-out cell).
# NOTE(review): the saved model presumably has to match the dictionary/corpus built
# here — retrain it if either is regenerated differently.
lda = LdaMulticore.load('./lda_model')

In [None]:
def topics (topic_number, lda_model, topn=25):
    """Print the highest-weighted terms of one LDA topic.

    topic_number -- index of the topic to show.
    lda_model    -- object whose `show_topic(topic_number, topn=...)` returns
                    (term, weight) pairs.
    topn         -- number of terms to print. The default is 25 because that is
                    what this function always printed while the argument was
                    being ignored; passing a value now takes effect.

    Prints a header row, a blank line, then one "term weight" row per term with
    the weight rounded to three decimals. Returns None.
    """
    print(u'{:10} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        print(u'{:10} {:.3f}'.format(term, round(frequency, 3)))

In [None]:
# Show the top terms of topic 2 in the whole-corpus model.
topics(2,lda)

In [None]:
# Ignore warning
# LDA is a transformation from bag-of-words counts into a topic space of lower dimensionality. It can be thought of as probability distributions over words.
# Prepare the visualisation data from the model, its corpus and its dictionary.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, corpus,
                                              article_dict)

In [None]:
# Visualization for the general corpus (fake and real articles together)
pyLDAvis.display(LDAvis_prepared)

In [None]:
# Separate the cleaned articles by label, then flatten each group's text into
# one space-separated string.
fake = y.loc[y['label'] == 'fake'].copy()
real = y.loc[y['label'] == 'real'].copy()
fake_text = fake['text'].str.cat(sep=' ')
real_text = real['text'].str.cat(sep=' ')

In [None]:
# Write each label's text to its own file. Context managers close both files;
# the handle for real.txt was previously never closed, so its contents were not
# guaranteed to be on disk when the file was read back.
with open('fake.txt','w',encoding='utf-8') as f1:
    f1.write(fake_text)
with open('real.txt','w',encoding='utf-8') as f2:
    f2.write(real_text)

In [None]:
def _build_dictionary(text_path, dict_path):
    """Build a Dictionary from `text_path`, prune and compact it, save it to
    `dict_path` and reload it. Returns (line reader, reloaded dictionary)."""
    sentences = LineSentence(text_path)
    vocab = Dictionary(sentences)
    vocab.filter_extremes(no_below=5, no_above=0.2)
    vocab.compactify()
    vocab.save(dict_path)
    return sentences, Dictionary.load(dict_path)


# Same recipe for both labels.
fake_sent, fake_dict = _build_dictionary('fake.txt', 'fake.dict')
real_sent, real_dict = _build_dictionary('real.txt', 'real.dict')

In [None]:
# Serialise a bag-of-words corpus per label to disk, then reopen each from its file.
MmCorpus.serialize('fake.mm', bow('fake.txt',fake_dict))
fake_corpus = MmCorpus('fake.mm')
MmCorpus.serialize('real.mm', bow('real.txt',real_dict))
real_corpus = MmCorpus('real.mm')

In [None]:
# Might have to run this outside of Jupyter
# fake_lda = LdaMulticore(fake_corpus,num_topics=10,id2word=fake_dict,workers=2)  
# fake_lda.save('./fake_lda_model')

# real_lda = LdaMulticore(real_corpus,num_topics=10,id2word=real_dict,workers=2)
# real_lda.save('./real_lda_model')

In [None]:
# Load the per-label LDA models saved by the (commented-out) training cell.
fake_lda = LdaMulticore.load('./fake_lda_model')
real_lda = LdaMulticore.load('./real_lda_model')

In [None]:
# Top terms of topic 2 in the fake-article model.
topics(2, fake_lda)

In [None]:
# Top terms of topic 2 in the real-article model.
topics(2, real_lda)

In [None]:
# Prepare one visualisation per label from its model, corpus and dictionary.
fake_LDAvis = pyLDAvis.gensim.prepare(fake_lda, fake_corpus, fake_dict)
real_LDAvis = pyLDAvis.gensim.prepare(real_lda, real_corpus, real_dict)

In [None]:
# Visualization for the fake articles
pyLDAvis.display(fake_LDAvis)

In [None]:
# Visualization for the real articles
pyLDAvis.display(real_LDAvis)

In [None]:
# Save each prepared visualisation as an HTML file.
pyLDAvis.save_html(fake_LDAvis,'fake_lda.html')
pyLDAvis.save_html(real_LDAvis,'real_lda.html')
pyLDAvis.save_html(LDAvis_prepared,'lda.html')

In [None]:
# Load a second dataset from fake.csv (it has a `type` column, explored below).
fake_dataset = pd.read_csv('fake.csv')

In [None]:
# Preview the first five rows.
fake_dataset.head(5)

In [None]:
# (rows, columns) of the second dataset.
fake_dataset.shape

In [None]:
# Distinct values of the `type` column.
fake_dataset.type.unique()

In [None]:
# Horizontal bar chart of the number of rows per `type`.
fake_dataset.groupby(['type']).size().plot(kind='barh')
plt.show()

In [None]:
# Remove all stop words from larger corpora
# NOTE(review): `headline_text` and `article_text` are not defined in any visible
# cell — confirm where these token lists are built before running this.
#
# The lists are filtered in one pass and updated in place. Calling
# `list.remove` while iterating over the same list skips the element that
# follows each removal and deletes the first matching occurrence rather than
# the current one, so stop words were left behind.
_stop_lookup = set(stop_array)  # constant-time membership tests

headline_text[:] = [token for token in headline_text if token not in _stop_lookup]
article_text[:] = [token for token in article_text if token not in _stop_lookup]

In [None]:
# POS tag and then stem


In [None]:
# Lemmatize the words
# Might need to tag parts of speech first and lemmatize based on that
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatise every token of both lists, preserving order.
headline_lemma = [lemmatizer.lemmatize(tok) for tok in headline_text]
article_lemma = [lemmatizer.lemmatize(tok) for tok in article_text]

In [None]:
from nltk.probability import FreqDist
# Parse out words that appear too often
# Count how often each headline token occurs.
fdist1 = FreqDist(headline_text)
fdist1

In [None]:
# Combine vectoriser and classifier into one pipeline object.
# NOTE(review): the pipeline is only displayed, never assigned or fitted.
from sklearn.pipeline import make_pipeline
make_pipeline(CountVectorizer(stop_words='english'),MultinomialNB())

In [None]:
# Persist the tf-idf Naive Bayes model together with the list of feature
# columns it was trained on.
copy = tfidf_df.copy()

# Prefer the standalone `joblib` package and fall back to the copy bundled with
# older scikit-learn: newer scikit-learn releases no longer ship
# `sklearn.externals.joblib`.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(multi_nb, 'model.pkl')
model_columns = list(copy.columns)
joblib.dump(model_columns, 'model_columns.pkl')
# NOTE(review): the fitted `tfidf_vectorizer` is not saved; presumably new text
# cannot be turned into these columns without it — confirm how the model is served.