In [1]:
import numpy as np
import pandas as pd

fake_news = pd.read_csv('data\\Fake.csv')
real_news = pd.read_csv('data\\True.csv')

Create labels for real and fake news and merge them

In [2]:
fake_news['label'] = np.zeros(fake_news.shape[0])
fake_news = fake_news.drop(['date'],axis=1)
real_news['label'] = np.ones(real_news.shape[0])
real_news = real_news.drop(['date'],axis=1)
all_news = fake_news.append(real_news).reset_index(drop=True)
all_news

Unnamed: 0,title,text,subject,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,0.0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,0.0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,0.0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,0.0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,0.0
...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,1.0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,1.0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,1.0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,1.0


Let's shuffle before going further

In [3]:
from sklearn.utils import shuffle

all_news = shuffle(all_news).reset_index(drop=True)
all_news

Unnamed: 0,title,text,subject,label
0,Having nuclear weapons 'matter of life and dea...,MOSCOW (Reuters) - Pyongyang does not plan to ...,worldnews,1.0
1,WATCH: Trump’s Insult To The Troops He Didn’t...,Donald Trump has once again used images of non...,News,0.0
2,PROACTIVE PRESIDENT TRUMP Just Took Huge Step ...,"1[1pro-]: relating to, caused by, or being in...",left-news,0.0
3,Obama to announce Supreme Court nominee,WASHINGTON (Reuters) - U.S. President Barack O...,politicsNews,1.0
4,Leave It To Seth Meyers To Absolutely PUMMEL ...,It s gotten to the point that if you re still ...,News,0.0
...,...,...,...,...
44893,House Speaker Ryan mulls retirement after 2018...,WASHINGTON (Reuters) - Republican House Speake...,politicsNews,1.0
44894,TUCKER CARLSON: Why Brutal MS-13 Gang (Obama’s...,"The notoriously violent MS-13 street gang, kno...",left-news,0.0
44895,Iraqi PM orders security services 'to protect ...,BAGHDAD (Reuters) - Iraqi Prime Minister Haide...,worldnews,1.0
44896,Brazil captures most wanted arms trafficker in...,RIO DE JANEIRO (Reuters) - Brazil s most wante...,worldnews,1.0


Check for nulls...

In [4]:
all_news.isnull().sum()

title      0
text       0
subject    0
label      0
dtype: int64

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.7, min_df = 1)
feature_1 = vectorizer.fit_transform(all_news['text'])
feature_1.shape

(44898, 121987)

### Modelling

In [135]:
X = feature_1.sorted_indices()
X

<44898x121987 sparse matrix of type '<class 'numpy.float64'>'
	with 8834253 stored elements in Compressed Sparse Row format>

In [136]:
n_train = int(X.shape[0]*0.7)
n_val = int(X.shape[0]*0.85)
y = all_news['label'].values
X_train = X[:n_train]
y_train = y[:n_train]
X_val = X[n_train:n_val]
y_val = y[n_train:n_val]
X_test = X[n_val:]
y_test = y[n_val:]

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

def create_dnn(X_train,y_train,X_val,y_val):
    model = Sequential()
    model.add(Dense(100, input_shape=(X_train.shape[1],), activation='relu')) 
    model.add(Dense(100, activation = 'relu'))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train,y_train, 
          batch_size=500, 
          validation_data= (X_val, y_val),
          epochs=5, verbose=1)
    return model

In [9]:
model = create_dnn(X_train,y_train,X_val,y_val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
prediction = np.round(model.predict(X_test),1)
prediction = prediction.ravel()
correct = sum(prediction == y_test)
print('model predicted %2.2f percent correctly'%(correct/len(y_test)*100))

model predicted 96.32 percent correctly


### Unsupervised learning
Lets pretend we don't know the label, what news is fake and what is real. We will now build a classifier based on claustering. 

In [127]:
from sklearn.cluster import KMeans
import re
from gensim.models import Word2Vec


#function to remove special charaters, stops and converts sentenses to arrays of strings
def clean(text):
    return re.sub("[^A-Za-z']+", ' ', text).lower().split()

# word emdedding: convert sentence arrays to vectors of length 100 using a gensim Word2Vec model
def vectorizer(text):
    try:
        vectors = model[text]
        vectors = np.average(vectors, axis=0)
    except:  # if senentence array is empty return zeros
        vectors = np.zeros(100)
    return list(vectors)

In [117]:
texts = all_news['text'].values
cleaned = list(map(clean, texts))
model = Word2Vec(cleaned, min_count=1)   # build a vector model with the tokenized string data

In [139]:
X_list = []
for row in cleaned:
    X_list.append(vectorizer(row))

  # This is added back by InteractiveShellApp.init_path()


In [143]:
X_array = np.array(X_list)
split = int(X_array.shape[0]*0.7)
X_train = X_array[:split]
X_test = X_array[split:]
y = all_news['label'].values
y_train = y[:split]
y_test = y[split:]

In [144]:
kmeans_model = KMeans(n_clusters=2,max_iter=300).fit(X_train)

In [147]:
prediction = np.round(kmeans_model.predict(X_test),1)
prediction = prediction.ravel()
correct = sum(prediction == y_test)
print('model predicted %2.2f percent correctly'%(correct/len(y_test)*100))

model predicted 86.33 percent correctly


__Not bad!__