In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

We'll load our data into Spark for processing.

In [2]:
sc=SparkSession.builder.getOrCreate()

DF_True=sc.read.csv('True.csv',header=True,inferSchema=True)
DF_Fake=sc.read.csv('Fake.csv',header=True,inferSchema=True)

Add 'Truthness' column--flagging the value of the article.

In [3]:
DF_True=DF_True.withColumn('Truthness',lit(1.0))
DF_Fake=DF_Fake.withColumn('Truthness',lit(0.0))
DF_True.show(5)
DF_Fake.show(5)

+--------------------+--------------------+------------+------------------+---------+
|               title|                text|     subject|              date|Truthness|
+--------------------+--------------------+------------+------------------+---------+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |      1.0|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |      1.0|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |      1.0|
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |      1.0|
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |      1.0|
+--------------------+--------------------+------------+------------------+---------+
only showing top 5 rows

+--------------------+--------------------+-------+-----------------+---------+
|               title|                text|subject|             date|Truthness|
+--------------------+-------------------

Combine into single dataframe, remove rows with '"' as title (since their rows are null also), and replace remaining null values with empty strings.

Restrict to 'title', 'text', and 'Truthness' columns, since we decided to remove 'subject' and 'date' for bias.

In [4]:
DF=DF_True.union(DF_Fake)
DF=DF.filter(DF['title']!='"')
DF=DF.fillna('')
DF=DF.select(['title','text','Truthness'])
DF.sample(False,.0002).show()

+--------------------+--------------------+---------+
|               title|                text|Truthness|
+--------------------+--------------------+---------+
|Connecticut attor...|WASHINGTON (Reute...|      1.0|
|"Collins says Sen...|WASHINGTON (Reute...|      1.0|
|Factbox: Republic...|(Reuters) - After...|      1.0|
|No new U.S. admin...|WASHINGTON (Reute...|      1.0|
|Republican foreig...|WASHINGTON (Reute...|      1.0|
|Crime, casualties...|KUNDUZ, Afghanist...|      1.0|
|Congo sets presid...|KINSHASA, (Reuter...|      1.0|
|Rescue efforts en...|MUMBAI (Reuters) ...|      1.0|
| Kushner: I Lied ...|Donald Trump s so...|      0.0|
| Dr. Drew: Trump ...|Dr. Drew Pinsky k...|      0.0|
| Paul Ryan Though...|House Speaker Pau...|      0.0|
| Watch Dan Savage...|Ann Coulter is on...|      0.0|
|MUSLIM ASSIMILATI...|This is a story i...|      0.0|
|BUCKLE UP: 2008 W...|A fake economy ca...|      0.0|
|OOPS…CDC EMPLOYEE...|I guess we re not...|      0.0|
|WATCH TRUMP Call ...|In a p

Tokenize and remove filteredwords below.

In [5]:
filteredwords = ["(Reuters)","-","Trump's",'"I','"The','"We']

titleTokenizer = RegexTokenizer(inputCol='title', outputCol='title_token')
textTokenizer = RegexTokenizer(inputCol='text', outputCol='text_token')

stopwords = StopWordsRemover(inputCol='text_token', outputCol='textfiltered', stopWords=filteredwords)

pipeline_etl = Pipeline(stages=[titleTokenizer,textTokenizer,stopwords])
DF = pipeline_etl.fit(DF).transform(DF)

In [6]:
DF.show()

+--------------------+--------------------+---------+--------------------+--------------------+--------------------+
|               title|                text|Truthness|         title_token|          text_token|        textfiltered|
+--------------------+--------------------+---------+--------------------+--------------------+--------------------+
|As U.S. budget fi...|WASHINGTON (Reute...|      1.0|[as, u.s., budget...|[washington, (reu...|[washington, the,...|
|U.S. military to ...|WASHINGTON (Reute...|      1.0|[u.s., military, ...|[washington, (reu...|[washington, tran...|
|Senior U.S. Repub...|WASHINGTON (Reute...|      1.0|[senior, u.s., re...|[washington, (reu...|[washington, the,...|
|FBI Russia probe ...|WASHINGTON (Reute...|      1.0|[fbi, russia, pro...|[washington, (reu...|[washington, trum...|
|Trump wants Posta...|SEATTLE/WASHINGTO...|      1.0|[trump, wants, po...|[seattle/washingt...|[seattle/washingt...|
|White House, Cong...|WEST PALM BEACH, ...|      1.0|[white, hou

Pandas ETL for the neural network:

(here we'll just grab the 'text' body and exclude the 'title')

In [6]:
filteredwords = ['(Reuters)','-',"Trump's",'"I','"The','"We']

# load, add 'Truthness', concatenate
real = pd.read_csv('True.csv')
real['Truthness'] = 1
fake = pd.read_csv('Fake.csv')
fake['Truthness'] = 0
DF_Pandas = pd.concat([real,fake])
del real,fake

# remove filteredwords
for word in filteredwords:
    DF_Pandas['text']=DF_Pandas['text'].str.replace(word,'')

# split data here
train, test = train_test_split(DF_Pandas[['text','Truthness']], test_size=0.2, random_state=3)
del DF_Pandas

# split between text and label
train_x = train['text']
train_y = train['Truthness']
test_x = test['text']
test_y = test['Truthness']
del train, test

# tokenize top words
maxwords = 10000
tokenizer = Tokenizer(num_words=maxwords, lower=True, split=' ')
tokenizer.fit_on_texts(train_x.values)

  DF_Pandas['text']=DF_Pandas['text'].str.replace(word,'')
