In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
sc = SparkContext()

In [5]:
spark = SparkSession(sc)

### Read the dataset

In [6]:
# Read fake.csv
fake = spark.read.csv('Du_lieu_cung_cap/fake-and-real-news-dataset/Fake.csv', header=True, inferSchema=True)

In [7]:
fake.show(5)

+--------------------+--------------------+-------+-----------------+
|               title|                text|subject|             date|
+--------------------+--------------------+-------+-----------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|
+--------------------+--------------------+-------+-----------------+
only showing top 5 rows



In [8]:
# Add 'class' column
fake = fake.withColumn('class', lit('fake'))
fake.show(3)

+--------------------+--------------------+-------+-----------------+-----+
|               title|                text|subject|             date|class|
+--------------------+--------------------+-------+-----------------+-----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017| fake|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017| fake|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017| fake|
+--------------------+--------------------+-------+-----------------+-----+
only showing top 3 rows



In [9]:
fake.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- class: string (nullable = false)



In [10]:
fake.count()

23489

In [11]:
# Read fake.csv
true = spark.read.csv('Du_lieu_cung_cap/fake-and-real-news-dataset/True.csv', header=True, inferSchema=True)

In [12]:
true.show(5)

+--------------------+--------------------+------------+------------------+
|               title|                text|     subject|              date|
+--------------------+--------------------+------------+------------------+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |
+--------------------+--------------------+------------+------------------+
only showing top 5 rows



In [13]:
# Add 'class' column
true = true.withColumn('class', lit('true'))
true.show(3)

+--------------------+--------------------+------------+------------------+-----+
|               title|                text|     subject|              date|class|
+--------------------+--------------------+------------+------------------+-----+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 | true|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 | true|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 | true|
+--------------------+--------------------+------------+------------------+-----+
only showing top 3 rows



In [14]:
true.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- class: string (nullable = false)



In [15]:
true.count()

21417

In [16]:
# Concate 2 dataframes 
data = fake.union(true)

In [17]:
# Count number of rows
data.count()

44906

In [18]:
# Count number of each groups
data.groupBy('class').count().show()

+-----+-----+
|class|count|
+-----+-----+
| fake|23489|
| true|21417|
+-----+-----+



### Clean and Prepare the data

In [19]:
data = data.select('text', 'class')

In [20]:
# Check null
data.select([count(when(col(c).isNull(), c)).alias(c) for c in data.columns]).toPandas()

Unnamed: 0,text,class
0,8,0


In [21]:
data.select([count(when(isnan(c), c)).alias(c) for c in data.columns]).toPandas()

Unnamed: 0,text,class
0,0,0


In [22]:
# Drop null value
data = data.dropna()
data.count()

44898

In [23]:
# Check duplicates
dup_rows = data.count() - data.distinct().count()
dup_rows

6253

- Có dữ liệu bị duplicated

In [24]:
# Drop duplicates
data = data.drop_duplicates()
data.count()

38645

In [25]:
# Create column 'length'
data = data.withColumn('length', length(data['text']))
data.show(5)

+--------------------+-----+------+
|                text|class|length|
+--------------------+-----+------+
|Sunday morning, a...| fake|  2857|
|This is yet anoth...| fake|  3586|
|If Donald Trump h...| fake|  1862|
|Donald Trump just...| fake|  2058|
|Donald Trump is c...| fake|  1917|
+--------------------+-----+------+
only showing top 5 rows



In [26]:
data.groupBy('class').mean().show()

+-----+------------------+
|class|       avg(length)|
+-----+------------------+
| fake|2475.3811952099927|
| true| 2368.108248395621|
+-----+------------------+



- There is not much difference in length from fake and true news

### Feature Transformations

In [27]:
fake_true_to_num = StringIndexer(inputCol='class', outputCol='label')

tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vec = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

In [28]:
clean_up = VectorAssembler(inputCols=['tf_idf'], 
                           outputCol='features')

### Pipeline

In [29]:
data_prep_pipe = Pipeline(stages=[fake_true_to_num,
                                  tokenizer,
                                  stopremove,
                                  count_vec,
                                  idf,
                                  clean_up
                                 ])

In [30]:
cleaner = data_prep_pipe.fit(data)

In [31]:
clean_data = cleaner.transform(data)

### Training and Evaluation

In [32]:
clean_data = clean_data.select('label', 'features')
clean_data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262144,[0,1,3,4,...|
|  1.0|(262144,[0,1,7,9,...|
|  1.0|(262144,[0,1,2,4,...|
|  1.0|(262144,[0,1,2,4,...|
|  1.0|(262144,[0,1,2,4,...|
+-----+--------------------+
only showing top 5 rows



In [33]:
# Split into training, testing set
training, testing = clean_data.randomSplit([0.8, 0.2])

### Use NaiveBayes to predict model

In [34]:
nb = NaiveBayes()

In [35]:
predictor_nb = nb.fit(training)

In [36]:
test_results_nb = predictor_nb.transform(testing)

In [37]:
test_results_nb.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(262144,[0,1,2,3,...|[-26238.441942200...|           [1.0,0.0]|       0.0|
|  0.0|(262144,[0,1,2,3,...|[-17471.817844328...|           [1.0,0.0]|       0.0|
|  0.0|(262144,[0,1,2,3,...|[-22623.605710002...|           [1.0,0.0]|       0.0|
|  0.0|(262144,[0,1,2,3,...|[-9488.3940867271...|[1.0,4.3575672301...|       0.0|
|  0.0|(262144,[0,1,2,3,...|[-18317.369734260...|           [1.0,0.0]|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [38]:
test_results_nb.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 3393|
|  0.0|       1.0|   25|
|  1.0|       0.0|   96|
|  0.0|       0.0| 4167|
+-----+----------+-----+



In [39]:
acc_eval_nb = MulticlassClassificationEvaluator()
acc_nb = acc_eval_nb.evaluate(test_results_nb)
print('Accuracy of model using NaiveBayes is: {}'.format(acc_nb))

Accuracy of model using NaiveBayes is: 0.9842320189122389


### Use Random Forest to predict the model

In [40]:
rf = RandomForestClassifier()

In [41]:
predictor_rf = rf.fit(training)

In [42]:
test_results_rf = predictor_rf.transform(testing)

In [43]:
test_results_rf.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(262144,[0,1,2,3,...|[12.0969949045104...|[0.60484974522552...|       0.0|
|  0.0|(262144,[0,1,2,3,...|[13.7288336404883...|[0.68644168202441...|       0.0|
|  0.0|(262144,[0,1,2,3,...|[11.5538763930763...|[0.57769381965381...|       0.0|
|  0.0|(262144,[0,1,2,3,...|[12.7398412857785...|[0.63699206428892...|       0.0|
|  0.0|(262144,[0,1,2,3,...|[12.8145650406246...|[0.64072825203123...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [44]:
test_results_rf.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 1800|
|  0.0|       1.0|   29|
|  1.0|       0.0| 1689|
|  0.0|       0.0| 4163|
+-----+----------+-----+



In [45]:
acc_eval_rf = MulticlassClassificationEvaluator()
acc_rf = acc_eval_rf.evaluate(test_results_rf)
print('Accuracy of model using Random Forest is: {}'.format(acc_rf))

Accuracy of model using Random Forest is: 0.7599055719105214


- Naive Bayes has ~98.4% accuracy so we'll choose Naive Bayes