In [1]:
import findspark
findspark.init()

In [2]:
# import libraries
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
from pyspark.sql.functions import mean, stddev, col, log
from pyspark.sql.functions import to_date, dayofweek, to_timestamp
from pyspark.sql import types 
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import year, month
from pyspark.sql.functions import dayofmonth, weekofyear
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import coalesce, first, lit
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import OneHotEncoder, StringIndexer, OneHotEncoderEstimator
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import datediff
from pyspark.sql.functions import when

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes

In [3]:
spark = SparkSession.builder.appName('news').getOrCreate()

In [4]:
dfFake = spark.read.csv('Fake.csv', header= True, inferSchema=True)
dfFake.show(5)

+--------------------+--------------------+-------+-----------------+
|               title|                text|subject|             date|
+--------------------+--------------------+-------+-----------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|
+--------------------+--------------------+-------+-----------------+
only showing top 5 rows



In [5]:
dfTrue = spark.read.csv('True.csv', header= True, inferSchema=True)
dfTrue.show(5)

+--------------------+--------------------+------------+------------------+
|               title|                text|     subject|              date|
+--------------------+--------------------+------------+------------------+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |
+--------------------+--------------------+------------+------------------+
only showing top 5 rows



##### Có thể drop cột date do không ảnh hưởng đến model

In [6]:
dfFake = dfFake.drop('date')
dfTrue = dfTrue.drop('date')

In [7]:
dfFake.count()

23489

In [8]:
dfTrue.count()

21417

##### Dữ liệu khá cân bằng

In [9]:
display(dfFake.printSchema(), dfTrue.printSchema())

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)



None

None

##### Trước khi merge dữ liệu cần tạo 2 cột classifications label

In [10]:
from pyspark.sql.functions import lit
dfFake = dfFake.withColumn('classification', lit('Fake'))
dfFake.show()

+--------------------+--------------------+-------+--------------+
|               title|                text|subject|classification|
+--------------------+--------------------+-------+--------------+
| Donald Trump Sen...|Donald Trump just...|   News|          Fake|
| Drunk Bragging T...|House Intelligenc...|   News|          Fake|
| Sheriff David Cl...|On Friday, it was...|   News|          Fake|
| Trump Is So Obse...|On Christmas day,...|   News|          Fake|
| Pope Francis Jus...|Pope Francis used...|   News|          Fake|
| Racist Alabama C...|The number of cas...|   News|          Fake|
| Fresh Off The Go...|Donald Trump spen...|   News|          Fake|
| Trump Said Some ...|In the wake of ye...|   News|          Fake|
| Former CIA Direc...|Many people have ...|   News|          Fake|
| WATCH: Brand-New...|Just when you mig...|   News|          Fake|
| Papa John’s Foun...|A centerpiece of ...|   News|          Fake|
| WATCH: Paul Ryan...|Republicans are w...|   News|          F

In [11]:
dfTrue = dfTrue.withColumn('classification', lit('True'))
dfTrue.show()

+--------------------+--------------------+--------------------+--------------+
|               title|                text|             subject|classification|
+--------------------+--------------------+--------------------+--------------+
|As U.S. budget fi...|WASHINGTON (Reute...|        politicsNews|          True|
|U.S. military to ...|WASHINGTON (Reute...|        politicsNews|          True|
|Senior U.S. Repub...|WASHINGTON (Reute...|        politicsNews|          True|
|FBI Russia probe ...|WASHINGTON (Reute...|        politicsNews|          True|
|Trump wants Posta...|SEATTLE/WASHINGTO...|        politicsNews|          True|
|White House, Cong...|WEST PALM BEACH, ...|        politicsNews|          True|
|Trump says Russia...|WEST PALM BEACH, ...|        politicsNews|          True|
|Factbox: Trump on...|The following sta...|        politicsNews|          True|
|Trump on Twitter ...|The following sta...|        politicsNews|          True|
|Alabama official ...|WASHINGTON (Reute.

In [12]:
df = dfFake.union(dfTrue)
df.show()

+--------------------+--------------------+-------+--------------+
|               title|                text|subject|classification|
+--------------------+--------------------+-------+--------------+
| Donald Trump Sen...|Donald Trump just...|   News|          Fake|
| Drunk Bragging T...|House Intelligenc...|   News|          Fake|
| Sheriff David Cl...|On Friday, it was...|   News|          Fake|
| Trump Is So Obse...|On Christmas day,...|   News|          Fake|
| Pope Francis Jus...|Pope Francis used...|   News|          Fake|
| Racist Alabama C...|The number of cas...|   News|          Fake|
| Fresh Off The Go...|Donald Trump spen...|   News|          Fake|
| Trump Said Some ...|In the wake of ye...|   News|          Fake|
| Former CIA Direc...|Many people have ...|   News|          Fake|
| WATCH: Brand-New...|Just when you mig...|   News|          Fake|
| Papa John’s Foun...|A centerpiece of ...|   News|          Fake|
| WATCH: Paul Ryan...|Republicans are w...|   News|          F

In [13]:
df.count()

44906

In [14]:
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
title,0
text,0
subject,0
classification,0


##### Không có dữ liệu NaN

In [15]:
df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
title,0
text,8
subject,8
classification,0


##### Có rất ít dữ liệu null => xóa các cột null

In [16]:
df = df.dropna()
df.count()

44898

In [17]:
df.crosstab('subject', 'classification').count()

823

In [18]:
df.crosstab('subject', 'classification').show()

+----------------------+----+----+
|subject_classification|Fake|True|
+----------------------+----+----+
|   fellow members t...|   1|   0|
|   2016Proud to see...|   1|   0|
|   2017Watch:This i...|   1|   0|
|  "" @TedCruz? Hard...|   1|   0|
|   2016People quick...|   1|   0|
|   ""regret"" or ""...|   1|   0|
|   2015And the runn...|   3|   0|
|   2017Pathetic exc...|   1|   0|
|        if implemented|   0|   1|
|   2016Here s the v...|   2|   0|
|   2016Kaine s stat...|   1|   0|
|   Don't shoot lie....|   2|   0|
|   according to an ...|   0|   1|
|   matters related ...|   0|   1|
|   but I'm tired of...|   2|   0|
|  "" a Chuck Schume...|   1|   0|
|   2017Featured ima...|   1|   0|
|   2016Rigged! @goo...|   1|   0|
|  "" and has said h...|   0|   1|
|   when does ""art"...|   1|   0|
+----------------------+----+----+
only showing top 20 rows



In [19]:
df.count() - df.distinct().count()

474

In [20]:
df = df.dropDuplicates()
df.count()

44424

##### Có thể sử dụng subject làm 1 feature

In [21]:
token1 = Tokenizer(inputCol = 'title', outputCol = 'token_text1')
swremove1 = StopWordsRemover(inputCol = 'token_text1', outputCol = 'stop_tokens1')
count_vec1 = CountVectorizer(inputCol = 'stop_tokens1', outputCol = 'c_vec1')
idf1 = IDF(inputCol = 'c_vec1', outputCol = 'tf_idf1')

token2 = Tokenizer(inputCol = 'text', outputCol = 'token_text2')
swremove2 = StopWordsRemover(inputCol = 'token_text2', outputCol = 'stop_tokens2')
count_vec2 = CountVectorizer(inputCol = 'stop_tokens2', outputCol = 'c_vec2')
idf2 = IDF(inputCol = 'c_vec2', outputCol = 'tf_idf2')

token3 = Tokenizer(inputCol = 'subject', outputCol = 'token_text3')
swremove3 = StopWordsRemover(inputCol = 'token_text3', outputCol = 'stop_tokens3')
count_vec3 = CountVectorizer(inputCol = 'stop_tokens3', outputCol = 'c_vec3')
idf3 = IDF(inputCol = 'c_vec3', outputCol = 'tf_idf3')

idx = StringIndexer(inputCol = 'classification', outputCol = 'label')

In [22]:
clean = VectorAssembler(inputCols = ['tf_idf1', 'tf_idf2', 'tf_idf3'], outputCol = 'features')

In [23]:
pipe = Pipeline(stages = [idx, token1, swremove1, count_vec1, idf1, token2, swremove2, count_vec2, idf2, token3, swremove3, count_vec3, idf3, clean])

In [24]:
cleaner = pipe.fit(df)

In [25]:
final = cleaner.transform(df)

In [26]:
final = final.select(['features', 'label'])

In [27]:
final.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(310220,[0,1,16,3...|  0.0|
|(310220,[0,1,21,1...|  0.0|
|(310220,[1,3,98,1...|  0.0|
|(310220,[0,1,42,1...|  0.0|
|(310220,[0,1,35,8...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [29]:
train_data, test_data = final.randomSplit([0.75, 0.25])