In [3]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline 
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LinearSVC



from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import warnings
warnings.filterwarnings("ignore")

In [29]:
# Initialize Spark Session
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.csv("./data_new/gossipcop_news.csv",header=True, sep = ',',multiLine=True, escape = "\"",inferSchema =True)

In [31]:
df.count()

19798

In [32]:
df.dtypes

[('news_id', 'bigint'),
 ('url', 'string'),
 ('text', 'string'),
 ('num_images', 'int'),
 ('domain', 'string'),
 ('publish_date', 'string'),
 ('fake', 'int'),
 ('authors', 'string')]

In [33]:
df = df.select('text','num_images','domain','fake')

In [34]:
df.dtypes

[('text', 'string'),
 ('num_images', 'int'),
 ('domain', 'string'),
 ('fake', 'int')]

In [35]:
df = df.dropna(subset=('text', 'domain'))
df = df.fillna(0, subset = ('num_images'))

In [36]:
tokenizer_text = Tokenizer(inputCol='text',outputCol='text1')
stopwords_remover_text = StopWordsRemover(inputCol='text1',outputCol='text2')
vectorizer_text = CountVectorizer(inputCol='text2',outputCol='text3')
idf_text = IDF(inputCol='text3',outputCol='text_vc')

In [37]:
tokenizer_domain = Tokenizer(inputCol='domain',outputCol='domain1')
stopwords_remover_domain = StopWordsRemover(inputCol='domain1',outputCol='domain2')
vectorizer_domain = CountVectorizer(inputCol='domain2',outputCol='domain3')
idf_domain = IDF(inputCol='domain3',outputCol='domain_vc')

In [38]:
(trainDF,testDF) = df.randomSplit((0.7,0.3),seed=42)

In [39]:
assembler = VectorAssembler(inputCols=['text_vc','num_images'],outputCol="features")

In [40]:
# featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
# labelIndexer = StringIndexer(inputCol="fake", outputCol="indexedLabel")

In [41]:
rf = RandomForestClassifier(labelCol="fake", featuresCol="features", numTrees=10)

In [42]:
# pipeline = Pipeline(stages=[tokenizer_text,stopwords_remover_text,vectorizer_text,idf_text,
#                             tokenizer_source,stopwords_remover_source,vectorizer_source,idf_source,
#                             tokenizer_location,stopwords_remover_location,vectorizer_location,idf_location,
#                             assembler,labelIndexer, featureIndexer, rf])

pipeline = Pipeline(stages=[tokenizer_text,stopwords_remover_text,vectorizer_text,idf_text,
                            tokenizer_domain,stopwords_remover_domain,vectorizer_domain,idf_domain,
                            assembler, rf])

In [43]:
model = pipeline.fit(trainDF)


In [44]:
predictions = model.transform(testDF)

In [45]:
predictions = predictions.select('fake','prediction')

In [46]:
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedRecall')

In [47]:
print('Accuracy:', evaluator_accuracy.evaluate(predictions))
print('F1:',evaluator_f1.evaluate(predictions))
print('Precision:',evaluator_precision.evaluate(predictions))
print('Recall:',evaluator_recall.evaluate(predictions))

Accuracy: 0.7557134626464375
F1: 0.6505649694845179
Precision: 0.5711028376250684
Recall: 0.7557134626464375
