In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier

from pyspark.ml import Pipeline 

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


import warnings
# Silence every Python-level warning for the rest of the notebook.
warnings.filterwarnings("ignore")

# Reuse the active Spark session if one exists, otherwise start one named
# "Classifier" that requests 8G for both the executors and the driver.
spark = (
    SparkSession.builder
    .appName("Classifier")
    .config('spark.executor.memory', '8G')
    .config("spark.driver.memory", "8G")
    .getOrCreate()
)


21/10/22 01:08:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Load the raw news table. Quoted fields may span several lines
# (multiLine) and embedded quotes are escaped with a double quote.
df = spark.read.csv(
    "gossipcop_news.csv",
    header=True,
    sep=',',
    multiLine=True,
    escape="\"",
    inferSchema=True,
)



In [3]:
# Exploratory check: list the names exposed by pyspark.ml.classification
# (used to see which classifiers are available).
dir(pyspark.ml.classification)

['ABCMeta',
 'ArrayType',
 'BinaryLogisticRegressionSummary',
 'BinaryLogisticRegressionTrainingSummary',
 'BinaryRandomForestClassificationSummary',
 'BinaryRandomForestClassificationTrainingSummary',
 'ClassificationModel',
 'Classifier',
 'DataFrame',
 'DecisionTreeClassificationModel',
 'DecisionTreeClassifier',
 'DecisionTreeRegressionModel',
 'DefaultParamsReader',
 'DefaultParamsWriter',
 'DoubleType',
 'Estimator',
 'FMClassificationModel',
 'FMClassificationSummary',
 'FMClassificationTrainingSummary',
 'FMClassifier',
 'GBTClassificationModel',
 'GBTClassifier',
 'HasAggregationDepth',
 'HasBlockSize',
 'HasElasticNetParam',
 'HasFitIntercept',
 'HasMaxBlockSizeInMB',
 'HasMaxIter',
 'HasParallelism',
 'HasProbabilityCol',
 'HasRawPredictionCol',
 'HasRegParam',
 'HasSeed',
 'HasSolver',
 'HasStandardization',
 'HasStepSize',
 'HasThreshold',
 'HasThresholds',
 'HasTol',
 'HasTrainingSummary',
 'HasWeightCol',
 'JavaMLReadable',
 'JavaMLReader',
 'JavaMLWritable',
 'JavaMLWri

In [4]:
# Inspect the column names and the types Spark inferred while reading the CSV.
df.dtypes

[('news_id', 'bigint'),
 ('url', 'string'),
 ('text', 'string'),
 ('num_images', 'int'),
 ('domain', 'string'),
 ('publish_date', 'string'),
 ('fake', 'int'),
 ('authors', 'string')]

In [5]:
# Keep the candidate feature columns plus the 'fake' label;
# 'news_id', 'url' and 'authors' are left behind.
df_v = df.select(['text', 'num_images', 'domain', 'publish_date', 'fake'])

In [6]:
# Drop rows where either tokenized column is null, in a single pass.
# The original wrote subset=('text'), which is just the string 'text' in
# parentheses rather than a tuple; an explicit list states the intent and
# replaces the two chained calls. With the default how='any', a row is
# removed when any listed column is null, matching the sequential version.
df_v = df_v.dropna(subset=['text', 'domain'])

In [7]:
# Row count remaining after the null filtering on 'text' and 'domain'.
df_v.count()

17559

In [8]:
# TF-IDF feature stages for the article body:
#   text -> text1 (tokens) -> text2 (stop words removed)
#        -> text3 (term counts) -> text_vc (IDF-weighted vector)
tokenizer_text = Tokenizer(
    inputCol='text',
    outputCol='text1',
)
stopwords_remover_text = StopWordsRemover(
    inputCol='text1',
    outputCol='text2',
)
vectorizer_text = CountVectorizer(
    inputCol='text2',
    outputCol='text3',
)
idf_text = IDF(
    inputCol='text3',
    outputCol='text_vc',
)

In [9]:
# The same four-step TF-IDF chain, applied to the 'domain' column:
#   domain -> domain1 (tokens) -> domain2 (stop words removed)
#          -> domain3 (term counts) -> domain_vc (IDF-weighted vector)
tokenizer_domain = Tokenizer(
    inputCol='domain',
    outputCol='domain1',
)
stopwords_remover_domain = StopWordsRemover(
    inputCol='domain1',
    outputCol='domain2',
)
vectorizer_domain = CountVectorizer(
    inputCol='domain2',
    outputCol='domain3',
)
idf_domain = IDF(
    inputCol='domain3',
    outputCol='domain_vc',
)

In [10]:
# 70/30 train/test split; the fixed seed keeps the row assignment repeatable.
trainDF, testDF = df_v.randomSplit([0.7, 0.3], seed=42)

In [11]:
# Show the training split's schema (a bare DataFrame displays only column names and types).
trainDF

DataFrame[text: string, num_images: int, domain: string, publish_date: string, fake: int]

In [12]:
# Concatenate both TF-IDF vectors and the raw image count into the single
# 'features' column consumed by the classifiers.
assembler = VectorAssembler(
    inputCols=['text_vc', 'domain_vc', 'num_images'],
    outputCol="features",
)

In [13]:
# Logistic-regression candidate. The pipeline assembled in this notebook
# uses `gbt` as its final stage, so this estimator is defined but not fitted.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='fake',
)

In [14]:
# Gradient-boosted trees classifier used as the pipeline's final stage.
# The seed is pinned (same value as the train/test split) so repeated runs
# do not depend on the library's default seed.
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='fake',
    seed=42,
)

In [15]:
# Stage order matters: each stage reads the column written by the one
# before it, and the assembler needs both *_vc columns to exist.
pipeline = Pipeline(
    stages=[
        # article text -> text_vc
        tokenizer_text,
        stopwords_remover_text,
        vectorizer_text,
        idf_text,
        # domain -> domain_vc
        tokenizer_domain,
        stopwords_remover_domain,
        vectorizer_domain,
        idf_domain,
        # combine the features, then classify
        assembler,
        gbt,
    ]
)

In [16]:
# Fit all pipeline stages, in order, on the training split only.
# NOTE(review): the captured log shows this step running for roughly an hour
# with cache-spill warnings — worth confirming the feature-vector size.
model = pipeline.fit(trainDF)

21/10/22 01:09:12 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
21/10/22 01:09:26 WARN DAGScheduler: Broadcasting large task binary with size 21.6 MiB
21/10/22 01:09:28 WARN DAGScheduler: Broadcasting large task binary with size 21.6 MiB
21/10/22 01:09:37 WARN DAGScheduler: Broadcasting large task binary with size 24.2 MiB
21/10/22 01:12:09 WARN DAGScheduler: Broadcasting large task binary with size 1038.2 KiB
21/10/22 01:12:16 WARN DAGScheduler: Broadcasting large task binary with size 28.5 MiB
21/10/22 01:12:36 WARN MemoryStore: Not enough space to cache rdd_70_0 in memory! (computed 3.3 GiB so far)
21/10/22 01:12:37 WARN BlockManager: Persisting block rdd_70_0 to disk instead.
21/10/22 01:13:38 WARN MemoryStore: Not enough space to cache rdd_70_0 in memory! (computed 3.3 GiB so far)
21/10/22 01:13:39 WARN MemoryStore: Not enough space to cache rdd_70_0 in memory! (computed 632.6 MiB so far)
21/10/22 01:14:16 WARN DAGScheduler: Broadcasting large task binary wit

In [17]:
# Run the fitted pipeline over the held-out test split.
predictions = model.transform(testDF)

In [18]:
# Keep only the label and the predicted class, and cache the result.
# Each evaluator call and the later count() is a separate Spark action;
# without caching, every one of them would re-run the pipeline transform
# over the test split. The cached frame is just two small columns.
predictions = predictions.select('fake', 'prediction').cache()

In [19]:
# One evaluator per reported metric; they differ only in metricName.
# Note that the precision and recall evaluators use the *weighted* variants.
evaluator_accuracy, evaluator_f1, evaluator_precision, evaluator_recall = (
    MulticlassClassificationEvaluator(
        labelCol='fake',
        predictionCol='prediction',
        metricName=metric_name,
    )
    for metric_name in ('accuracy', 'f1', 'weightedPrecision', 'weightedRecall')
)

In [20]:
# Report each metric on the test predictions, in a fixed order.
# 'Precision' and 'Recall' are the class-weighted variants configured on
# their evaluators, not the scores of the positive ('fake') class alone.
for label, evaluator in (
    ('Accuracy:', evaluator_accuracy),
    ('F1:', evaluator_f1),
    ('Precision:', evaluator_precision),
    ('Recall:', evaluator_recall),
):
    print(label, evaluator.evaluate(predictions))

21/10/22 02:10:05 WARN DAGScheduler: Broadcasting large task binary with size 21.7 MiB
21/10/22 02:10:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/10/22 02:10:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Accuracy: 0.8336854234684079


21/10/22 02:10:10 WARN DAGScheduler: Broadcasting large task binary with size 21.7 MiB


F1: 0.8159009537884305


21/10/22 02:10:14 WARN DAGScheduler: Broadcasting large task binary with size 21.7 MiB


Precision: 0.8276618513792522


21/10/22 02:10:18 WARN DAGScheduler: Broadcasting large task binary with size 21.7 MiB


Recall: 0.8336854234684079




In [21]:
# Number of test rows that were scored.
predictions.count()

5207