In [4]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline 

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import warnings
# Silence every Python-side warning for the rest of the notebook (no category or
# module filter is given). This does not affect Spark's own JVM log lines.
warnings.filterwarnings("ignore")

Import the necessary packages and read the CSV file.

In [5]:
# Create (or reuse) the Spark session for this notebook, requesting 8G of
# memory for both the executor and the driver.
spark = (
    SparkSession.builder
    .appName("Classifier")
    .config('spark.executor.memory', '8G')
    .config("spark.driver.memory", "8G")
    .getOrCreate()
)

In [6]:
# Load the processed tweet dataset. The first row is the header, records may
# span several lines, '"' is the escape character inside quoted fields, and
# column types are inferred from the data.
df = spark.read.csv(
    "politifact_tweets_processed.csv",
    header=True,
    sep=',',
    multiLine=True,
    escape="\"",
    inferSchema=True,
)



In [7]:
# Preview the first 20 rows; long cell values are truncated for display.
df.show(n=20, truncate=True)

+------------------+-------------------+--------------+-------------+----------+--------------------+--------+---------------+--------------------+--------------------+----+-------+
|          tweet_id|         created_at|favorite_count|retweet_count|   user_id|            location|verified|followers_count|              source|                text|fake|news_id|
+------------------+-------------------+--------------+-------------+----------+--------------------+--------+---------------+--------------------+--------------------+----+-------+
|         890268075|2008-08-17 15:47:34|             0|            0|  15189613|                null|   false|          113.0|  Twitter Web Client|'usa: The Long Ru...| 0.0|    636|
|         890634203|2008-08-18 01:46:24|             0|            0|  15209631|                null|   false|           68.0|  Twitter Web Client|'d: The Long Run ...| 0.0|    636|
|         889821427|2008-08-17 00:53:16|             0|            0|  14882359|          

In [8]:
# Leftover experiment, intentionally disabled: enabling it would reduce df to the
# single news_id column, and the later feature select needs the other columns.
#df = df.select('news_id')

In [9]:
# Inspect the inferred column types before choosing the feature columns.
df.dtypes

[('tweet_id', 'bigint'),
 ('created_at', 'string'),
 ('favorite_count', 'int'),
 ('retweet_count', 'int'),
 ('user_id', 'bigint'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('followers_count', 'double'),
 ('source', 'string'),
 ('text', 'string'),
 ('fake', 'double'),
 ('news_id', 'int')]

In [10]:
# Keep only the model inputs plus the label column 'fake'; the identifier
# columns (tweet_id, user_id, news_id) are dropped here.
df = df.select(
    'created_at',
    'text',
    'location',
    'verified',
    'source',
    'followers_count',
    'retweet_count',
    'favorite_count',
    'fake',
)

Select the feature columns we want to use, together with the label column (`fake`).

In [11]:
# Parse the 'created_at' strings (e.g. "2008-08-17 15:47:34") into timestamps.
df = df.withColumn(
    'created_at',
    to_timestamp(df['created_at'], 'yyyy-MM-dd HH:mm:ss'),
)

In [12]:
# Turn the parsed timestamp into a plain long (bigint) so it can be used as a
# numeric feature.
df = df.withColumn('created_at', col('created_at').cast('long'))

In [13]:
# Confirm the remaining columns and that 'created_at' is now numeric.
df.dtypes

[('created_at', 'bigint'),
 ('text', 'string'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('source', 'string'),
 ('followers_count', 'double'),
 ('retweet_count', 'int'),
 ('favorite_count', 'int'),
 ('fake', 'double')]

In [14]:
# Discard rows that have no tweet text.
df = df.na.drop(subset=['text'])

In [15]:
# Replace missing 'source' / 'location' values with the literal string 'null'
# so these string columns contain no nulls for the tokenizers defined later.
df = df.na.fill('null', subset=['source', 'location'])

Vectorize the string columns (`text`, `location`, `source`).

In [16]:
# Feature stages for 'text':
# text -> text1 (tokens) -> text2 (stop words removed) -> text3 (term counts)
#      -> text_vc (IDF-weighted vector).
tokenizer_text = Tokenizer(
    inputCol='text',
    outputCol='text1',
)
stopwords_remover_text = StopWordsRemover(
    inputCol='text1',
    outputCol='text2',
)
vectorizer_text = CountVectorizer(
    inputCol='text2',
    outputCol='text3',
)
idf_text = IDF(
    inputCol='text3',
    outputCol='text_vc',
)

In [17]:
# Feature stages for 'location':
# location -> location1 (tokens) -> location2 (stop words removed)
#          -> location3 (term counts) -> location_vc (IDF-weighted vector).
tokenizer_location = Tokenizer(
    inputCol='location',
    outputCol='location1',
)
stopwords_remover_location = StopWordsRemover(
    inputCol='location1',
    outputCol='location2',
)
vectorizer_location = CountVectorizer(
    inputCol='location2',
    outputCol='location3',
)
idf_location = IDF(
    inputCol='location3',
    outputCol='location_vc',
)

In [18]:
# Feature stages for 'source':
# source -> source1 (tokens) -> source2 (stop words removed)
#        -> source3 (term counts) -> source_vc (IDF-weighted vector).
tokenizer_source = Tokenizer(
    inputCol='source',
    outputCol='source1',
)
stopwords_remover_source = StopWordsRemover(
    inputCol='source1',
    outputCol='source2',
)
vectorizer_source = CountVectorizer(
    inputCol='source2',
    outputCol='source3',
)
idf_source = IDF(
    inputCol='source3',
    outputCol='source_vc',
)

In [19]:
# 70% / 30% train/test split with a fixed seed.
trainDF, testDF = df.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Columns fed to the VectorAssembler, in this order. The *_vc entries are the
# IDF output columns of the text / location / source stages.
input_cols = [
    'created_at',
    'text_vc',
    'location_vc',
    'verified',
    'source_vc',
    'followers_count',
    'retweet_count',
    'favorite_count',
]

In [21]:
# Concatenate every column listed in input_cols into one "features" vector.
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features",
)

Use a gradient-boosted trees (GBT) classifier as the model, and chain all the steps into a single pipeline.

In [22]:
# Gradient-boosted trees classifier: reads the assembled "features" vector and
# is trained against the 'fake' label column.
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='fake',
)

In [23]:
# Full pipeline. Each column's stages are listed in order (tokenize -> remove
# stop words -> count -> IDF), followed by the assembler and the classifier.
pipeline = Pipeline(stages=[
    # 'text' features
    tokenizer_text,
    stopwords_remover_text,
    vectorizer_text,
    idf_text,
    # 'source' features
    tokenizer_source,
    stopwords_remover_source,
    vectorizer_source,
    idf_source,
    # 'location' features
    tokenizer_location,
    stopwords_remover_location,
    vectorizer_location,
    idf_location,
    # assemble all inputs and classify
    assembler,
    gbt,
])

In [24]:
# Fit every pipeline stage (feature stages and the GBT classifier) on the
# training split only. NOTE: the recorded log timestamps show this cell is slow
# (12:19 to after 13:01) and that a cached RDD did not fit in memory.
gbt_model = pipeline.fit(trainDF)

21/10/22 12:19:37 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
21/10/22 12:20:07 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:20:11 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:20:29 WARN DAGScheduler: Broadcasting large task binary with size 29.1 MiB
21/10/22 12:22:45 WARN DAGScheduler: Broadcasting large task binary with size 1198.4 KiB
21/10/22 12:22:47 WARN DAGScheduler: Broadcasting large task binary with size 31.6 MiB
21/10/22 12:23:04 WARN MemoryStore: Not enough space to cache rdd_86_0 in memory! (computed 3.8 GiB so far)
21/10/22 12:23:04 WARN BlockManager: Persisting block rdd_86_0 to disk instead.
21/10/22 12:46:51 WARN MemoryStore: Not enough space to cache rdd_86_0 in memory! (computed 3.8 GiB so far)
21/10/22 12:46:51 WARN MemoryStore: Not enough space to cache rdd_86_0 in memory! (computed 316.5 MiB so far)
21/10/22 13:01:23 WARN DAGScheduler: Broadcasting large task binary wit

In [25]:
# Run the fitted pipeline on the held-out test split to get predictions.
predictions = gbt_model.transform(testDF)

In [26]:
# Keep only the true label and the predicted class, and cache the result.
# Without the cache, each evaluator.evaluate(predictions) call later in the
# notebook re-runs the whole fitted pipeline over testDF (the recorded logs show
# the same large broadcast repeated once per metric). The cache is filled
# lazily by the first action and reused by the following ones.
predictions = predictions.select('fake', 'prediction').cache()

In [27]:
# One evaluator per metric; all compare the 'fake' label with 'prediction'.
# Precision and recall use the label-frequency-weighted variants.
evaluator_accuracy, evaluator_f1, evaluator_precision, evaluator_recall = (
    MulticlassClassificationEvaluator(
        labelCol='fake',
        predictionCol='prediction',
        metricName=metric_name,
    )
    for metric_name in ('accuracy', 'f1', 'weightedPrecision', 'weightedRecall')
)

In [28]:
# Report each metric on the test predictions, in a fixed order.
# 'Precision' and 'Recall' are the weighted variants configured on their
# evaluators; weighted recall is numerically the same as accuracy, which is why
# those two printed values match.
for label, evaluator in (
    ('Accuracy:', evaluator_accuracy),
    ('F1:', evaluator_f1),
    ('Precision:', evaluator_precision),
    ('Recall:', evaluator_recall),
):
    print(label, evaluator.evaluate(predictions))

21/10/23 07:55:56 WARN DAGScheduler: Broadcasting large task binary with size 26.2 MiB
21/10/23 07:55:58 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/10/23 07:55:58 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Accuracy: 0.8883961275787915


21/10/23 07:56:05 WARN DAGScheduler: Broadcasting large task binary with size 26.2 MiB


F1: 0.8810972042119023


21/10/23 07:56:14 WARN DAGScheduler: Broadcasting large task binary with size 26.2 MiB


Precision: 0.8923251137983503


21/10/23 07:56:22 WARN DAGScheduler: Broadcasting large task binary with size 26.2 MiB


Recall: 0.8883961275787915


