In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline 

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:

# Create the local SparkContext, or reuse the one that is already running.
# Calling the SparkContext constructor directly raises an error when this cell
# is executed a second time in the same kernel; getOrCreate makes the cell
# safely re-runnable.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("classifier")
)

21/10/22 12:18:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/22 12:18:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Obtain the SparkSession used for all DataFrame work below.
# NOTE(review): a SparkContext is created in an earlier cell, so getOrCreate()
# presumably attaches to it and the executor-memory setting may have no effect
# on the already-running context -- confirm.
spark = (
    SparkSession.builder
    .appName("Classifier")
    .config('spark.executor.memory', '1G')
    .getOrCreate()
)

In [5]:
# Load the processed tweet CSV. Tweet text can span several lines and contains
# quoted quotes, hence multiLine and the double-quote escape character.
df = spark.read.csv(
    "data/politifact_tweets_processed.csv",
    header=True,
    sep=',',
    multiLine=True,
    escape="\"",
    inferSchema=True,
)



In [6]:
# Preview the first 20 rows with long cell values truncated (the defaults).
df.show(n=20, truncate=True)

+------------------+-------------------+--------------+-------------+----------+--------------------+--------+---------------+--------------------+--------------------+----+-------+
|          tweet_id|         created_at|favorite_count|retweet_count|   user_id|            location|verified|followers_count|              source|                text|fake|news_id|
+------------------+-------------------+--------------+-------------+----------+--------------------+--------+---------------+--------------------+--------------------+----+-------+
|         890268075|2008-08-17 15:47:34|             0|            0|  15189613|                null|   false|          113.0|  Twitter Web Client|'usa: The Long Ru...| 0.0|    636|
|         890634203|2008-08-18 01:46:24|             0|            0|  15209631|                null|   false|           68.0|  Twitter Web Client|'d: The Long Run ...| 0.0|    636|
|         889821427|2008-08-17 00:53:16|             0|            0|  14882359|          

In [7]:
#df = df.select('news_id')

In [8]:
# Inspect the column types that were inferred when the CSV was read.
df.dtypes

[('tweet_id', 'bigint'),
 ('created_at', 'string'),
 ('favorite_count', 'int'),
 ('retweet_count', 'int'),
 ('user_id', 'bigint'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('followers_count', 'double'),
 ('source', 'string'),
 ('text', 'string'),
 ('fake', 'double'),
 ('news_id', 'int')]

In [9]:
# Keep only the columns used as model inputs plus the 'fake' label;
# identifier columns (tweet_id, user_id, news_id) are dropped.
df = df.select(
    'created_at',
    'text',
    'location',
    'verified',
    'source',
    'followers_count',
    'retweet_count',
    'favorite_count',
    'fake',
)

In [10]:
# Parse the string 'created_at' column into a timestamp using the
# 'yyyy-MM-dd HH:mm:ss' pattern.
df = df.withColumn(
    'created_at',
    to_timestamp(col('created_at'), 'yyyy-MM-dd HH:mm:ss'),
)

In [11]:
# Cast 'created_at' to a long so it can be used as a plain numeric feature.
df = df.withColumn('created_at', df['created_at'].cast("long"))

In [12]:
# Confirm the types after column selection and the 'created_at' conversion.
df.dtypes

[('created_at', 'bigint'),
 ('text', 'string'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('source', 'string'),
 ('followers_count', 'double'),
 ('retweet_count', 'int'),
 ('favorite_count', 'int'),
 ('fake', 'double')]

In [13]:
# Discard rows that have no tweet text.
df = df.na.drop(subset=['text'])

In [14]:
# Replace missing 'source' / 'location' values with the literal string 'null'
# so the tokenizers downstream always receive a string.
df = df.na.fill('null', subset=['source', 'location'])

In [15]:
# TF-IDF stages for the tweet text: text -> text1 -> text2 -> text3 -> text_vc.
tokenizer_text = Tokenizer(
    inputCol='text',
    outputCol='text1',
)
stopwords_remover_text = StopWordsRemover(
    inputCol='text1',
    outputCol='text2',
)
vectorizer_text = CountVectorizer(
    inputCol='text2',
    outputCol='text3',
)
idf_text = IDF(
    inputCol='text3',
    outputCol='text_vc',
)

In [16]:
# TF-IDF stages for the user location:
# location -> location1 -> location2 -> location3 -> location_vc.
tokenizer_location = Tokenizer(
    inputCol='location',
    outputCol='location1',
)
stopwords_remover_location = StopWordsRemover(
    inputCol='location1',
    outputCol='location2',
)
vectorizer_location = CountVectorizer(
    inputCol='location2',
    outputCol='location3',
)
idf_location = IDF(
    inputCol='location3',
    outputCol='location_vc',
)

In [17]:
# TF-IDF stages for the tweet source (client name):
# source -> source1 -> source2 -> source3 -> source_vc.
tokenizer_source = Tokenizer(
    inputCol='source',
    outputCol='source1',
)
stopwords_remover_source = StopWordsRemover(
    inputCol='source1',
    outputCol='source2',
)
vectorizer_source = CountVectorizer(
    inputCol='source2',
    outputCol='source3',
)
idf_source = IDF(
    inputCol='source3',
    outputCol='source_vc',
)

In [18]:
# 70/30 train/test split with a fixed seed.
trainDF, testDF = df.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Columns assembled into the final feature vector: the three TF-IDF vectors
# plus the raw numeric/boolean tweet and user attributes.
input_cols = [
    'created_at',
    'text_vc',
    'location_vc',
    'verified',
    'source_vc',
    'followers_count',
    'retweet_count',
    'favorite_count',
]

In [20]:
# Concatenate all input columns into a single 'features' vector column.
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features",
)

In [21]:
# Logistic regression on the assembled features, predicting the 'fake' label.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='fake',
)

In [22]:
# Full training pipeline: three independent TF-IDF branches (text, source,
# location), then feature assembly, then the classifier.
pipeline = Pipeline(
    stages=[
        # tweet text
        tokenizer_text,
        stopwords_remover_text,
        vectorizer_text,
        idf_text,
        # tweet source
        tokenizer_source,
        stopwords_remover_source,
        vectorizer_source,
        idf_source,
        # user location
        tokenizer_location,
        stopwords_remover_location,
        vectorizer_location,
        idf_location,
        # assemble + classify
        assembler,
        lr,
    ]
)

In [23]:
# Fit every pipeline stage (vocabularies, IDF weights, logistic regression)
# on the training split only.
lr_model = pipeline.fit(trainDF)

21/10/22 12:19:16 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
21/10/22 12:19:41 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:19:54 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:20:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/10/22 12:20:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
21/10/22 12:20:08 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:20:09 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:20:11 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:20:12 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:20:14 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/22 12:20:16 WARN DAGScheduler: Broadcasting large task binary with size 26.1 MiB
21/10/2

In [24]:
# Apply the fitted pipeline to the held-out test split.
predictions = lr_model.transform(testDF)

In [25]:
# Keep only the true label and the predicted label, and cache the result.
# Each evaluator call below triggers its own Spark job; without caching, every
# one of them re-runs the whole feature pipeline over the test split.
predictions = predictions.select('fake', 'prediction').cache()

In [26]:
# One evaluator per reported metric, all comparing 'prediction' to 'fake'.
# Note: 'f1', 'weightedPrecision' and 'weightedRecall' are class-weighted
# averages; weighted recall is mathematically equal to accuracy.
evaluator_accuracy, evaluator_f1, evaluator_precision, evaluator_recall = (
    MulticlassClassificationEvaluator(
        labelCol='fake',
        predictionCol='prediction',
        metricName=metric,
    )
    for metric in ('accuracy', 'f1', 'weightedPrecision', 'weightedRecall')
)

In [27]:
# Report the test-set metrics. The F1 / precision / recall evaluators compute
# class-weighted averages (metric names 'f1', 'weightedPrecision',
# 'weightedRecall'), so they are labelled as such: weighted recall is always
# identical to accuracy and must not be read as recall of the 'fake' class.
print('Accuracy:', evaluator_accuracy.evaluate(predictions))
print('Weighted F1 score:', evaluator_f1.evaluate(predictions))
print('Weighted precision:', evaluator_precision.evaluate(predictions))
print('Weighted recall:', evaluator_recall.evaluate(predictions))

21/10/22 12:22:36 WARN DAGScheduler: Broadcasting large task binary with size 28.4 MiB


Accuracy: 0.9609528235748214


21/10/22 12:22:46 WARN DAGScheduler: Broadcasting large task binary with size 28.4 MiB


F1 score: 0.9608444327485309


21/10/22 12:22:54 WARN DAGScheduler: Broadcasting large task binary with size 28.4 MiB


Precision: 0.9607886620012115


21/10/22 12:23:02 WARN DAGScheduler: Broadcasting large task binary with size 28.4 MiB


Recall: 0.9609528235748214


