In [2]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline 

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:

sc = SparkContext("local","classifier")

21/10/21 23:00:44 WARN Utils: Your hostname, Cinderellas-Blue.local resolves to a loopback address: 127.0.0.1; using 172.17.124.132 instead (on interface en0)
21/10/21 23:00:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/10/21 23:00:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/21 23:00:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
spark = SparkSession.builder.appName("Classifier").config('spark.executor.memory', '1G').getOrCreate()

In [6]:
df = spark.read.csv("data/gossipcop_tweets_processed.csv",header=True, sep = ',',multiLine=True, escape = "\"",inferSchema =True)



In [17]:
df.show()

+------------------+-------------------+--------------+-------------+------------------+--------------------+--------+---------------+------------------+--------------------+----+-------+
|          tweet_id|         created_at|favorite_count|retweet_count|           user_id|            location|verified|followers_count|            source|                text|fake|news_id|
+------------------+-------------------+--------------+-------------+------------------+--------------------+--------+---------------+------------------+--------------------+----+-------+
|853737542499348480|2017-04-16 22:31:06|             0|            0|         527095730|   Sydney, Australia|   false|        11062.0|         Hootsuite|"Bristol Palin Lo...| 0.0| 843836|
|853743876854530048|2017-04-16 22:56:16|             0|            0|         230056211|                null|   false|          297.0|           dlvr.it|"Bristol Palin Lo...| 0.0| 843836|
|853842612775665664|2017-04-17 05:28:37|             0|     

In [3]:
#df = df.select('news_id')

In [7]:
df.dtypes

[('tweet_id', 'bigint'),
 ('created_at', 'string'),
 ('favorite_count', 'int'),
 ('retweet_count', 'int'),
 ('user_id', 'bigint'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('followers_count', 'double'),
 ('source', 'string'),
 ('text', 'string'),
 ('fake', 'double'),
 ('news_id', 'bigint')]

In [8]:
df = df.select('created_at','text','location','verified','source', 'followers_count','retweet_count','favorite_count','fake')

In [9]:
df = df.withColumn('created_at', to_timestamp(col('created_at'), 'yyyy-MM-dd HH:mm:ss'))

In [23]:
df = df.withColumn('created_at', df.created_at.cast("long"))

In [24]:
df.dtypes

[('created_at', 'bigint'),
 ('text', 'string'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('source', 'string'),
 ('followers_count', 'double'),
 ('retweet_count', 'int'),
 ('favorite_count', 'int'),
 ('fake', 'double')]

In [25]:
df = df.dropna(subset=('text'))

In [26]:
df = df.fillna('null', subset = ('source', 'location'))

In [13]:
tokenizer_text = Tokenizer(inputCol='text',outputCol='text1')
stopwords_remover_text = StopWordsRemover(inputCol='text1',outputCol='text2')
vectorizer_text = CountVectorizer(inputCol='text2',outputCol='text3')
idf_text = IDF(inputCol='text3',outputCol='text_vc')

In [14]:
tokenizer_location = Tokenizer(inputCol='location',outputCol='location1')
stopwords_remover_location = StopWordsRemover(inputCol='location1',outputCol='location2')
vectorizer_location = CountVectorizer(inputCol='location2',outputCol='location3')
idf_location = IDF(inputCol='location3',outputCol='location_vc')

In [15]:
tokenizer_source = Tokenizer(inputCol='source',outputCol='source1')
stopwords_remover_source = StopWordsRemover(inputCol='source1',outputCol='source2')
vectorizer_source = CountVectorizer(inputCol='source2',outputCol='source3')
idf_source = IDF(inputCol='source3',outputCol='source_vc')

In [27]:
(trainDF,testDF) = df.randomSplit((0.7,0.3),seed=42)

In [28]:
input_cols = ['created_at','text_vc','location_vc','verified','source_vc', 'followers_count','retweet_count','favorite_count']

In [29]:
assembler = VectorAssembler(inputCols=input_cols,outputCol="features")

In [30]:
lr = LogisticRegression(featuresCol='features',labelCol='fake')

In [31]:
pipeline = Pipeline(stages=[tokenizer_text,stopwords_remover_text,vectorizer_text,idf_text,
                            tokenizer_source,stopwords_remover_source,vectorizer_source,idf_source,
                            tokenizer_location,stopwords_remover_location,vectorizer_location,idf_location,
                            assembler,lr])

In [32]:
lr_model = pipeline.fit(trainDF)

21/10/21 23:13:28 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
21/10/21 23:15:05 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/21 23:16:20 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/21 23:17:12 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/10/21 23:17:12 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
21/10/21 23:17:17 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/21 23:17:22 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/21 23:17:27 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/21 23:17:32 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/21 23:17:34 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/21 23:17:36 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/2

In [33]:
predictions = lr_model.transform(testDF)

In [34]:
predictions = predictions.select('fake','prediction')

In [35]:
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedRecall')

In [36]:
print('Accuracy:', evaluator_accuracy.evaluate(predictions))
print('F1:',evaluator_f1.evaluate(predictions))
print('Precision:',evaluator_precision.evaluate(predictions))
print('Recall:',evaluator_recall.evaluate(predictions))

21/10/21 23:25:44 WARN DAGScheduler: Broadcasting large task binary with size 28.7 MiB


Accuracy: 0.9720232877392038


21/10/21 23:26:48 WARN DAGScheduler: Broadcasting large task binary with size 28.7 MiB


F1: 0.9720037686883372


21/10/21 23:27:50 WARN DAGScheduler: Broadcasting large task binary with size 28.7 MiB


Precision: 0.971997792961933


21/10/21 23:28:42 WARN DAGScheduler: Broadcasting large task binary with size 28.7 MiB


Recall: 0.9720232877392039
