In [29]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LogisticRegression

from pyspark.ml import Pipeline 

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import warnings
warnings.filterwarnings("ignore")

In [30]:
spark = SparkSession.builder.appName("Classifier").config('spark.executor.memory','8G').config("spark.driver.memory", "8G").getOrCreate()

In [31]:
df = spark.read.csv("gossipcop_with_users.csv",header=True, sep = ',',multiLine=True, escape = "\"",inferSchema =True)



In [32]:
df.show()

+------------------+----------------+--------------+-------------+------------------+--------------------+--------+---------------+------------------+--------------------+----+-------+---------------+-------------+------------+----------------+-----------+--------------+--------------------+---------------+
|          tweet_id|created_at_tweet|favorite_count|retweet_count|           user_id|            location|verified|followers_count|            source|                text|fake|news_id|created_at_user|friends_count|listed_count|favourites_count|geo_enabled|statuses_count|has_extended_profile|default_profile|
+------------------+----------------+--------------+-------------+------------------+--------------------+--------+---------------+------------------+--------------------+----+-------+---------------+-------------+------------+----------------+-----------+--------------+--------------------+---------------+
|853737542499348480|      1492381866|             0|            0|       

In [33]:
#df = df.select('news_id')

In [34]:
df.dtypes

[('tweet_id', 'bigint'),
 ('created_at_tweet', 'int'),
 ('favorite_count', 'int'),
 ('retweet_count', 'int'),
 ('user_id', 'bigint'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('followers_count', 'double'),
 ('source', 'string'),
 ('text', 'string'),
 ('fake', 'double'),
 ('news_id', 'bigint'),
 ('created_at_user', 'double'),
 ('friends_count', 'double'),
 ('listed_count', 'double'),
 ('favourites_count', 'double'),
 ('geo_enabled', 'boolean'),
 ('statuses_count', 'double'),
 ('has_extended_profile', 'boolean'),
 ('default_profile', 'boolean')]

In [35]:
df = df.select('created_at_tweet','created_at_user','text','location','verified','source', 'followers_count','retweet_count','favorite_count','friends_count','listed_count','favourites_count','statuses_count','geo_enabled','has_extended_profile','default_profile','fake')
#df = df.select('created_at','text','location','verified','source', 'followers_count','retweet_count','favorite_count','fake')

In [36]:
#df = df.withColumn('created_at_tweet', to_timestamp(col('created_at_tweet'), 'yyyy-MM-dd HH:mm:ss'))
#df = df.withColumn('created_at_user', to_timestamp(col('created_at_user'), 'yyyy-MM-dd HH:mm:ss'))

In [37]:
#df = df.withColumn('created_at', df.created_at.cast("long"))

In [38]:
df.dtypes

[('created_at_tweet', 'int'),
 ('created_at_user', 'double'),
 ('text', 'string'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('source', 'string'),
 ('followers_count', 'double'),
 ('retweet_count', 'int'),
 ('favorite_count', 'int'),
 ('friends_count', 'double'),
 ('listed_count', 'double'),
 ('favourites_count', 'double'),
 ('statuses_count', 'double'),
 ('geo_enabled', 'boolean'),
 ('has_extended_profile', 'boolean'),
 ('default_profile', 'boolean'),
 ('fake', 'double')]

In [39]:
df = df.dropna(subset=('text'))
df = df.dropna(subset = ('geo_enabled','has_extended_profile','default_profile'))

In [40]:
df = df.fillna('null', subset = ('source', 'location'))

In [41]:
tokenizer_text = Tokenizer(inputCol='text',outputCol='text1')
stopwords_remover_text = StopWordsRemover(inputCol='text1',outputCol='text2')
vectorizer_text = CountVectorizer(inputCol='text2',outputCol='text3')
idf_text = IDF(inputCol='text3',outputCol='text_vc')

In [42]:
tokenizer_location = Tokenizer(inputCol='location',outputCol='location1')
stopwords_remover_location = StopWordsRemover(inputCol='location1',outputCol='location2')
vectorizer_location = CountVectorizer(inputCol='location2',outputCol='location3')
idf_location = IDF(inputCol='location3',outputCol='location_vc')

In [43]:
tokenizer_source = Tokenizer(inputCol='source',outputCol='source1')
stopwords_remover_source = StopWordsRemover(inputCol='source1',outputCol='source2')
vectorizer_source = CountVectorizer(inputCol='source2',outputCol='source3')
idf_source = IDF(inputCol='source3',outputCol='source_vc')

In [44]:
(trainDF,testDF) = df.randomSplit((0.7,0.3),seed=42)

In [45]:
input_cols = ['created_at_tweet','created_at_user','text_vc','location_vc','verified','source_vc', 'followers_count','retweet_count','favorite_count','friends_count','listed_count','favourites_count','statuses_count','geo_enabled','has_extended_profile','default_profile']

#input_cols = ['created_at','text_vc','location_vc','verified','source_vc', 'followers_count','retweet_count','favorite_count']

In [46]:
assembler = VectorAssembler(inputCols=input_cols,outputCol="features")

In [47]:
gbt = GBTClassifier(featuresCol='features',labelCol='fake')

In [48]:
pipeline = Pipeline(stages=[tokenizer_text,stopwords_remover_text,vectorizer_text,idf_text,
                            tokenizer_source,stopwords_remover_source,vectorizer_source,idf_source,
                            tokenizer_location,stopwords_remover_location,vectorizer_location,idf_location,
                            assembler,gbt])

In [49]:
gbt_model = pipeline.fit(trainDF)

21/10/29 13:45:25 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
21/10/29 13:46:29 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/29 13:46:35 WARN DAGScheduler: Broadcasting large task binary with size 26.4 MiB
21/10/29 13:47:15 WARN DAGScheduler: Broadcasting large task binary with size 29.4 MiB
21/10/29 13:50:04 WARN DAGScheduler: Broadcasting large task binary with size 1215.9 KiB
21/10/29 13:50:06 WARN DAGScheduler: Broadcasting large task binary with size 31.9 MiB
21/10/29 13:50:18 WARN MemoryStore: Not enough space to cache rdd_679_0 in memory! (computed 2.5 GiB so far)
21/10/29 13:50:18 WARN BlockManager: Persisting block rdd_679_0 to disk instead.
21/10/29 14:43:15 WARN MemoryStore: Not enough space to cache rdd_679_0 in memory! (computed 3.9 GiB so far)
21/10/29 14:43:16 WARN MemoryStore: Not enough space to cache rdd_679_0 in memory! (computed 208.4 MiB so far)
21/10/29 15:22:35 WARN DAGScheduler: Broadcasting large task binary

In [50]:
predictions = gbt_model.transform(testDF)

In [51]:
predictions = predictions.select('fake','prediction')

In [52]:

evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedRecall')


In [53]:

print('Accuracy:', evaluator_accuracy.evaluate(predictions))
print('F1:',evaluator_f1.evaluate(predictions))
print('Precision:',evaluator_precision.evaluate(predictions))
print('Recall:',evaluator_recall.evaluate(predictions))


21/10/31 19:44:53 WARN DAGScheduler: Broadcasting large task binary with size 26.5 MiB


Accuracy: 0.8833878694347548


21/10/31 19:45:18 WARN DAGScheduler: Broadcasting large task binary with size 26.5 MiB


F1: 0.881948880869628


21/10/31 19:45:41 WARN DAGScheduler: Broadcasting large task binary with size 26.5 MiB


Precision: 0.8833979066698382


21/10/31 19:46:04 WARN DAGScheduler: Broadcasting large task binary with size 26.5 MiB


Recall: 0.8833878694347548




In [54]:
TP = predictions[(predictions.prediction == 1) & (predictions.fake == 1)].count()
FN = predictions[(predictions.prediction == 0) & (predictions.fake == 1)].count()
TN = predictions[(predictions.prediction == 0) & (predictions.fake == 0)].count()
FP = predictions[(predictions.prediction == 1) & (predictions.fake == 0)].count()

21/10/31 19:46:26 WARN DAGScheduler: Broadcasting large task binary with size 26.5 MiB
21/10/31 19:46:50 WARN DAGScheduler: Broadcasting large task binary with size 26.5 MiB
21/10/31 19:47:14 WARN DAGScheduler: Broadcasting large task binary with size 26.5 MiB
21/10/31 19:47:39 WARN DAGScheduler: Broadcasting large task binary with size 26.5 MiB


In [55]:
print('Accuracy:', (TP+TN)/(TP+FP+FN+TN))
print('Precision:', (TP)/(TP+FP))
print('Recall:', (TP)/(TP+FN))
print('F1:',(2*TP)/(2*TP+FP+FN))

Accuracy: 0.8833878694347548
Precision: 0.8835605665516014
Recall: 0.7920553292222297
F1: 0.8353094013883345
