In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Create (or reuse) the Spark session for this notebook, requesting 8G of
# memory for both the executors and the driver.
spark = (
    SparkSession.builder
    .appName("Classifier")
    .config('spark.executor.memory', '8G')
    .config("spark.driver.memory", "8G")
    .getOrCreate()
)

In [4]:
# Load the tweet CSV with a header row. Records may span several physical
# lines (multiLine), '"' is used as the escape character, and column types
# are inferred from the data.
df = spark.read.csv(
    "data/gossipcop_tweets_processed.csv",
    header=True,
    sep=',',
    multiLine=True,
    escape="\"",
    inferSchema=True,
)

In [5]:
# Keep only the columns used later in the notebook: the model inputs plus the
# 'fake' label column.
selected_columns = [
    'created_at',
    'text',
    'location',
    'verified',
    'source',
    'followers_count',
    'retweet_count',
    'favorite_count',
    'fake',
]
df = df.select(*selected_columns)

In [6]:
# Convert 'created_at' to a timestamp using the 'yyyy-MM-dd HH:mm:ss' pattern.
created_at_format = 'yyyy-MM-dd HH:mm:ss'
parsed_created_at = to_timestamp(col('created_at'), created_at_format)
df = df.withColumn('created_at', parsed_created_at)

In [7]:
# Replace 'created_at' with its value cast to a long so it can be used as a
# plain numeric feature.
created_at_as_long = df['created_at'].cast("long")
df = df.withColumn('created_at', created_at_as_long)

In [8]:
# Discard rows that have no tweet text.
df = df.na.drop(subset=['text'])

In [9]:
# Fill missing 'source' / 'location' values with the literal string 'null'
# so the tokenizers downstream always receive a string.
df = df.na.fill('null', subset=['source', 'location'])

In [10]:
# Feature stages for the 'text' column, chained through intermediate columns:
# text -> text1 (tokens) -> text2 (stop words removed)
#      -> text3 (term counts) -> text_vc (IDF-weighted vector).
tokenizer_text = Tokenizer(
    inputCol='text',
    outputCol='text1',
)
stopwords_remover_text = StopWordsRemover(
    inputCol='text1',
    outputCol='text2',
)
vectorizer_text = CountVectorizer(
    inputCol='text2',
    outputCol='text3',
)
idf_text = IDF(
    inputCol='text3',
    outputCol='text_vc',
)

In [11]:
# Feature stages for the 'location' column, chained through intermediate columns:
# location -> location1 (tokens) -> location2 (stop words removed)
#          -> location3 (term counts) -> location_vc (IDF-weighted vector).
tokenizer_location = Tokenizer(
    inputCol='location',
    outputCol='location1',
)
stopwords_remover_location = StopWordsRemover(
    inputCol='location1',
    outputCol='location2',
)
vectorizer_location = CountVectorizer(
    inputCol='location2',
    outputCol='location3',
)
idf_location = IDF(
    inputCol='location3',
    outputCol='location_vc',
)

In [12]:
# Feature stages for the 'source' column, chained through intermediate columns:
# source -> source1 (tokens) -> source2 (stop words removed)
#        -> source3 (term counts) -> source_vc (IDF-weighted vector).
tokenizer_source = Tokenizer(
    inputCol='source',
    outputCol='source1',
)
stopwords_remover_source = StopWordsRemover(
    inputCol='source1',
    outputCol='source2',
)
vectorizer_source = CountVectorizer(
    inputCol='source2',
    outputCol='source3',
)
idf_source = IDF(
    inputCol='source3',
    outputCol='source_vc',
)

In [13]:
# 70/30 train/test split with a fixed seed.
trainDF, testDF = df.randomSplit([0.7, 0.3], seed=42)

In [14]:
# Columns assembled into the model's feature vector: the three IDF vectors
# produced by the text/location/source stages plus the raw tweet attributes.
input_cols = [
    'created_at',
    'text_vc',
    'location_vc',
    'verified',
    'source_vc',
    'followers_count',
    'retweet_count',
    'favorite_count',
]

In [15]:
# Combine every column listed in input_cols into a single 'features' vector.
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features",
)

In [16]:
# Gradient-boosted tree classifier trained on the assembled 'features' vector
# to predict the 'fake' label.
# The seed is pinned (to the same value used for the train/test split) so that
# re-running the notebook on a fresh kernel gives the same model; without it
# the estimator falls back to a default seed that is not guaranteed to be
# stable across sessions.
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='fake',
    seed=42,
)

In [17]:
# Assemble the full pipeline: the per-column feature stages (text, then
# source, then location), followed by the vector assembler and the classifier.
text_stages = [tokenizer_text, stopwords_remover_text, vectorizer_text, idf_text]
source_stages = [tokenizer_source, stopwords_remover_source, vectorizer_source, idf_source]
location_stages = [tokenizer_location, stopwords_remover_location, vectorizer_location, idf_location]

pipeline = Pipeline(
    stages=text_stages + source_stages + location_stages + [assembler, gbt]
)

In [18]:
# Fit every pipeline stage (feature extractors and classifier) on the
# training split only.
gbt_model = pipeline.fit(dataset=trainDF)

In [19]:
# Run the fitted pipeline over the held-out test split.
predictions = gbt_model.transform(dataset=testDF)

In [20]:
# Keep only the true label and the predicted label, which is all the
# evaluators need.
# The result is cached because it is evaluated several times below; without
# caching, every evaluate() call would re-run the whole pipeline on the test
# split.
predictions = predictions.select('fake', 'prediction').cache()

In [21]:
# One evaluator per reported metric; all compare the 'fake' label against the
# 'prediction' column and differ only in metricName.
evaluator_accuracy, evaluator_f1, evaluator_precision, evaluator_recall = (
    MulticlassClassificationEvaluator(
        labelCol='fake',
        predictionCol='prediction',
        metricName=metric_name,
    )
    for metric_name in ('accuracy', 'f1', 'weightedPrecision', 'weightedRecall')
)

In [22]:
# Report each metric on the test predictions.
# Note: the 'Precision' and 'Recall' lines come from evaluators configured
# with the weightedPrecision / weightedRecall metrics.
labelled_evaluators = (
    ('Accuracy:', evaluator_accuracy),
    ('F1:', evaluator_f1),
    ('Precision:', evaluator_precision),
    ('Recall:', evaluator_recall),
)
for metric_label, metric_evaluator in labelled_evaluators:
    print(metric_label, metric_evaluator.evaluate(predictions))

Accuracy: 0.8810142075056648
F1: 0.8789758248777901
Precision: 0.8820599341682576
Recall: 0.8810142075056648
