In [7]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline 
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LinearSVC



from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import warnings
warnings.filterwarnings("ignore")

In [47]:
# Initialize Spark Session
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.csv("./data_new/gossipcop_tweets_processed.csv",header=True, sep = ',',multiLine=True, escape = "\"",inferSchema =True)
# df = df.sample(False, 0.1, seed=0)

In [49]:
df.dtypes

[('tweet_id', 'bigint'),
 ('created_at', 'string'),
 ('favorite_count', 'int'),
 ('retweet_count', 'int'),
 ('user_id', 'bigint'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('followers_count', 'double'),
 ('source', 'string'),
 ('text', 'string'),
 ('fake', 'double'),
 ('news_id', 'bigint')]

In [50]:
df = df.select('created_at','text','location','verified','source', 'followers_count','retweet_count','favorite_count','fake')

In [51]:
df = df.withColumn('created_at', to_timestamp(col('created_at'), 'yyyy-MM-dd HH:mm:ss'))

In [52]:
df = df.withColumn('created_at', df.created_at.cast("long"))
df = df.withColumn('retweet_count', df.retweet_count.cast("long"))
df = df.withColumn('favorite_count', df.favorite_count.cast("long"))
df = df.withColumn('fake', df.fake.cast("int"))

In [53]:
df.dtypes

[('created_at', 'bigint'),
 ('text', 'string'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('source', 'string'),
 ('followers_count', 'double'),
 ('retweet_count', 'bigint'),
 ('favorite_count', 'bigint'),
 ('fake', 'int')]

In [54]:
df = df.dropna(subset=('text'))
df = df.fillna('null', subset = ('source', 'location'))
df = df.fillna(0, subset = ('created_at', 'followers_count','retweet_count','favorite_count'))
df = df.fillna(False, subset = ('verified'))

In [55]:
tokenizer_text = Tokenizer(inputCol='text',outputCol='text1')
stopwords_remover_text = StopWordsRemover(inputCol='text1',outputCol='text2')
vectorizer_text = CountVectorizer(inputCol='text2',outputCol='text3')
idf_text = IDF(inputCol='text3',outputCol='text_vc')

In [56]:
tokenizer_location = Tokenizer(inputCol='location',outputCol='location1')
stopwords_remover_location = StopWordsRemover(inputCol='location1',outputCol='location2')
vectorizer_location = CountVectorizer(inputCol='location2',outputCol='location3')
idf_location = IDF(inputCol='location3',outputCol='location_vc')

In [57]:
tokenizer_source = Tokenizer(inputCol='source',outputCol='source1')
stopwords_remover_source = StopWordsRemover(inputCol='source1',outputCol='source2')
vectorizer_source = CountVectorizer(inputCol='source2',outputCol='source3')
idf_source = IDF(inputCol='source3',outputCol='source_vc')

In [58]:
(trainDF,testDF) = df.randomSplit((0.7,0.3),seed=42)

In [59]:
input_cols = ['created_at','text_vc','location_vc','verified','source_vc', 'followers_count','retweet_count','favorite_count']

In [60]:
assembler = VectorAssembler(inputCols=input_cols,outputCol="features")

In [61]:
# featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
# labelIndexer = StringIndexer(inputCol="fake", outputCol="indexedLabel")

In [62]:
rf = RandomForestClassifier(labelCol="fake", featuresCol="features", numTrees=10)

In [63]:
# pipeline = Pipeline(stages=[tokenizer_text,stopwords_remover_text,vectorizer_text,idf_text,
#                             tokenizer_source,stopwords_remover_source,vectorizer_source,idf_source,
#                             tokenizer_location,stopwords_remover_location,vectorizer_location,idf_location,
#                             assembler,labelIndexer, featureIndexer, rf])

pipeline = Pipeline(stages=[tokenizer_text,stopwords_remover_text,vectorizer_text,idf_text,
                            tokenizer_source,stopwords_remover_source,vectorizer_source,idf_source,
                            tokenizer_location,stopwords_remover_location,vectorizer_location,idf_location,
                            assembler, rf])

In [64]:
model = pipeline.fit(trainDF)


In [65]:
predictions = model.transform(testDF)

In [66]:
predictions = predictions.select('fake','prediction')

In [67]:
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedRecall')

In [68]:
print('Accuracy:', evaluator_accuracy.evaluate(predictions))
print('F1:',evaluator_f1.evaluate(predictions))
print('Precision:',evaluator_precision.evaluate(predictions))
print('Recall:',evaluator_recall.evaluate(predictions))

Accuracy: 0.6255892994189234
F1: 0.4816746953556203
Precision: 0.765795729868441
Recall: 0.6255892994189234
