In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline 
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LinearSVC



from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import warnings
warnings.filterwarnings("ignore")

In [5]:
# Initialize Spark Session
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# df = spark.read.csv("./data_new/gossipcop_news.csv",header=True, sep = ',',multiLine=True, escape = "\"",inferSchema =True)
df = spark.read.csv("./data_new/politifact_with_users.csv",header=True, sep = ',',multiLine=True, escape = "\"",inferSchema =True)

In [7]:
df.dtypes

[('tweet_id', 'bigint'),
 ('created_at_tweet', 'int'),
 ('favorite_count', 'int'),
 ('retweet_count', 'int'),
 ('user_id', 'bigint'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('followers_count', 'double'),
 ('source', 'string'),
 ('text', 'string'),
 ('fake', 'double'),
 ('news_id', 'int'),
 ('created_at_user', 'double'),
 ('friends_count', 'double'),
 ('listed_count', 'double'),
 ('favourites_count', 'double'),
 ('geo_enabled', 'boolean'),
 ('statuses_count', 'double'),
 ('has_extended_profile', 'boolean'),
 ('default_profile', 'boolean')]

In [8]:
df.show(10)

+------------------+----------------+--------------+-------------+------------------+--------------------+--------+---------------+--------------------+--------------------+----+-------+---------------+-------------+------------+----------------+-----------+--------------+--------------------+---------------+
|          tweet_id|created_at_tweet|favorite_count|retweet_count|           user_id|            location|verified|followers_count|              source|                text|fake|news_id|created_at_user|friends_count|listed_count|favourites_count|geo_enabled|statuses_count|has_extended_profile|default_profile|
+------------------+----------------+--------------+-------------+------------------+--------------------+--------+---------------+--------------------+--------------------+----+-------+---------------+-------------+------------+----------------+-----------+--------------+--------------------+---------------+
| 53186786549182464|      1301515692|             0|            0| 

In [9]:
df = df.select('text','source','fake', 'favorite_count', 'retweet_count', 'verified', 'followers_count', 'friends_count', 'listed_count', 'favourites_count', 'geo_enabled', 'statuses_count', 'has_extended_profile', 'default_profile')

In [10]:
df = df.dropna(subset=('text'))
df = df.fillna("", subset=('source'))
df = df.fillna(0, subset = ('favorite_count', 'retweet_count', 'verified', 'followers_count', 'friends_count', 'listed_count', 'favourites_count', 'geo_enabled', 'statuses_count', 'has_extended_profile', 'default_profile'))
df = df.dropna()

12566


In [11]:
tokenizer_text = Tokenizer(inputCol='text',outputCol='text1')
stopwords_remover_text = StopWordsRemover(inputCol='text1',outputCol='text2')
vectorizer_text = CountVectorizer(inputCol='text2',outputCol='text3')
idf_text = IDF(inputCol='text3',outputCol='text_vc')

In [12]:
tokenizer_domain = Tokenizer(inputCol='source',outputCol='domain1')
stopwords_remover_domain = StopWordsRemover(inputCol='domain1',outputCol='domain2')
vectorizer_domain = CountVectorizer(inputCol='domain2',outputCol='domain3')
idf_domain = IDF(inputCol='domain3',outputCol='domain_vc')

In [13]:
(trainDF,testDF) = df.randomSplit((0.7,0.3),seed=42)

In [14]:
assembler = VectorAssembler(inputCols=['text_vc','domain_vc', 'favorite_count', 'retweet_count', 'verified', 'followers_count', 'friends_count', 'listed_count', 'favourites_count', 'geo_enabled', 'statuses_count', 'has_extended_profile', 'default_profile'],outputCol="features")

In [15]:
# featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
# labelIndexer = StringIndexer(inputCol="fake", outputCol="indexedLabel")

In [16]:
rf = RandomForestClassifier(labelCol="fake", featuresCol="features", numTrees=10)

In [17]:
# pipeline = Pipeline(stages=[tokenizer_text,stopwords_remover_text,vectorizer_text,idf_text,
#                             tokenizer_source,stopwords_remover_source,vectorizer_source,idf_source,
#                             tokenizer_location,stopwords_remover_location,vectorizer_location,idf_location,
#                             assembler,labelIndexer, featureIndexer, rf])

pipeline = Pipeline(stages=[tokenizer_text,stopwords_remover_text,vectorizer_text,idf_text,
                            tokenizer_domain,stopwords_remover_domain,vectorizer_domain,idf_domain,
                            assembler, rf])

In [18]:
model = pipeline.fit(trainDF)


In [19]:
predictions = model.transform(testDF)

In [20]:
predictions = predictions.select('fake','prediction')

In [21]:
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='fake',predictionCol='prediction',metricName='weightedRecall')

In [22]:
print('Accuracy:', evaluator_accuracy.evaluate(predictions))
print('F1:',evaluator_f1.evaluate(predictions))
print('Precision:',evaluator_precision.evaluate(predictions))
print('Recall:',evaluator_recall.evaluate(predictions))

Accuracy: 0.7236413043478261
F1: 0.6076168354022579
Precision: 0.523656737358223
Recall: 0.7236413043478261
