In [1]:
import os
from datetime import datetime

# findspark must be initialised before any pyspark/sparknlp import below so the
# local Spark installation is importable from this kernel.
import findspark
findspark.init()

# Start the shared Spark session through Spark NLP, requesting the GPU build and
# 12G of memory; every later cell uses this `spark` object.
import sparknlp
spark = sparknlp.start(gpu=True, memory="12G")

# NOTE(review): the two wildcard imports supply DocumentAssembler, Tokenizer,
# LongformerEmbeddings, SentenceEmbeddings and EmbeddingsFinisher used further down.
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline, Transformer
from pyspark.sql.functions import rand, element_at, concat_ws, col, lit, expr
from pyspark.ml.feature import VectorAssembler, VectorSizeHint
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

# Record the library versions next to the results for reproducibility.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Bare expression: the notebook renders the session object as the cell output.
spark

Spark NLP version 3.3.1
Apache Spark version: 3.2.0


In [21]:
fn = "data/politifact_tweets_processed.csv"

# Load the tweet CSV. multiLine/escape allow quoted tweet text to contain line
# breaks and embedded double quotes; rows with no tweet text are dropped.
dataset = spark.read.csv(
    fn, inferSchema=True, header=True, multiLine=True, escape='"'
).filter("text is not NULL")

# Prefix each tweet with the author's location (concat_ws skips a NULL location).
dataset = dataset.withColumn('text', concat_ws(' | ', col('location'), col('text')))

# Impute NULLs in every "*_count" column with that column's median.
# The loop variable is deliberately not called `col`, so the imported
# pyspark.sql.functions.col is never shadowed.
count_cols = [name for name in dataset.schema.names if name.endswith('_count')]
if count_cols:
    aggs = [expr(f'percentile({name}, array(0.5))')[0].alias(name) for name in count_cols]
    # A global aggregate always yields exactly one row, so first() is safe here
    # and avoids a pandas round-trip.
    median_row = dataset.agg(*aggs).first()
    # A column that is entirely NULL has no median; leave it untouched instead of
    # passing None to fillna.
    medians = {
        name: median_row[name] for name in count_cols if median_row[name] is not None
    }
    if medians:
        # One dict-based fillna fills all columns in a single projection.
        dataset = dataset.fillna(medians)

# Missing boolean flags are treated as False.
bool_cols = [name for name, dtype in dataset.dtypes if dtype.startswith('bool')]
dataset = dataset.fillna(False, subset=bool_cols)

# NOTE(review): the split is per row, so tweets sharing a news_id can presumably
# land in both splits, and the medians above are computed before splitting.
# Confirm this is acceptable for the reported validation accuracy.
train_dataset, valid_dataset = dataset.randomSplit([0.7, 0.3], seed=42)
train_dataset = train_dataset.repartition(32, "news_id")
valid_dataset = valid_dataset.repartition(32, "news_id")
print("Train Dataset Count: " + str(train_dataset.count()))
print("Valid Dataset Count: " + str(valid_dataset.count()))

Train Dataset Count: 295652
Valid Dataset Count: 126949


In [3]:
# Display the inferred (column name, Spark type) pairs of the prepared dataset.
dataset.dtypes

[('tweet_id', 'bigint'),
 ('created_at', 'string'),
 ('favorite_count', 'int'),
 ('retweet_count', 'int'),
 ('user_id', 'bigint'),
 ('location', 'string'),
 ('verified', 'boolean'),
 ('followers_count', 'double'),
 ('source', 'string'),
 ('text', 'string'),
 ('fake', 'double'),
 ('news_id', 'int')]

In [4]:
# Longformer embedding stage loaded from a local model directory; it reads the
# "document" and "token" annotations and writes the "embeddings" column.
embeddings = (
    LongformerEmbeddings.load("models/longformer_base_4096")
    .setBatchSize(1024)
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
    .setCaseSensitive(True)
    .setMaxSentenceLength(4096)
)

In [5]:
class DropDim(Transformer, DefaultParamsReadable, DefaultParamsWritable):
    """Pipeline stage that overwrites ``emb_feature`` with its first element.

    The column name is hard-coded. The stage has no params of its own; the
    DefaultParams mixins are inherited so it can be persisted inside a pipeline.
    """

    def __init__(self):
        super().__init__()

    def _transform(self, df):
        """Return ``df`` with ``emb_feature`` replaced by ``element_at(emb_feature, 1)``."""
        # element_at is 1-based, so index 1 selects the first entry of the array.
        first_entry = element_at(df['emb_feature'], 1)
        return df.withColumn('emb_feature', first_entry)

# Wrap the raw "text" column as a Spark NLP document, keyed by news_id.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setIdCol("news_id")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Average-pool the "embeddings" annotations into one embedding per document.
text_emb = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("text_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Convert the pooled embedding annotations into Spark ML vectors.
emb_feature = (
    EmbeddingsFinisher()
    .setInputCols("text_embeddings")
    .setOutputCols("emb_feature")
    .setOutputAsVector(True)
)



# Concatenate the text embedding with the scalar tweet/user columns into "features".
assembler = VectorAssembler(
    inputCols=[
        'emb_feature',
        'favorite_count',
        'retweet_count',
        'verified',
        'followers_count',
    ],
    outputCol="features",
)

# Two-class MLP on the assembled vector.
# NOTE(review): the 772 inputs are presumably a 768-dim embedding plus the 4 scalar
# columns above; confirm against the embedding model's output size.
clf = MultilayerPerceptronClassifier(
    labelCol='fake',
    layers=[772, 128, 2],
    solver='gd',
    stepSize=1e-4,
    maxIter=30,
    blockSize=1024,
    seed=42,
)

# End-to-end pipeline: raw text -> pooled embedding -> feature vector -> classifier.
clf_pipeline = Pipeline(
    stages=[
        document_assembler,  # text -> document
        tokenizer,           # document -> token
        embeddings,          # document + token -> embeddings
        text_emb,            # pooled per-document embedding
        emb_feature,         # annotations -> array of vectors
        DropDim(),           # keep the first vector of that array
        assembler,           # embedding + scalar columns -> features
        clf,                 # MLP classifier on features
    ]
)

In [6]:
# Fit every pipeline stage on the training split.
model = clf_pipeline.fit(train_dataset)

# Save under models/<csv name up to the first dot>-clf-<MMDD-HHMM>.
# The timestamp is taken after fitting has finished.
dataset_stem = os.path.basename(fn).split('.')[0]
saved_at = datetime.now().strftime('%m%d-%H%M')
model.save(f"models/{dataset_stem}-clf-{saved_at}")

In [26]:
# Accuracy of the "prediction" column against the "fake" label.
evaluator = MulticlassClassificationEvaluator(
    labelCol='fake',
    predictionCol='prediction',
    metricName='accuracy',
)

# Score the fitted pipeline on the training split.
train_results = model.transform(train_dataset)
train_acc = evaluator.evaluate(train_results)
print(train_acc)

0.7311431444854174


In [23]:
# Accuracy of the "prediction" column against the "fake" label.
evaluator = MulticlassClassificationEvaluator(
    labelCol='fake',
    predictionCol='prediction',
    metricName='accuracy',
)

# Score the fitted pipeline on the held-out validation split.
valid_results = model.transform(valid_dataset)
valid_acc = evaluator.evaluate(valid_results)
print(valid_acc)

0.7304114250604573
