In [None]:
# findspark.init() is called before any Spark-dependent import below; keep this order.
import findspark
findspark.init()

# Start the Spark session through Spark NLP, asking for GPU support and 12G of memory.
import sparknlp
spark = sparknlp.start(gpu=True, memory="12G")

# The wildcard imports presumably supply the annotator classes used by later cells
# (DocumentAssembler, Tokenizer, LongformerEmbeddings, SentenceEmbeddings,
# ClassifierDLApproach) -- verify which module each comes from before narrowing them.
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
# NOTE(review): `rand` is not used in any cell visible here.
from pyspark.sql.functions import rand 

# Record the library versions alongside the results.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Last expression of the cell: displays the session object as the cell output.
spark

In [2]:
fn = "data/politifact_news.csv"

# Read the CSV (header row, inferred schema, multi-line fields, '"' as the escape
# character), drop rows without text and keep only the columns used downstream.
raw_news = spark.read.csv(fn, header=True, inferSchema=True, multiLine=True, escape='"')
dataset = raw_news.filter("text is not NULL").select("news_id", "text", "fake")

# Seeded 70/30 split, then repartition each side by news_id.
train_split, valid_split = dataset.randomSplit([0.7, 0.3], seed=42)
train_dataset = train_split.repartition(32, "news_id")
valid_dataset = valid_split.repartition(16, "news_id")

print(f"Train Dataset Count: {train_dataset.count()}")
print(f"Valid Dataset Count: {valid_dataset.count()}")

Train Dataset Count: 594
Valid Dataset Count: 207


In [3]:
# Load the Longformer embeddings stored under models/ and configure them as the
# stage that turns `document` + `token` annotations into `embeddings`.
embeddings = (
    LongformerEmbeddings.load("models/longformer_large_4096")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
    .setMaxSentenceLength(4096)
    .setCaseSensitive(True)
    .setBatchSize(128)
)

In [4]:
# Turn the raw `text` column into `document` annotations, with `news_id` as the id column.
document_assembler = (
    DocumentAssembler()
    .setIdCol("news_id")
    .setInputCol("text")
    .setOutputCol("document")
)

# Produce `token` annotations from each document.
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# Pool the `embeddings` annotations into `text_embeddings` using the AVERAGE strategy.
text_emb = (
    SentenceEmbeddings()
    .setPoolingStrategy("AVERAGE")
    .setInputCols(["document", "embeddings"])
    .setOutputCol("text_embeddings")
)

# Trainable classifier: reads `text_embeddings`, learns the `fake` label and writes
# its predictions to `pred`. Training logs are enabled and written under ./logs/.
clf = (
    ClassifierDLApproach()
    .setInputCols(["text_embeddings"])
    .setOutputCol("pred")
    .setLabelColumn("fake")
    .setBatchSize(32)
    .setLr(1e-4)
    .setDropout(0.0)
    .setEnableOutputLogs(True)
    .setOutputLogsPath("./logs/")
)

# Stage order follows the column dependencies: document -> token -> embeddings
# -> text_embeddings -> pred.
pipeline_stages = [document_assembler, tokenizer, embeddings, text_emb, clf]
clf_pipeline = Pipeline(stages=pipeline_stages)

In [5]:
# Fit the whole pipeline on the training split; `model` is the fitted pipeline
# that later cells use to transform the validation split.
model = clf_pipeline.fit(train_dataset)

In [6]:
import os
from datetime import datetime

# Persist the *trained* classifier. `clf` is the unfitted ClassifierDLApproach
# estimator, so saving it would not capture what was learned during fit(); the
# fitted counterpart is the last stage of the PipelineModel held in `model`
# (the classifier is the final entry in the pipeline's stage list).
dataset_tag = os.path.basename(fn).split('_')[0]   # e.g. "politifact" from "politifact_news.csv"
timestamp = datetime.now().strftime('%m%d-%H%M')   # month-day-hour-minute, keeps runs apart
trained_clf = model.stages[-1]
trained_clf.save(f"models/{dataset_tag}-clf-{timestamp}")

In [8]:
# Run the fitted pipeline over the validation split; the output keeps the input
# columns and adds the annotation columns, including `pred`.
results = model.transform(valid_dataset)

In [21]:
# Print up to 200 prediction annotations without truncating the cell contents.
results.select('pred').show(n=200, truncate=False)

+---------------------------------------------------------------------------------+
|pred                                                                             |
+---------------------------------------------------------------------------------+
|[{category, 0, 713, 0, {sentence -> 0, 0 -> 0.84372944, 1 -> 0.15627052}, []}]   |
|[{category, 0, 278, 0, {sentence -> 0, 0 -> 0.8100411, 1 -> 0.18995881}, []}]    |
|[{category, 0, 2701, 0, {sentence -> 0, 0 -> 0.7773093, 1 -> 0.22269067}, []}]   |
|[{category, 0, 18133, 0, {sentence -> 0, 0 -> 0.8696387, 1 -> 0.13036135}, []}]  |
|[{category, 0, 2282, 1, {sentence -> 0, 0 -> 0.47821307, 1 -> 0.5217869}, []}]   |
|[{category, 0, 688, 1, {sentence -> 0, 0 -> 0.33451992, 1 -> 0.6654801}, []}]    |
|[{category, 0, 1193, 1, {sentence -> 0, 0 -> 0.38184172, 1 -> 0.6181583}, []}]   |
|[{category, 0, 2591, 1, {sentence -> 0, 0 -> 0.3750379, 1 -> 0.62496215}, []}]   |
|[{category, 0, 3768, 0, {sentence -> 0, 0 -> 0.91992533, 1 -> 0.08007468}, 

In [22]:
# Inspect the column names and types of the transformed frame.
results.dtypes

[('news_id', 'int'),
 ('text', 'string'),
 ('fake', 'int'),
 ('document',
  'array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>'),
 ('token',
  'array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>'),
 ('embeddings',
  'array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>'),
 ('text_embeddings',
  'array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>'),
 ('pred',
  'array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>')]

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# `pred` is an array of annotation structs whose `result` field is a string, so
# take the first annotation's `result` and cast it to double for the evaluator.
predicted_label = results['pred'].getItem(0).getField('result').cast('double')
results = results.withColumn('pred_', predicted_label)

# Accuracy of `pred_` against the `fake` label column.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='fake',
    predictionCol='pred_',
)
accuracy = evaluator.evaluate(results)

In [27]:
# Display the accuracy measured on the validation split.
accuracy

0.9130434782608695