In [1]:
from pyspark.sql import SparkSession
#from pyspark.sql.types import IntegerType, ArrayType, StringType, DoubleType, MapType
import pyspark.sql.functions as f
from pyspark.sql.window import Window

from sparknlp.base import DocumentAssembler, Pipeline, LightPipeline
from sparknlp.annotator import (
    SentenceDetector,
    Tokenizer,
    YakeKeywordExtraction
)

In [2]:
# Build (or reuse) the Spark session for the Spark NLP YAKE run.
# The builder chain is wrapped in parentheses instead of being joined with
# backslashes, so a stray blank line can no longer cut the chain in half.
spark = (
    SparkSession.builder
    .appName("SparkNLP-Yake")
    .config("spark.executor.instances", "2")
    .config("spark.executor.memory", "25G")
    # Key was previously misspelled ("spark.exceturor.memovyOverhead"), which
    # Spark accepts silently as an unknown property, so the overhead was never set.
    .config("spark.executor.memoryOverhead", "5G")
    .config("spark.executor.cores", "20")
    # NOTE(review): "spark.executor.cores.max" is not a documented Spark
    # property ("spark.cores.max" is the documented application-wide cap).
    # Left unchanged here; confirm the intent before renaming it.
    .config("spark.executor.cores.max", "20")
    .config("spark.driver.memory", "30G")
    .config("spark.kryoserializer.buffer.max", "2000M")
    # Resolves the Spark NLP 5.1.4 artifact (Scala 2.12 build) at session start.
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.4")
    .getOrCreate()
)

# Enable Arrow for pandas <-> Spark conversions, and allow falling back to the
# non-Arrow path when Arrow cannot be used.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jpietraszek/.ivy2/cache
The jars for the packages stored in: /home/jpietraszek/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1c7f36c1-8d18-422f-8521-e51dd6744d0c;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.4 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in central
	found com.google.gu

In [3]:
%%time 
# Read the arXiv metadata snapshot as a DataFrame; no schema is supplied here,
# so the reader derives it from the JSON input.
arxiv_snapshot_path = '/user/jpietraszek/arxiv-metadata-oai-snapshot.json'
df = spark.read.json(arxiv_snapshot_path)

                                                                                

CPU times: user 9.99 ms, sys: 3.57 ms, total: 13.6 ms
Wall time: 9.41 s


In [7]:
%%time 

# Number of best-scoring keywords kept for each category.
TOP_KEYWORDS_PER_CATEGORY = 10

# Project to the columns the pipeline actually uses *before* dropping nulls.
# Dropping nulls on the full frame first would discard every row that has a
# null in any unrelated column, even though those columns are never read here.
df = (
    df.select('id', 'abstract', 'categories')
      .na.drop(subset=['id', 'abstract', 'categories'])
      # Keep only rows whose `categories` string contains no space
      # (presumably papers with a single category -- confirm against the data).
      .filter(~f.col("categories").like("% %"))
)

# Spark NLP pipeline: abstract -> document -> sentences -> tokens -> YAKE keywords.
yake_pipeline = Pipeline(stages=[
    DocumentAssembler().setInputCol("abstract").setOutputCol("document"),
    SentenceDetector().setInputCols("document").setOutputCol("sentence"),
    Tokenizer()
        .setInputCols("sentence")
        .setOutputCol("token")
        .setContextChars(["(", ")", "?", "!", ".", ","]),
    # Only 2- and 3-gram keyphrases are extracted.
    YakeKeywordExtraction()
        .setInputCols("token")
        .setOutputCol("keywords")
        .setMinNGrams(2)
        .setMaxNGrams(3),
])

# Fit the pipeline to obtain the model, then annotate the same DataFrame.
yake_model = yake_pipeline.fit(df)
result_df = yake_model.transform(df)

# One row per extracted keyword: its category, the keyword text, and the YAKE
# score read from the annotation metadata and cast to float.
exploded_df = (
    result_df
    .select("categories", f.explode("keywords").alias("keyword"))
    .select(
        "categories",
        "keyword.result",
        f.col("keyword.metadata.score").cast("float").alias("score"),
    )
)

# Rank keywords inside each category by ascending score (Spark NLP documents
# lower YAKE scores as more relevant). The keyword text is a secondary sort key
# so that equal scores get a deterministic rank and the cut-off at the top-N
# boundary is reproducible between runs.
keyword_rank_window = (
    Window.partitionBy("categories")
          .orderBy(f.asc("score"), f.asc("result"))
)

# Exact duplicate (category, keyword, score) rows are removed before ranking.
# No aggregation is done: the same keyword can still appear more than once in
# a category if it was extracted with different scores.
top_ten_keywords_per_category = (
    exploded_df
    .drop_duplicates()
    .withColumn("rank", f.row_number().over(keyword_rank_window))
    .filter(f.col("rank") <= TOP_KEYWORDS_PER_CATEGORY)
    .drop("rank")
)

# Final column order for the export. This is a plain projection; it does not
# deduplicate anything further.
distinct_keywords = top_ten_keywords_per_category.select("categories", "result", "score")


CPU times: user 45.1 ms, sys: 16.2 ms, total: 61.2 ms
Wall time: 325 ms


In [8]:
%%time
# Persist the per-category keyword table as Parquet, replacing any previous
# output at the same location. This write is the action that triggers the
# transformations defined on `distinct_keywords`.
keywords_output_path = "/user/jpietraszek/sparkYake_full_23ngram_wo_agg.parquet"
keywords_writer = distinct_keywords.write.format('parquet').mode("overwrite")
keywords_writer.save(keywords_output_path)



CPU times: user 51.1 ms, sys: 22.1 ms, total: 73.2 ms
Wall time: 41.3 s


                                                                                