In [1]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Session object returned by sparknlp.start(); later cells use it for
# spark.createDataFrame(...) and spark.read (i.e. as the Spark session).
spark = sparknlp.start()

In [2]:
# Relative location of the input CSV; it is read with a header row by the
# spark.read call in the final cell.
path = "data/data_set_final.csv"

In [13]:
# Stage 1 - transforms the raw 'text' column into a Document-type column.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Stage 2 - breaks each document down into individual components
# (e.g., words or expressions).
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

########## GLOVE + NerDLModel ###############
# The two string literals below hold earlier GloVe + WikiNER configurations.
# They are bare strings, so none of the code inside them is executed.

'''
embeddings = WordEmbeddingsModel.pretrained('glove_840B_300', lang='xx') \
    .setInputCols(['document', 'token']) \
    .setOutputCol('embeddings')

ner_model = NerDLModel.pretrained("wikiner_840B_300", 'fr') \
    .setInputCols(['document', 'token', 'embeddings']) \
    .setOutputCol('ner')
'''
'''
embeddings = WordEmbeddingsModel.pretrained('glove_6B_300', lang='xx') \
    .setInputCols(['document', 'token']) \
    .setOutputCol('embeddings')

ner_model = NerDLModel.pretrained("wikiner_6B_300", 'fr') \
    .setInputCols(['document', 'token', 'embeddings']) \
    .setOutputCol('ner')
'''

# Stage 3 (active configuration) - pretrained BERT embeddings per token.
embeddings = (
    BertEmbeddings.pretrained('bert_base_cased')
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Stage 4 - pretrained NER tagger that consumes the embeddings above.
ner_model = (
    NerDLModel.pretrained("ner_dl_bert", 'en')
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

# Stage 5 - converts the 'ner' output into the 'entities' column
# (original note on this column name: ner_chunk).
ner_converter = (
    NerConverter()
    .setInputCols(['document', 'token', 'ner'])
    .setOutputCol('entities')
)

# Stages run in list order; each one reads the columns produced before it.
pipeline_stages = [
    document_assembler,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter,
]
nlp_pipeline = Pipeline(stages=pipeline_stages)

# Fit on a single empty-string row to obtain the fitted `pipeline` object
# whose .transform() is used by later cells.
empty_df = spark.createDataFrame([['']])
empty_df = empty_df.toDF('text')
pipeline = nlp_pipeline.fit(empty_df)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]
ner_dl_bert download started this may take some time.
Approximate size to download 15.4 MB
[OK!]


In [14]:
def get_entity(column, df, model=None):
    """Count how many distinct texts have each entity label as their dominant one.

    The selected column is run through the fitted NLP pipeline. For every
    distinct text value the entity label that occurs most often among its
    detected entities is kept, and the function reports how many texts ended
    up with each label.

    Notes:
        * Rows are grouped by the text value itself, so identical texts are
          merged and contribute a single vote.
        * Texts for which the pipeline yields no entities disappear at the
          ``explode`` step and are therefore not counted.
        * When two labels are equally frequent for a text, the alphabetically
          first label wins, which keeps the result reproducible across runs.

    Args:
        column: Name of the column in ``df`` that holds the text to analyse.
        df: Spark DataFrame containing ``column``.
        model: Optional fitted pipeline exposing ``transform``. Defaults to
            the notebook-level ``pipeline`` object.

    Returns:
        Spark DataFrame with columns ``entity`` and ``count``, sorted by
        ``count`` descending (ties ordered by ``entity``).
    """
    # Fall back to the notebook-level fitted pipeline when none is supplied.
    fitted = pipeline if model is None else model

    # The pipeline reads its input from a column named 'text'.
    texts = df.select(column).toDF("text")
    annotated = fitted.transform(texts)

    # One row per detected entity, keeping only the 'entity' field of the
    # annotation metadata.
    labels = annotated.select(
        "text",
        F.explode("entities.metadata").alias("meta"),
    ).select("text", F.col("meta.entity").alias("entity"))

    # Occurrences of each label within each text.
    per_text = labels.groupBy("text", "entity").agg(
        F.count("entity").alias("count")
    )

    # Rank labels inside each text; the secondary key makes ties deterministic.
    by_frequency = Window.partitionBy("text").orderBy(
        F.col("count").desc(), F.col("entity").asc()
    )
    dominant = (
        per_text
        .withColumn("row", F.row_number().over(by_frequency))
        .filter(F.col("row") == 1)
        .drop("row")
    )

    # Number of texts per dominant label, most frequent label first.
    return (
        dominant
        .groupBy("entity")
        .count()
        .orderBy(F.col("count").desc(), F.col("entity").asc())
    )

In [15]:
# Figures noted for the two GloVe-based model pairs.
# NOTE(review): the numbers presumably are result counts from earlier runs and
# "lg"/"md" size labels - confirm.
# wikiner_840B_300 = 14506 (glove = 2.3gb, wikiner = 15mb) = lg
# wikiner_6B_300 = 12150 (450mb) = md

# Load the CSV, treating its first line as the header row.
df = spark.read.csv(path, header=True)

# Dominant-entity counts for the "adresse" column, printed as a table.
entity_counts = get_entity("adresse", df)
entity_counts.show()

KeyboardInterrupt: 