In [89]:
# https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_FR.ipynb
import json
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Start the Spark session through Spark NLP's helper; every cell below uses `spark`.
spark = sparknlp.start()

# Data set

In [32]:
# Input CSV, relative to the notebook's working directory. It is read below with
# a header row, and its `adresse` column supplies the text to annotate.
path = "data/data_set_final.csv"

In [33]:
# Read the address CSV (first line is the header) and keep a 1000-row sample.
# limit() keeps the sampling inside Spark; the previous head(1000) collected the
# rows to the driver only for createDataFrame() to send them back, and it failed
# on an empty file because no schema can be inferred from an empty list.
df = spark.read.option("header", "true").csv(path).limit(1000)

In [34]:
# Keep only the address column, renamed to `text` (the input column the
# DocumentAssembler is configured to read).
data = df.select(F.col("adresse").alias("text"))
data.show(n=5, truncate=False)

+-------------------------------------+
|text                                 |
+-------------------------------------+
|6 Place du Pese Lait                 |
|1 Les Granges                        |
|2 Les Granges                        |
|10 Les Granges                       |
|1082 Chemin de la Ferm de la Montagne|
+-------------------------------------+
only showing top 5 rows



In [35]:
# One-row placeholder frame with a single empty `text` value; it is only used
# as the argument to Pipeline.fit() in the cells below.
empty_df = spark.createDataFrame([("",)], ["text"])

# 1. Document Assembler and Tokenizer

In [6]:
# Wraps the raw `text` column into Spark NLP `document` annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Reads the `document` annotations and writes the `token` column.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

In [36]:
# Stage 1: text -> document -> token.
pipeline_1 = Pipeline(stages=[document_assembler, tokenizer])

# Fit on the placeholder frame, then annotate the real addresses.
model_1 = pipeline_1.fit(empty_df)
result_1 = model_1.transform(data)

In [37]:
# DocumentAssembler references:
#   - https://nlp.johnsnowlabs.com/docs/en/concepts#documentassembler-getting-data-in
#   - https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp.base.DocumentAssembler.html
#   - https://nlp.johnsnowlabs.com/api/python/modules/sparknlp/base.html#DocumentAssembler
# Inspect the `document` annotations of the first rows.
result_1.select("document").show(n=5, truncate=False)

+-------------------------------------------------------------------------------+
|document                                                                       |
+-------------------------------------------------------------------------------+
|[{document, 0, 19, 6 Place du Pese Lait, {sentence -> 0}, []}]                 |
|[{document, 0, 12, 1 Les Granges, {sentence -> 0}, []}]                        |
|[{document, 0, 12, 2 Les Granges, {sentence -> 0}, []}]                        |
|[{document, 0, 13, 10 Les Granges, {sentence -> 0}, []}]                       |
|[{document, 0, 36, 1082 Chemin de la Ferm de la Montagne, {sentence -> 0}, []}]|
+-------------------------------------------------------------------------------+
only showing top 5 rows



In [38]:
# Inspect the `token` annotations of the first rows.
result_1.select(F.col("token")).show(n=5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|token                                                                                                                                                                                                                                                                                                                                                      |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [39]:
# Print the schema of the `document` annotation column.
result_1.select(F.col("document")).printSchema()

root
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)



# 2. Word Embedding

In [10]:
# Name of the pretrained NER model to load. It also decides which word
# embeddings are loaded; the names handled in this notebook are
# "wikiner_840B_300", "wikiner_6B_300" and "wikiner_6B_100".
MODEL_NAME = "wikiner_840B_300"

In [11]:
# Pretrained embeddings paired with each supported NER model name, stored as
# (embeddings name, extra keyword arguments for pretrained()).
EMBEDDINGS_BY_MODEL = {
    "wikiner_840B_300": ("glove_840B_300", {"lang": "xx"}),
    "wikiner_6B_300": ("glove_6B_300", {"lang": "xx"}),
    "wikiner_6B_100": ("glove_100d", {}),  # no lang passed: library default applies
}

# Fail immediately on an unsupported name. Without this check `embeddings`
# would be left undefined (or stale from an earlier run) and the error would
# only surface later, in the pipeline cell.
if MODEL_NAME not in EMBEDDINGS_BY_MODEL:
    raise ValueError(
        "Unsupported MODEL_NAME {!r}; expected one of {}".format(
            MODEL_NAME, sorted(EMBEDDINGS_BY_MODEL)
        )
    )

embeddings_name, pretrained_kwargs = EMBEDDINGS_BY_MODEL[MODEL_NAME]

# Reads `document` and `token`, writes the `embeddings` column.
embeddings = WordEmbeddingsModel.pretrained(embeddings_name, **pretrained_kwargs) \
    .setInputCols(['document', 'token']) \
    .setOutputCol('embeddings')

glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[OK!]


In [40]:
# Stage 2: add word embeddings to the frame that already carries the
# `document` and `token` columns.
pipeline_2 = Pipeline(stages=[embeddings])

model_2 = pipeline_2.fit(empty_df)
result_2 = model_2.transform(result_1)

In [28]:
# Inspect the `embeddings` annotations of the first rows (very wide output).
result_2.select(F.col("embeddings")).show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# 3. NER model

In [14]:
# Pretrained French ('fr') NER model selected by MODEL_NAME; reads `document`,
# `token` and `embeddings`, writes the `ner` column.
ner_model = (
    NerDLModel.pretrained(MODEL_NAME, 'fr')
    .setInputCols(['document', 'token', 'embeddings'])
    .setOutputCol('ner')
)

# Converts the `ner` annotations into `ner_chunk` entries.
ner_converter = (
    NerConverter()
    .setInputCols(['document', 'token', 'ner'])
    .setOutputCol('ner_chunk')
)

wikiner_840B_300 download started this may take some time.
Approximate size to download 14.4 MB
[OK!]


In [41]:
# Stage 3: NER tagging followed by chunk conversion, applied to the frame that
# already carries the embeddings.
nlp_pipeline = Pipeline(stages=[ner_model, ner_converter])

model = nlp_pipeline.fit(empty_df)
result = model.transform(result_2)

In [58]:
# Show each address next to its `ner` annotations.
result.select(F.col("text"), F.col("ner")).show(n=5, truncate=False)

+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                 |ner                                                                                                                                                                                                                                                                                                                                                                                                              

In [43]:
# Show the `ner_chunk` annotations produced by the converter.
result.select(F.col("ner_chunk")).show(n=5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------+
|ner_chunk                                                                                                                                         |
+--------------------------------------------------------------------------------------------------------------------------------------------------+
|[{chunk, 2, 6, Place, {entity -> LOC, sentence -> 0, chunk -> 0}, []}, {chunk, 11, 19, Pese Lait, {entity -> LOC, sentence -> 0, chunk -> 1}, []}]|
|[{chunk, 6, 12, Granges, {entity -> LOC, sentence -> 0, chunk -> 0}, []}]                                                                         |
|[{chunk, 6, 12, Granges, {entity -> LOC, sentence -> 0, chunk -> 0}, []}]                                                                         |
|[{chunk, 7, 13, Granges, {entity -> LOC, sentence -> 0, chunk -> 0}, []}]                                

In [81]:
# Show only the chunk texts and their metadata maps (entity, sentence, chunk).
result.select(
    "ner_chunk.result",
    "ner_chunk.metadata",
).show(n=5, truncate=False)

+----------------------------------+----------------------------------------------------------------------------------------+
|result                            |metadata                                                                                |
+----------------------------------+----------------------------------------------------------------------------------------+
|[Place, Pese Lait]                |[{entity -> LOC, sentence -> 0, chunk -> 0}, {entity -> LOC, sentence -> 0, chunk -> 1}]|
|[Granges]                         |[{entity -> LOC, sentence -> 0, chunk -> 0}]                                            |
|[Granges]                         |[{entity -> LOC, sentence -> 0, chunk -> 0}]                                            |
|[Granges]                         |[{entity -> LOC, sentence -> 0, chunk -> 0}]                                            |
|[Chemin de la Ferm de la Montagne]|[{entity -> LOC, sentence -> 0, chunk -> 0}]                                      

In [83]:
# One output row per detected chunk: explode() emits a row for every metadata
# map (in a column named `col`) and repeats the address and the full list of
# chunk texts alongside it. Rows whose chunk list is empty produce no output.
test = result.select(
    "text",
    F.col("ner_chunk.result").alias("result"),
    F.explode(F.col("ner_chunk.metadata")),
)
test.show(n=20, truncate=False)

+-------------------------------------+---------------------------------------+-------------------------------------------+
|text                                 |result                                 |col                                        |
+-------------------------------------+---------------------------------------+-------------------------------------------+
|6 Place du Pese Lait                 |[Place, Pese Lait]                     |{entity -> LOC, sentence -> 0, chunk -> 0} |
|6 Place du Pese Lait                 |[Place, Pese Lait]                     |{entity -> LOC, sentence -> 0, chunk -> 1} |
|1 Les Granges                        |[Granges]                              |{entity -> LOC, sentence -> 0, chunk -> 0} |
|2 Les Granges                        |[Granges]                              |{entity -> LOC, sentence -> 0, chunk -> 0} |
|10 Les Granges                       |[Granges]                              |{entity -> LOC, sentence -> 0, chunk -> 0} |
|1082 Ch

In [86]:
# Pull the `entity` value out of each exploded metadata map.
test_2 = test.select(
    "text",
    "result",
    "col.entity",
)
test_2.orderBy("text").show(n=20, truncate=False)

+----------------------------+-------------------------+------+
|text                        |result                   |entity|
+----------------------------+-------------------------+------+
|1 Impasse des Rosiers       |[1 Impasse des Rosiers]  |MISC  |
|1 Le Clos des Charmilles    |[Clos des Charmilles]    |LOC   |
|1 Les Granges               |[Granges]                |LOC   |
|1 Lieu Dit les Bruyeres     |[1 Lieu Dit les Bruyeres]|MISC  |
|1 Lot les Jardins D Agathe  |[1 Lot, Jardins D Agathe]|LOC   |
|1 Lot les Jardins D Agathe  |[1 Lot, Jardins D Agathe]|LOC   |
|1 Lotissement les Amandiers |[Amandiers]              |MISC  |
|1 Place de la Mairie        |[1 Place, Mairie]        |LOC   |
|1 Place de la Mairie        |[1 Place, Mairie]        |LOC   |
|1 Route de Rance            |[1 Route de Rance]       |LOC   |
|1 Route du Mont             |[1 Route du Mont]        |LOC   |
|1 Rue Dangeville            |[1 Rue Dangeville]       |LOC   |
|10 Chemin Louis Lumiere     |[10 Chemin

In [87]:
# Count how many chunks of each entity type were found per address text.
# NOTE(review): grouping is by the address string, so identical addresses on
# different input rows are merged into one group - confirm this is intended.
test_3 = (
    test_2
    .groupBy("text", "entity")
    .agg(F.count(F.col("entity")).alias("count"))
    .orderBy("text")
)
test_3.show(n=20, truncate=False)

+----------------------------+------+-----+
|text                        |entity|count|
+----------------------------+------+-----+
|1 Impasse des Rosiers       |MISC  |1    |
|1 Le Clos des Charmilles    |LOC   |1    |
|1 Les Granges               |LOC   |1    |
|1 Lieu Dit les Bruyeres     |MISC  |1    |
|1 Lot les Jardins D Agathe  |LOC   |2    |
|1 Lotissement les Amandiers |MISC  |1    |
|1 Place de la Mairie        |LOC   |2    |
|1 Route de Rance            |LOC   |1    |
|1 Route du Mont             |LOC   |1    |
|1 Rue Dangeville            |LOC   |1    |
|10 Chemin Louis Lumiere     |MISC  |1    |
|10 Chemin des Vermots       |MISC  |1    |
|10 Impasse Moreau           |MISC  |1    |
|10 Le Bourg                 |LOC   |1    |
|10 Les Granges              |LOC   |1    |
|10 Lotissement les Amandiers|MISC  |1    |
|10 Ruelle des Vignerons     |LOC   |1    |
|100 Impasse Bellevue        |LOC   |1    |
|100 Rue Dangeville          |PER   |2    |
|1000 Le Buisson             |MI

In [91]:
# Rank the entity types of each address by how often they occur. The entity
# name is a secondary sort key so that ties are resolved the same way on every
# run; ordering by `count` alone lets row_number() pick an arbitrary entity
# when two types have equal counts.
w2 = Window.partitionBy("text").orderBy(F.col("count").desc(), F.col("entity").asc())

# Keep the top-ranked entity per address, then count addresses per entity type.
# The earlier sort by `text` before the aggregation was dropped: grouping
# discards row order, so it was wasted work.
test_4 = (
    test_3
    .withColumn("row", F.row_number().over(w2))
    .filter(F.col("row") == 1)
    .drop("row")
    .groupby("entity")
    .count()
    .orderBy(F.col("count").desc())
)

test_4.show(n = 20, truncate = False)

+------+-----+
|entity|count|
+------+-----+
|LOC   |620  |
|MISC  |322  |
|PER   |35   |
|ORG   |1    |
+------+-----+

