In [14]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql.types import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

spark = sparknlp.start()

In [10]:
pipeline = PretrainedPipeline.from_disk('entity_recognizer_lg_fr')

In [1]:
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [3]:
len(fr_stop)

507

In [52]:
fr_stop

{'a',
 'abord',
 'afin',
 'ah',
 'ai',
 'aie',
 'ainsi',
 'ait',
 'allaient',
 'allons',
 'alors',
 'anterieur',
 'anterieure',
 'anterieures',
 'antérieur',
 'antérieure',
 'antérieures',
 'apres',
 'après',
 'as',
 'assez',
 'attendu',
 'au',
 'aupres',
 'auquel',
 'aura',
 'auraient',
 'aurait',
 'auront',
 'aussi',
 'autre',
 'autrement',
 'autres',
 'autrui',
 'aux',
 'auxquelles',
 'auxquels',
 'avaient',
 'avais',
 'avait',
 'avant',
 'avec',
 'avoir',
 'avons',
 'ayant',
 'bas',
 'basee',
 'bat',
 "c'",
 'car',
 'ce',
 'ceci',
 'cela',
 'celle',
 'celle-ci',
 'celle-la',
 'celle-là',
 'celles',
 'celles-ci',
 'celles-la',
 'celles-là',
 'celui',
 'celui-ci',
 'celui-la',
 'celui-là',
 'cent',
 'cependant',
 'certain',
 'certaine',
 'certaines',
 'certains',
 'certes',
 'ces',
 'cet',
 'cette',
 'ceux',
 'ceux-ci',
 'ceux-là',
 'chacun',
 'chacune',
 'chaque',
 'chez',
 'ci',
 'cinq',
 'cinquantaine',
 'cinquante',
 'cinquantième',
 'cinquième',
 'combien',
 'comme',
 'comment',

In [46]:
list_device = ["Iphone", "Iphone de Julien", "Iphone X", "Iphone De Nicolas", "Iphone's Antoine", \
                    "Apple Watch De Pierre", "Iphone de Romain", "Apple Watch de Pierre", "Iphone De Damien", "Iphone", \
                    "Doom 8", "Iphone Luc", "Iphone old", "Macbook Pro de Alex", "Apple Watch's Simon", \
                    "Macbook Pro d'Alex", "Iphone De Clément", "Iphone De Gru", "Machin", "Iphone 6 Fafa"]

In [47]:
df_device = spark.createDataFrame(data=[[t] for t in list_device], schema=['text'])

In [48]:
def update_text(text):
    result = []
    for t in text.lower().split(" "):
        if t not in fr_stop:
            result.append(t.capitalize())
        else:
            result.append(t)
    return " ".join(result)

my_udf = F.udf(lambda x: update_text(x), StringType())

In [49]:
df_device = df_device.withColumn("new_text", my_udf(df_device.text)).select("text", "new_text")
df_device.show(n = 50, truncate = False)

+---------------------+---------------------+
|text                 |new_text             |
+---------------------+---------------------+
|Iphone               |Iphone               |
|Iphone de Julien     |Iphone de Julien     |
|Iphone X             |Iphone X             |
|Iphone De Nicolas    |Iphone de Nicolas    |
|Iphone's Antoine     |Iphone's Antoine     |
|Apple Watch De Pierre|Apple Watch de Pierre|
|Iphone de Romain     |Iphone de Romain     |
|Apple Watch de Pierre|Apple Watch de Pierre|
|Iphone De Damien     |Iphone de Damien     |
|Iphone               |Iphone               |
|Doom 8               |Doom 8               |
|Iphone Luc           |Iphone Luc           |
|Iphone old           |Iphone Old           |
|Macbook Pro de Alex  |Macbook Pro de Alex  |
|Apple Watch's Simon  |Apple Watch's Simon  |
|Macbook Pro d'Alex   |Macbook Pro D'alex   |
|Iphone De Clément    |Iphone de Clément    |
|Iphone De Gru        |Iphone de Gru        |
|Machin               |Machin     

In [50]:
annotations = pipeline.transform(df_device.select("text"))
result = annotations.select(F.col("text"), F.col("ner.result").alias("ner"), F.col("entities.result").alias("text ner_chunk"), F.col("entities.metadata").alias("ner_chunk"))
result.show(n = 50, truncate = False)

+---------------------+----------------------------+-----------------------+-----------------------------------------------------------------------------------------+
|text                 |ner                         |text ner_chunk         |ner_chunk                                                                                |
+---------------------+----------------------------+-----------------------+-----------------------------------------------------------------------------------------+
|Iphone               |[I-MISC]                    |[Iphone]               |[{entity -> MISC, sentence -> 0, chunk -> 0}]                                            |
|Iphone de Julien     |[I-MISC, O, I-PER]          |[Iphone, Julien]       |[{entity -> MISC, sentence -> 0, chunk -> 0}, {entity -> PER, sentence -> 0, chunk -> 1}]|
|Iphone X             |[I-MISC, I-MISC]            |[Iphone X]             |[{entity -> MISC, sentence -> 0, chunk -> 0}]                                            

In [51]:
annotations = pipeline.transform(df_device.select("new_text").toDF("text"))
result = annotations.select(F.col("text"), F.col("ner.result").alias("ner"), F.col("entities.result").alias("text ner_chunk"), F.col("entities.metadata").alias("ner_chunk"))
result.show(n = 50, truncate = False)

+---------------------+--------------------------+---------------------+-----------------------------------------------------------------------------------------+
|text                 |ner                       |text ner_chunk       |ner_chunk                                                                                |
+---------------------+--------------------------+---------------------+-----------------------------------------------------------------------------------------+
|Iphone               |[I-MISC]                  |[Iphone]             |[{entity -> MISC, sentence -> 0, chunk -> 0}]                                            |
|Iphone de Julien     |[I-MISC, O, I-PER]        |[Iphone, Julien]     |[{entity -> MISC, sentence -> 0, chunk -> 0}, {entity -> PER, sentence -> 0, chunk -> 1}]|
|Iphone X             |[I-MISC, I-MISC]          |[Iphone X]           |[{entity -> MISC, sentence -> 0, chunk -> 0}]                                            |
|Iphone de Nicolas    