In [27]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql.types import StringType
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

# Spark session obtained through Spark NLP's start() helper; the cells below use
# it to read the CSV file and to build Spark DataFrames from pandas frames.
spark = sparknlp.start()

In [17]:
# Shared NER pipeline used by every pipeline.transform(...) call in this notebook.
# NOTE(review): the argument is presumably a directory relative to the working
# directory that holds a previously downloaded pipeline — confirm.
pipeline = PretrainedPipeline.from_disk('entity_recognizer_lg_fr') 

In [18]:
# Disabled cell: everything below sits inside a triple-quoted string, so none of
# it is executed. The quoted code would build list_device_name by pairing every
# entry of list_device with every entry of list_nom, joined by " ", " De ",
# " de " and " d'".
'''
list_device = ["iPhone", "Apple watch", "macbook pro", "iPhone's"]
list_nom = ["Pierre", "Romain", "Pierre", "Dung", "Luc", "Simon", "Gru", "Nicolas"]

list_device_name = []
for device in list_device:
        for nom in list_nom:
            list_device_name.append(device + " " + nom)
            list_device_name.append(device + " De " + nom)
            list_device_name.append(device + " de " + nom)
            list_device_name.append(device + " d'" + nom)
'''

In [44]:
# Hand-written device labels used as a small NER sample: bare product names,
# product + owner name with different joiners ("de", "De", "d'", "'s"), and a
# few labels that are not Apple products at all.
list_device_name = [
    "Iphone",
    "Iphone de Julien",
    "Iphone X",
    "Iphone De Nicolas",
    "Iphone's Antoine",
    "Apple Watch De Pierre",
    "Iphone de Romain",
    "Apple Watch de Pierre",
    "Iphone De Damien",
    "Iphone",
    "Doom 8",
    "Iphone Luc",
    "Iphone old",
    "Macbook Pro de Alex",
    "Apple Watch's Simon",
    "Macbook Pro d'Alex",
    "Iphone De Clément",
    "Iphone De Gru",
    "Machin",
    "Iphone 6 Fafa",
]

In [20]:
# Read the CSV data set, treating its first line as the column header.
path = "data/data_set_final.csv"
df = spark.read.csv(path, header=True)

In [45]:
# Wrap the device labels in a one-column pandas frame, then hand it to Spark.
pd_df = pd.DataFrame({"device": list_device_name})
df_device = spark.createDataFrame(pd_df)

In [36]:
def is_unstructured(df, col):
    """Heuristically decide whether a column looks like free text.

    A column qualifies when, averaged over its rows, it contains at least one
    ASCII letter (accented letters are not matched by the ``[^a-zA-Z]``
    pattern and therefore not counted) and more than one space-separated
    token.

    Args:
        df: Spark DataFrame holding the column.
        col: Name of the column to inspect.

    Returns:
        bool: True when both averages pass their threshold, False otherwise,
        including when an average cannot be computed.
    """
    letters_only = F.regexp_replace(F.col(col), '[^a-zA-Z]', "")
    token_count = F.size(F.split(F.col(col), " "))

    # Both averages come from a single aggregation, i.e. one Spark job instead
    # of the two separate agg(...).collect() round trips.
    stats = df.agg(
        F.mean(F.length(letters_only)).alias("letter_avg"),
        F.mean(token_count).alias("word_avg"),
    ).first()
    letter_avg = stats["letter_avg"]
    word_avg = stats["word_avg"]

    # The mean is null when there is no non-null input (empty frame, all-null
    # column); comparing None with a float would raise TypeError.
    if letter_avg is None or word_avg is None:
        return False
    return letter_avg >= 1.0 and word_avg > 1.0

In [24]:
def get_entity(df, column):
    """Summarise which entity label dominates each row of ``column``.

    The module-level ``pipeline`` annotates the column. For every input row the
    label occurring most often among its entity chunks is kept, and rows are
    then counted per label. Rows whose dominant label is ``MISC`` and rows
    without any entity chunk are reported together under ``OTHER``.

    Args:
        df: Spark DataFrame holding the column.
        column: Name of the column to annotate.

    Returns:
        pandas.DataFrame with columns ``predict`` and ``count``, sorted by
        ``count`` in descending order; it never contains a ``MISC`` row.
    """
    data = df.select(column).toDF("text")
    annotations = pipeline.transform(data)

    # One row per (input row, entity chunk). explode() drops rows whose chunk
    # array is empty; they are accounted for by the OTHER row further down.
    per_chunk = (
        annotations
        .withColumn("id", F.monotonically_increasing_id())
        .select("id", "entities")
        .select("id", F.explode("entities.metadata"))
        .select("id", F.col("col.entity").alias("predict"))
    )
    label_counts = per_chunk.groupby("id", "predict").agg(F.count("predict").alias("count"))

    # The secondary sort key makes the winner deterministic when two labels
    # are tied for a row; ordering by count alone leaves the pick arbitrary.
    w = Window.partitionBy("id").orderBy(F.col("count").desc(), F.col("predict").asc())

    dominant = (
        label_counts
        .withColumn("row", F.row_number().over(w))
        .filter(F.col("row") == 1)
        .drop("row", "count")
        .groupby("predict")
        .count()
    )
    result = dominant.toPandas()

    # Everything that did not end up with a non-MISC label counts as OTHER.
    labelled_rows = int(result.loc[result.predict != 'MISC', 'count'].sum())
    other_row = {'predict': 'OTHER', 'count': annotations.count() - labelled_rows}

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    result = pd.concat([result, pd.DataFrame([other_row])], ignore_index=True)
    result = result[result.predict != 'MISC'].sort_values(by=['count'], ascending=False)

    return result

In [28]:
def update_text(text, stop_words=None):
    """Normalise the casing of a space-separated label.

    The text is lower-cased and split on single spaces; every token that is
    not a stop word gets its first character upper-cased, stop words stay
    lower-case. Tokens are re-joined with single spaces, so consecutive spaces
    in the input are preserved.

    Args:
        text: Input string, or None (a null cell when called through a UDF).
        stop_words: Optional container of lower-case stop words. Defaults to
            ``fr_stop``, the spaCy French stop-word set imported by this file.

    Returns:
        The re-cased string, or None when ``text`` is None.
    """
    # A null cell reaches a Python UDF as None; pass it through instead of
    # failing on None.lower().
    if text is None:
        return None
    if stop_words is None:
        stop_words = fr_stop
    return " ".join(
        token if token in stop_words else token.capitalize()
        for token in text.lower().split(" ")
    )

# Spark UDF that applies update_text to one column value and yields a string.
my_udf = F.udf(lambda value: update_text(value), returnType=StringType())

In [37]:
def run(df):
    """Print the entity-label summary of every column of ``df``.

    Columns that is_unstructured() flags are first re-cased with my_udf; the
    summary printed for each column is whatever get_entity() returns for it.
    Nothing is returned.
    """
    for name in df.columns:
        needs_recasing = is_unstructured(df, name)
        if needs_recasing:
            df = df.withColumn(name, my_udf(F.col(name)))
        banner = "############## " + name + " ##############"
        print(banner)
        summary = get_entity(df, name)
        print(summary)

In [38]:
# Print the per-column entity summary for the frame read from the CSV file.
run(df)

############## type_de_carte ##############
  predict  count
0     ORG  29899
2   OTHER  10097
############## numero_de_carte ##############
  predict  count
1   OTHER  39996
############## iban ##############
  predict  count
2     LOC  23626
4   OTHER  15295
0     ORG    757
1     PER    318
############## prenom ##############
  predict  count
1     PER  29426
2     LOC   5325
4   OTHER   4419
0     ORG    826
############## nom ##############
  predict  count
1     PER  29587
2     LOC   6288
4   OTHER   3337
0     ORG    784
############## tel ##############
  predict  count
2   OTHER  39990
0     LOC      6
############## email ##############
  predict  count
4   OTHER  25153
1     PER   8103
2     LOC   3814
0     ORG   2926
############## date ##############
  predict  count
0   OTHER  39996
############## adresse ##############
  predict  count
2     LOC  31256
4   OTHER   6232
1     PER   1935
0     ORG    573


In [46]:
# Add a re-cased copy of the device label next to the raw one.
df_device = df_device.withColumn(
    "device_updated",
    my_udf(df_device["device"]),
)

In [47]:
# Annotate the raw device labels and show, per row, the token-level tags next
# to the recognised chunks and the chunk metadata.
annotations = pipeline.transform(df_device.select("device").toDF("text"))
result = annotations.select(
    "text",
    F.col("ner.result").alias("ner"),
    F.col("entities.result").alias("text ner_chunk"),
    F.col("entities.metadata").alias("ner_chunk"),
)
result.show(n=200, truncate=False)

+---------------------+----------------------------+-----------------------+-----------------------------------------------------------------------------------------+
|text                 |ner                         |text ner_chunk         |ner_chunk                                                                                |
+---------------------+----------------------------+-----------------------+-----------------------------------------------------------------------------------------+
|Iphone               |[I-MISC]                    |[Iphone]               |[{entity -> MISC, sentence -> 0, chunk -> 0}]                                            |
|Iphone de Julien     |[I-MISC, O, I-PER]          |[Iphone, Julien]       |[{entity -> MISC, sentence -> 0, chunk -> 0}, {entity -> PER, sentence -> 0, chunk -> 1}]|
|Iphone X             |[I-MISC, I-MISC]            |[Iphone X]             |[{entity -> MISC, sentence -> 0, chunk -> 0}]                                            

In [48]:
# Same display as for the raw labels, but fed with the re-cased
# "device_updated" column.
annotations = pipeline.transform(df_device.select("device_updated").toDF("text"))
result = annotations.select(
    "text",
    F.col("ner.result").alias("ner"),
    F.col("entities.result").alias("text ner_chunk"),
    F.col("entities.metadata").alias("ner_chunk"),
)
result.show(n=200, truncate=False)

+---------------------+--------------------------+---------------------+-----------------------------------------------------------------------------------------+
|text                 |ner                       |text ner_chunk       |ner_chunk                                                                                |
+---------------------+--------------------------+---------------------+-----------------------------------------------------------------------------------------+
|Iphone               |[I-MISC]                  |[Iphone]             |[{entity -> MISC, sentence -> 0, chunk -> 0}]                                            |
|Iphone de Julien     |[I-MISC, O, I-PER]        |[Iphone, Julien]     |[{entity -> MISC, sentence -> 0, chunk -> 0}, {entity -> PER, sentence -> 0, chunk -> 1}]|
|Iphone X             |[I-MISC, I-MISC]          |[Iphone X]           |[{entity -> MISC, sentence -> 0, chunk -> 0}]                                            |
|Iphone de Nicolas    

In [51]:
# Annotate the "adresse" column, tag every row with an id and explode the
# chunk metadata so that each recognised chunk gets its own output row.
annotations = pipeline.transform(df.select("adresse").toDF("text"))
result = (
    annotations
    .withColumn("id", F.monotonically_increasing_id())
    .select("id", "text", "entities")
    .select(
        "id",
        "text",
        F.col("entities.result").alias("result"),
        F.explode("entities.metadata"),
    )
)
# Label extraction is left disabled in this cell:
#result = result.select("id", "text", "result", F.col("col.entity").alias("predict"))
result.show(n=200, truncate=False)

+---+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |text                             |entities                                                                                                                                                      |
+---+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |Route de Trevoux                 |[{chunk, 0, 15, Route de Trevoux, {entity -> LOC, sentence -> 0, chunk -> 0}, []}]                                                                            |
|1  |Rue Sainte Marie                 |[{chunk, 0, 15, Rue Sainte Marie, {entity -> LOC, sentence -> 0, chunk -> 0}, []}]                                                                            |
|2  |

In [68]:
# Annotate the "adresse" column and list, one row per recognised chunk, the
# row id, the input text, the chunk texts and the chunk's entity label.
annotations = pipeline.transform(df.select("adresse").toDF("text"))
result = (
    annotations
    .withColumn("id", F.monotonically_increasing_id())
    .select("id", "text", "entities")
    .select(
        "id",
        "text",
        F.col("entities.result").alias("result"),
        F.explode("entities.metadata"),
    )
    .select("id", "text", "result", F.col("col.entity").alias("predict"))
)
result.show(n=200, truncate=False)

+---+---------------------------------+---------------------------------+-------+
|id |text                             |result                           |predict|
+---+---------------------------------+---------------------------------+-------+
|0  |Route de Trevoux                 |[Route de Trevoux]               |LOC    |
|1  |Rue Sainte Marie                 |[Rue Sainte Marie]               |LOC    |
|2  |1711 Route d’Hauteville          |[Route d’Hauteville]             |LOC    |
|3  |Route du Morbier                 |[Route du Morbier]               |LOC    |
|4  |Rue des Bleuets                  |[Rue des Bleuets]                |LOC    |
|5  |7 Rue Jean Monnet                |[7 Rue Jean Monnet]              |LOC    |
|6  |193 Chemin de Chavagneux         |[Chemin de Chavagneux]           |LOC    |
|7  |55 Rue du Coteau                 |[55 Rue du Coteau]               |LOC    |
|8  |Rue de l’Eglise                  |[Rue de l’Eglise]                |LOC    |
|9  |89 Impasse 

In [69]:
# Collapse the per-chunk rows back to one row per id: the distinct chunk-text
# arrays on one side, every chunk label (duplicates kept) on the other.
result = result.groupBy("id").agg(
    F.collect_set("result"),
    F.collect_list("predict"),
)
result.orderBy("id").show(n=200, truncate=False)

+---+-----------------------------------+---------------------+
|id |collect_set(result)                |collect_list(predict)|
+---+-----------------------------------+---------------------+
|0  |[[Route de Trevoux]]               |[LOC]                |
|1  |[[Rue Sainte Marie]]               |[LOC]                |
|2  |[[Route d’Hauteville]]             |[LOC]                |
|3  |[[Route du Morbier]]               |[LOC]                |
|4  |[[Rue des Bleuets]]                |[LOC]                |
|5  |[[7 Rue Jean Monnet]]              |[LOC]                |
|6  |[[Chemin de Chavagneux]]           |[LOC]                |
|7  |[[55 Rue du Coteau]]               |[LOC]                |
|8  |[[Rue de l’Eglise]]                |[LOC]                |
|9  |[[89 Impasse des Chênes]]          |[MISC]               |
|10 |[[Rue du Journans]]                |[LOC]                |
|11 |[[Rue de la Rogeraie]]             |[LOC]                |
|12 |[[Bourg]]                          