In [12]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql.types import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

# Start the Spark session through Spark NLP's helper; `spark` is the entry
# point used to read the input CSV further down.
spark = sparknlp.start()

In [13]:
# Path handed to `from_disk`: the pretrained entity-recognition pipeline
# stored next to the notebook.
NER_PIPELINE_PATH = "entity_recognizer_lg_fr"
pipeline = PretrainedPipeline.from_disk(NER_PIPELINE_PATH)

In [14]:
# Load the data set, treating the first line of the CSV as column names.
path = "data/data_set_final.csv"
df = spark.read.csv(path, header=True)

In [15]:
def get_entity(column):
    """Summarise which entity label the NER pipeline assigns to a column's values.

    Every row of ``df[column]`` (global ``df``) is run through the global
    ``pipeline``. Each row is assigned the label that occurs most often among
    the entities recognised in it. Rows without any recognised entity, and rows
    whose dominant label is ``MISC``, are reported together as ``OTHER``.

    Args:
        column: Name of the column of the global ``df`` to annotate.

    Returns:
        A pandas DataFrame with columns ``predict`` (entity label or ``OTHER``)
        and ``count`` (number of rows), sorted by ``count`` in descending order.
    """
    # Local import: pandas is not imported at notebook level.
    import pandas as pd

    data = df.select(column).toDF("text")
    annotations = pipeline.transform(data)

    # One record per (input row, recognised entity), carrying the entity label.
    labels = (
        annotations
        .withColumn("id", F.monotonically_increasing_id())
        .select("id", F.explode("entities.metadata"))
        .select("id", F.col("col.entity").alias("predict"))
    )
    per_row = labels.groupby("id", "predict").agg(F.count("predict").alias("count"))

    # Keep the most frequent label of each row. The label name is a secondary
    # sort key so that ties are resolved the same way on every run.
    w = Window.partitionBy("id").orderBy(F.col("count").desc(), F.col("predict"))
    dominant = (
        per_row
        .withColumn("row", F.row_number().over(w))
        .filter(F.col("row") == 1)
        .drop("row", "count")
        .groupby("predict")
        .count()
        .toPandas()
    )

    # Everything that did not end up with a non-MISC label counts as OTHER.
    not_misc = dominant["predict"] != "MISC"
    labelled_rows = int(dominant.loc[not_misc, "count"].sum())
    other_row = pd.DataFrame(
        [{"predict": "OTHER", "count": annotations.count() - labelled_rows}]
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # ignore_index=True keeps the row labels the old append call produced.
    summary = pd.concat([dominant, other_row], ignore_index=True)
    summary = summary[summary["predict"] != "MISC"]
    return summary.sort_values(by=["count"], ascending=False)

In [16]:
def update_text(text, stop_words=None):
    """Capitalise every word of ``text`` except stop words.

    The text is lower-cased and split on single spaces. Each token that is not
    a stop word gets its first character upper-cased; stop words stay
    lower-case.

    Known limitations (visible in this notebook's comparison cells): only the
    first character of a token is capitalised, so elided articles and
    hyphenated names lose their inner capital ("d’Hauteville" -> "D’hauteville",
    "Jean-Pierre" -> "Jean-pierre"), and names that coincide with a stop word
    are lower-cased ("Vos" -> "vos").

    Args:
        text: String to re-case, or ``None`` (e.g. a null cell passed by a UDF).
        stop_words: Optional collection of lower-case words to leave
            uncapitalised. Defaults to the notebook-level ``fr_stop`` list.

    Returns:
        The re-cased string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        # Null cells reach Python UDFs as None; propagate them instead of
        # failing the whole job with an AttributeError.
        return None
    if stop_words is None:
        stop_words = fr_stop
    return " ".join(
        token if token in stop_words else token.capitalize()
        for token in text.lower().split(" ")
    )

# Spark UDF wrapper around update_text, returning a string column.
my_udf = F.udf(update_text, StringType())

In [17]:
# Derive four casing variants of the address so the NER results can be
# compared across them.
df = (
    df.withColumn("adresse_lower", F.lower(F.col("adresse")))
      .withColumn("adresse_upper", F.upper(F.col("adresse")))
      .withColumn("adresse_initcap", F.initcap(F.col("adresse")))
      .withColumn("adresse_udfcap", my_udf(F.col("adresse")))
)
(
    df.select(
        "adresse",
        "adresse_lower",
        "adresse_upper",
        "adresse_initcap",
        "adresse_udfcap",
    )
    .show(truncate=False)
)

+------------------------+------------------------+------------------------+------------------------+------------------------+
|adresse                 |adresse_lower           |adresse_upper           |adresse_initcap         |adresse_udfcap          |
+------------------------+------------------------+------------------------+------------------------+------------------------+
|Route de Trevoux        |route de trevoux        |ROUTE DE TREVOUX        |Route De Trevoux        |Route de Trevoux        |
|Rue Sainte Marie        |rue sainte marie        |RUE SAINTE MARIE        |Rue Sainte Marie        |Rue Sainte Marie        |
|1711 Route d’Hauteville |1711 route d’hauteville |1711 ROUTE D’HAUTEVILLE |1711 Route D’hauteville |1711 Route D’hauteville |
|Route du Morbier        |route du morbier        |ROUTE DU MORBIER        |Route Du Morbier        |Route du Morbier        |
|Rue des Bleuets         |rue des bleuets         |RUE DES BLEUETS         |Rue Des Bleuets         |Rue des Bl

In [20]:
# Addresses that the UDF-based capitalisation changes compared to the raw value.
df_test = (
    df.select("adresse", "adresse_udfcap")
      .where(F.col("adresse") != F.col("adresse_udfcap"))
)
df_test.count()

3814

In [21]:
# Show the rows where the two address variants differ, without truncating values.
df_test.show(truncate = False)

+-------------------------------+-------------------------------+
|adresse                        |adresse_udfcap                 |
+-------------------------------+-------------------------------+
|1711 Route d’Hauteville        |1711 Route D’hauteville        |
|Rue de l’Eglise                |Rue de L’eglise                |
|3 Au Bourg                     |3 au Bourg                     |
|Lotissement Genevriers l’Europe|Lotissement Genevriers L’europe|
|La Rue                         |la Rue                         |
|Au Dessus de la Roche          |au dessus de la Roche          |
|3 Place Honore d’Urfe          |3 Place Honore D’urfe          |
|175 Rue de l’Etang             |175 Rue de L’etang             |
|4 Impasse du Quart d’Amont     |4 Impasse du Quart D’amont     |
|879 Chemin de l’Aigrefeuille   |879 Chemin de L’aigrefeuille   |
|Montee de l’Orme               |Montee de L’orme               |
|913 Chemin de l’Aigrefeuille   |913 Chemin de L’aigrefeuille   |
|Rue de l’

In [22]:
# Entity distribution for the raw address and each of its casing variants.
for column_name in (
    "adresse",
    "adresse_lower",
    "adresse_upper",
    "adresse_initcap",
    "adresse_udfcap",
):
    print(get_entity(column_name))

  predict  count
2     LOC  30992
4   OTHER   6490
1     PER   1978
0     ORG    536
  predict  count
4   OTHER  38768
2     LOC   1193
1     PER     20
0     ORG     15
  predict  count
2     LOC  22768
4   OTHER   8382
0     ORG   7572
1     PER   1274
  predict  count
2     LOC  26388
4   OTHER  10496
1     PER   2173
0     ORG    939
  predict  count
2     LOC  31256
4   OTHER   6232
1     PER   1935
0     ORG    573


In [23]:
# Entity distribution for the raw last-name and first-name columns.
for column_name in ("nom", "prenom"):
    print(get_entity(column_name))

  predict  count
1     PER  29595
2     LOC   6252
4   OTHER   3354
0     ORG    795
  predict  count
1     PER  29560
2     LOC   5195
4   OTHER   4404
0     ORG    837


In [24]:
# Apply the stop-word-aware capitalisation to both name columns.
df = (
    df.withColumn("nom_udfcap", my_udf(F.col("nom")))
      .withColumn("prenom_udfcap", my_udf(F.col("prenom")))
)

In [25]:
# Entity distribution for the re-capitalised name columns.
for column_name in ("nom_udfcap", "prenom_udfcap"):
    print(get_entity(column_name))

  predict  count
1     PER  29587
2     LOC   6288
4   OTHER   3337
0     ORG    784
  predict  count
1     PER  29426
2     LOC   5325
4   OTHER   4419
0     ORG    826


In [29]:
# Last names that the UDF-based capitalisation changes.
df_test = (
    df.select("nom", "nom_udfcap")
      .where(F.col("nom") != F.col("nom_udfcap"))
)
print(df_test.count())
df_test.show(truncate=False)

416
+------------+------------+
|nom         |nom_udfcap  |
+------------+------------+
|De Jong     |de Jong     |
|De Vries    |de Vries    |
|Da Silva    |da Silva    |
|De Boer     |de Boer     |
|De Groot    |de Groot    |
|De Guzman   |de Guzman   |
|De Smet     |de Smet     |
|Vos         |vos         |
|De La Cruz  |de la Cruz  |
|De Vos      |de vos      |
|De Wit      |de Wit      |
|Cela        |cela        |
|De Clercq   |de Clercq   |
|D9F2J       |D9f2j       |
|De Luca     |de Luca     |
|De Leon     |de Leon     |
|De Backer   |de Backer   |
|De Graaf    |de Graaf    |
|Van De Velde|Van de Velde|
|Da Costa    |da Costa    |
+------------+------------+
only showing top 20 rows



In [30]:
# First names that the UDF-based capitalisation changes.
df_test = (
    df.select("prenom", "prenom_udfcap")
      .where(F.col("prenom") != F.col("prenom_udfcap"))
)
print(df_test.count())
df_test.show(truncate=False)

504
+------------+-------------+
|prenom      |prenom_udfcap|
+------------+-------------+
|Bas         |bas          |
|Meme        |meme         |
|Hou         |hou          |
|Hui         |hui          |
|Pu          |pu           |
|Jean-Pierre |Jean-pierre  |
|Sera        |sera         |
|Mari-Liis   |Mari-liis    |
|Jean De Dieu|Jean de Dieu |
|İBrahim    |İbrahim     |
|Jean-Paul   |Jean-paul    |
|Ceci        |ceci         |
|Jean-Marie  |Jean-marie   |
|İSmail     |İsmail      |
|Jh3A        |Jh3a         |
|Jean-Claude |Jean-claude  |
|Vu          |vu           |
|Ka Hou      |Ka hou       |
|Hue         |hue          |
|Anne-Marie  |Anne-marie   |
+------------+-------------+
only showing top 20 rows

