In [1]:
import pandas as pd
import sparknlp
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from sparknlp.pretrained import PretrainedPipeline
spark = sparknlp.start()
pipeline = PretrainedPipeline.from_disk('entity_recognizer_lg_fr')

In [2]:
data = pd.read_csv("data/wikiner_dataset", sep='/n', header=None, index_col = False)
data.shape

  return func(*args, **kwargs)


(9381, 1)

# Unstructed text

In [3]:
phrases, ners = [], []
for d in data[0]:
    text = d.split()
    p, n = [], []
    for t in text:    
        word = t.split("|")
        p.append(word[0])
        n.append(word[2])
    phrases.append(' '.join(p))
    ners.append(n)
print(len(ners))

9381


In [4]:
phrases

["Il assure à la suite de Saussure le cours de grammaire comparée , qu' il complète à partir de 1894 par une conférence sur l' iranien .",
 "En 1905 , il occupe la chaire de grammaire comparée au Collège de France , où il consacre ses cours à l' histoire et à la structure des langues indo-européennes .",
 'Il a formé toute une génération de linguistes français , parmi lesquels Emile Benveniste , Marcel Cohen , Georges Dumézil , André Martinet , Aurélien Sauvageot , Lucien Tesnière , Joseph Vendryes .',
 "Il devait diriger la thèse de Jean Paulhan sur la sémantique du proverbe et c' est lui qui découvrit Gustave Guillaume .",
 "Il est notamment l' inspirateur de la définition de la phrase adoptée par le linguiste américain Leonard Bloomfield .",
 "En 1902 , il obtient la chaire d' arménien de l' École des langues orientales .",
 'Son étudiant , Hratchia Adjarian , deviendra le fondateur de la dialectologie arménienne .',
 "Ainsi il le dirige vers l' étude de l' oralité dans son cadre na

In [5]:
ners

[['O',
  'O',
  'O',
  'O',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'I-PER',
  'I-PER',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-PER',
  'I-PER',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
 

In [6]:
from pyspark.sql.types import StringType
df = spark.createDataFrame(phrases, StringType()).toDF("text")
annotations = pipeline.transform(df)
result = annotations.select("ner.result")
result = result.collect()
result

[Row(result=['O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
 Row(result=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
 Row(result=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O', 'I-PER', 'I-PER', 'O', 'I-PER', 'I-PER', 'O', 'I-PER', 'I-PER', 'O', 'I-PER', 'I-PER', 'O', 'I-PER', 'I-PER', 'O', 'I-PER', 'I-PER', 'O']),
 Row(result=['O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O']),
 Row(result=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O']),
 Row(result=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O']),
 Row(result=['O', 'O', 'O', 'I-PER', 'I-PER', 'O', 'O'

In [7]:
def comparaison(entity):
    pos = [entity in ners[i][j] for i in range(len(ners)) for j in range(len(ners[i]))]
    tp = [entity in ners[i][j] and entity in result[i].result[j] for i in range(len(ners)) for j in range(len(ners[i]))]
    fp = [entity not in ners[i][j] and entity in result[i].result[j]  for i in range(len(ners)) for j in range(len(ners[i]))]
    fp.count(True)
    print("Recall =", tp.count(True)/ pos.count(True))
    print("Precision =", tp.count(True)/(tp.count(True) + fp.count(True)))

In [8]:
print("PERSON")
comparaison('PER') ### I-PER, B-PER

PERSON
Recall = 0.9905085540192172
Precision = 0.9905085540192172


In [9]:
print("LOCATION")
comparaison('LOC') ## I-LOC, B-LOC

LOCATION
Recall = 0.939084649655731
Precision = 0.9672924488944514


# Structured text (get from unstructured text)

In [10]:
def get_entity(entity):
    entities = []
    for i in range(len(ners)):
        start, end = -1, -1
        for j in range(len(ners[i])):
            if entity in ners[i][j]:
                if start == -1:
                    start = j
                else:
                    end = j
            else:
                if(start != -1 and end != -1):
                    entities.append(' '.join(phrases[i].split()[start:end+1]))
                start, end = -1, -1
    return entities

In [11]:
persons = get_entity("PER")
persons

['de Saussure',
 'Emile Benveniste',
 'Marcel Cohen',
 'Georges Dumézil',
 'André Martinet',
 'Aurélien Sauvageot',
 'Lucien Tesnière',
 'Joseph Vendryes',
 'Jean Paulhan',
 'Gustave Guillaume',
 'Leonard Bloomfield',
 'Hratchia Adjarian',
 'René Descartes',
 "Jean d' Alembert",
 'Carl Friedrich Gauss',
 'Marie Ennemond Camille Jordan',
 'William Rowan Hamilton',
 'Georg Frobenius',
 'Richard Dedekind',
 'William Burnside',
 'Richard Brauer',
 'Al Khuwarizmi',
 'Ada Lovelace',
 'Lord Byron',
 'Charles Babbage',
 'Carlos Menem',
 'Néstor Kirchner',
 'Eduardo Duhalde',
 'Carlos Menem',
 'Nestor Kirchner',
 'Heinz Fischer',
 'Benita Ferrero-Waldner',
 'Louis-Philippe I er',
 'Louis-Robert Goust',
 'Héricart de Thury',
 'tombe du Soldat inconnu',
 'Arsène Lupin',
 'Maurice Leblanc',
 'Arsène Lupin',
 'Arsène Lupin',
 'André-François Ruaud',
 'Maurice Leblanc',
 'Arsène Lupin',
 'Maurice Leblanc',
 'Arsène Lupin',
 'Jacques Dutronc',
 'Georges Descrières',
 'Maurice Leblanc',
 'Arsène Lupin

In [12]:
locs = get_entity("LOC")
locs

['Collège de France',
 'avenue de la Grande-Armée',
 'avenue de Wagram',
 'avenue des Champs-Élysées',
 "Arc de triomphe de l' Étoile",
 'Arc de Triomphe',
 'la France',
 'la France',
 'Afrique du Nord',
 'Asie centrale',
 'Empire perse',
 "mer d' Oman",
 "mer d' Aral",
 'Taj Mahal',
 'Asie du Sud',
 'Empire russe',
 'Égypte antique',
 'province de Ghazni',
 'statues de Bouddha',
 'province de Bâmiyân',
 'Massif central',
 'la France',
 'Clermont Ferrand',
 'Massif central',
 "région d' Auvergne",
 'Massif central',
 'Chaîne des Puys',
 'Puy de Sancy',
 'Monts Dore',
 'Aéroport de Clermont-Ferrand Auvergne',
 "Parc naturel régional des volcans d' Auvergne",
 'Puy Mary',
 'Plomb du Cantal',
 'Super Lioran',
 'Le Mont-Dore',
 "l' Allier",
 'Clermont Ferrand',
 'La Bourboule',
 "région d' Auvergne",
 'chaîne des Puys',
 "région d' Auvergne",
 "communauté d' agglomération de Saint-Quentin",
 "communauté d' agglomération du Soissonnais",
 'canal de Saint-Quentin',
 "communes de l' Aisne",
 

In [13]:
fusion_data = pd.DataFrame()
fusion_data['data'] = persons + locs
fusion_data['class'] = ['PER']*len(persons) + ['LOC']*len(locs)
fusion_data = fusion_data.sample(frac=1)
fusion_data_snlp = spark.createDataFrame(fusion_data['data'].tolist(), StringType()).toDF('text')
fusion_data_snlp.show()

+--------------------+
|                text|
+--------------------+
|       Louis Pasteur|
|       Pablo Picasso|
|     Philippe Breton|
|     Maurice Leblanc|
|    atoll des Roches|
|          L' Arménie|
|      Paul McCartney|
|     Arabie saoudite|
|département de l'...|
|         Noël Mamère|
|          Mel Brooks|
|       Napoléon I er|
|Région de Bruxell...|
|        Carlos Menem|
|     Afrique du Nord|
|      Flavius Aétius|
|       Amenhotep III|
|        Buenos Aires|
|Jean-Pierre Stirbois|
|   Daniel Rutherford|
+--------------------+
only showing top 20 rows



In [14]:
def runSparkNLP(data, entity):
    annotations = pipeline.transform(data)
    list_ner = annotations.selectExpr("ner.result AS ner").collect()
    return getResultSNLP(list_ner, entity)

In [15]:
# transfer list of Row into list of True (detected >= 2/3) and False (non detected)
def getResultSNLP(list_ner, entity):
    result_snlp = []
    for ner in list_ner:
        cpt = 0
        for i in range(len(ner.ner)):
            if entity in ner.ner[i]:
                cpt += 1
        result_snlp.append(cpt/len(ner.ner) >= 2/3)
    return result_snlp

In [16]:
def countResult(pred, act):
    l = len(act)
    tp = [act[i] == pred[i] == True for i in range(l)]
    fp = [act[i] == False and pred[i] == True for i in range(l)]
    tn = [act[i] == pred[i] == False for i in range(l)]
    fn = [act[i] == True and pred[i] == False for i in range(l)]
    result_count = [tp.count(True), fp.count(True), tn.count(True), fn.count(True)]
    print("TP: ", result_count[0], " // FP: ", result_count[1], " // TN: ", result_count[2], " // FN: ", result_count[3])
    return result_count

In [17]:
def comparaison2(result_snlp, actual_class):
    print("Count result of SNLP:")
    count_snlp = countResult(result_snlp, actual_class)
    
    #Recall = TP/(TP+ FN)
    recall_snlp = count_snlp[0] / (count_snlp[0] + count_snlp[3]) if (count_snlp[0] + count_snlp[3]) != 0 else 0
    print("Recall:", recall_snlp)
    
    #Precision = TP/(TP + FP)
    prec_snlp = count_snlp[0] / (count_snlp[0] + count_snlp[1]) if (count_snlp[0] + count_snlp[1]) != 0 else 0
    print("Precision:", prec_snlp)

In [18]:
result_snlp = runSparkNLP(fusion_data_snlp, 'PER')
actual_class = [d == 'PER' for d in fusion_data['class']]
comparaison2(result_snlp, actual_class)

Count result of SNLP:
TP:  2883  // FP:  43  // TN:  2133  // FN:  109
Recall: 0.9635695187165776
Precision: 0.9853041695146958


In [19]:
result_snlp = runSparkNLP(fusion_data_snlp, 'LOC')
actual_class = [d == 'LOC' for d in fusion_data['class']]
comparaison2(result_snlp, actual_class)

Count result of SNLP:
TP:  1338  // FP:  4  // TN:  2988  // FN:  838
Recall: 0.6148897058823529
Precision: 0.9970193740685543
