# Setup

In [1]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import LightPipeline
import pyspark.sql.functions as F
from pyspark.sql import Row

import os
import json
import numpy as ny
import pandas as pd
import dataprofiler as dp
import matplotlib.pyplot as plt
# CSV dataset read by the Spark, pandas and Data Profiler loaders in this notebook
path="data/data_set_fusion.csv"

In [2]:
# Start the Spark NLP session and report the library versions in use.
spark = sparknlp.start()
print(f"Spark NLP version: {sparknlp.version()}")
print(f"Apache Spark version: {spark.version}")

Spark NLP version: 3.3.2
Apache Spark version: 3.1.2


### Modèle téléchargé en local sans problème

In [None]:
# Load the pretrained French 'entity_recognizer_md' pipeline.
# NOTE(review): the saved output of this cell mentions 'entity_recognizer_lg',
# so it looks stale - re-run the cell to confirm which model is downloaded.
pipeline = PretrainedPipeline('entity_recognizer_md', lang = 'fr')
# Alternative kept for reference: presumably loads a copy stored on local disk.
#pipeline = PretrainedPipeline.from_disk('entity_recognizer_md_fr')

entity_recognizer_lg download started this may take some time.


# Spark NLP

In [None]:
def getResultSNLP(list_ner, entity):
    """Turn per-row NER tags into per-row booleans for one entity type.

    Args:
        list_ner: iterable of row-like objects exposing a ``ner`` attribute,
            a sequence of tag strings.
        entity: substring looked for in each tag (e.g. ``'PER'``).

    Returns:
        list of bool, one per row: True when at least 2/3 of the row's tags
        contain ``entity``. A row without any tag is reported as False
        instead of raising ZeroDivisionError.
    """
    result_snlp = []
    for row in list_ner:
        tags = row.ner
        if not tags:
            # No tag at all (presumably an empty or null cell): nothing detected.
            result_snlp.append(False)
            continue
        hits = sum(1 for tag in tags if entity in tag)
        result_snlp.append(hits / len(tags) >= 2/3)
    return result_snlp

In [None]:
def runSparkNLP(data, entity):
    """Run the pretrained pipeline on ``data`` and flag rows matching ``entity``.

    ``data`` is passed to ``pipeline.transform``; the ``ner.result`` column is
    collected and handed to ``getResultSNLP``, whose list is returned.
    """
    ner_rows = (
        pipeline.transform(data)
        .selectExpr("ner.result AS ner")
        .collect()  # list of Row, each holding the tags found for one input row
    )
    return getResultSNLP(ner_rows, entity)

# Data Profiler

In [None]:
def runDataProfiler(data, entity):
    """Label ``data`` with a structured DataLabeler and compare with ``entity``.

    Returns:
        list of bool, one per entry of the labeler's ``'pred'`` output,
        True where the predicted label equals ``entity``.
    """
    structured_labeler = dp.DataLabeler(labeler_type='structured')
    predicted_labels = structured_labeler.predict(data)['pred']
    matches = []
    for label in predicted_labels:
        matches.append(label == entity)
    return matches

# Comparaison 
### Cas où une colonne contient un seul type d'information (PER, LOC, etc.) <=> on n'a que des positifs, aucun négatif

In [None]:
def comparaison(result_snlp, result_dp):
    """Compare the per-row detections of Spark NLP and Data Profiler.

    Prints the both/snlp/dp/none counts and draws them as a pie chart, then
    prints and plots the share of rows each tool detected. That share is
    reported as "recall" because this comparison targets columns holding a
    single information type, where every row is a positive.

    Args:
        result_snlp: list of bool, one per row, from Spark NLP.
        result_dp: list of bool, one per row, from Data Profiler.

    Raises:
        ValueError: if the two lists do not have the same length (rows could
            not be paired reliably).
    """
    if len(result_snlp) != len(result_dp):
        raise ValueError(
            "result_snlp and result_dp must have the same length, got "
            "{} and {}".format(len(result_snlp), len(result_dp))
        )
    if not result_snlp:
        # Nothing to chart, and the ratios below would divide by zero.
        print("No result to compare.")
        return

    result_list = {'both': 0, 'snlp': 0, 'dp': 0, 'none': 0}
    for found_snlp, found_dp in zip(result_snlp, result_dp):
        if found_snlp:
            key = 'both' if found_dp else 'snlp'
        else:
            key = 'dp' if found_dp else 'none'
        result_list[key] += 1

    print(result_list)
    plt.pie(list(result_list.values()), labels = list(result_list.keys()))
    plt.legend(title = "Detection Rate")
    plt.show()

    recall_snlp = result_snlp.count(True) / len(result_snlp)
    recall_dp = result_dp.count(True) / len(result_dp)
    print("Recall Spark NLP:", recall_snlp)
    print("Recall Data Profiler:", recall_dp)
    recall = [recall_snlp, recall_dp]
    plt.bar(["Spark NLP", "Data Profiler"], recall)
    plt.ylabel('Recall')
    plt.show()

In [None]:
def getResult(column, entity_snlp, entity_dp):
    """Evaluate one CSV column with both Spark NLP and Data Profiler.

    Reads ``path`` twice: once as a Spark DataFrame (the selected column is
    renamed ``text``) and once through ``dp.Data`` restricted to ``column``.

    Returns:
        tuple ``(result_snlp, result_dp)`` as produced by ``runSparkNLP`` and
        ``runDataProfiler``.
    """
    csv_reader = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
    )
    text_df = csv_reader.load(path).select(column).toDF("text")
    result_snlp = runSparkNLP(text_df, entity_snlp)

    profiler_input = dp.Data(path, options=dict(selected_columns=[column]))
    result_dp = runDataProfiler(profiler_input, entity_dp)
    return result_snlp, result_dp

## nom

In [None]:
# Evaluate the 'nom' column: Spark NLP looks for 'PER' tags, Data Profiler for the 'PERSON' label
result_snlp, result_dp = getResult('nom', 'PER', 'PERSON')

In [None]:
# Print the counts and draw the charts for the current result_snlp / result_dp
comparaison(result_snlp, result_dp)

## adresse

In [None]:
# Inspect the tokens and NER tags the pipeline produces on the 'adresse' column.
df_spark = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path)
)
adresse_text = df_spark.select('adresse').toDF("text")
annotations = pipeline.transform(adresse_text).selectExpr(
    "token.result AS token",
    "ner.result AS ner",
)
annotations.show(truncate=False)

In [None]:
# Evaluate the 'adresse' column. Data Profiler's label for addresses is
# 'ADDRESS' (no trailing E); a misspelled label matches no prediction and
# makes every row look undetected.
result_snlp, result_dp = getResult('adresse', 'LOC', 'ADDRESS')

In [None]:
# Print the counts and draw the charts for the current result_snlp / result_dp
comparaison(result_snlp, result_dp)

# Cas d'un mélange de 2 colonnes



In [None]:
import random

# Upper bound on the number of rows taken from each column
n = 20000

df = pd.read_csv(path)
# Slicing returns fewer than n rows when the CSV is shorter, so the class
# labels are sized from the actual slices rather than from n.
noms = df['nom'][:n]
adresses = df['adresse'][:n]
fusion_data = pd.DataFrame()
fusion_data['data'] = pd.concat([noms, adresses])
fusion_data['class'] = ['PER'] * len(noms) + ['LOC'] * len(adresses)
# Shuffle the rows; the fixed seed keeps the order identical across runs.
fusion_data = fusion_data.sample(frac=1, random_state=42)

from pyspark.sql.types import StringType
# Single string column named 'text', in the same row order as fusion_data
fusion_data_snlp = spark.createDataFrame(fusion_data['data'].tolist(), StringType()).toDF('text')
fusion_data

In [None]:
# Show the tokens and NER tags the pipeline produces on the mixed data.
annotations = pipeline.transform(fusion_data_snlp).selectExpr(
    "token.result AS token",
    "ner.result AS ner",
)
annotations.show(truncate=False)

In [None]:
def countResult(pred, act):
    """Count the confusion-matrix cells of boolean predictions.

    Args:
        pred: predicted flags, indexed like ``act``.
        act: actual flags; its length drives the iteration.

    Returns:
        list ``[TP, FP, TN, FN]``; the same counts are also printed.
    """
    tp = fp = tn = fn = 0
    for idx in range(len(act)):
        actual, predicted = act[idx], pred[idx]
        if actual == predicted == True:
            tp += 1
        if actual == False and predicted == True:
            fp += 1
        if actual == predicted == False:
            tn += 1
        if actual == True and predicted == False:
            fn += 1
    result_count = [tp, fp, tn, fn]
    print("TP: ", tp, " // FP: ", fp, " // TN: ", tn, " // FN: ", fn)
    return result_count

In [None]:
def comparaison2(result_snlp, result_dp, actual_class):
    """Report recall and precision of both tools against the actual classes.

    For each tool the TP/FP/TN/FN counts come from ``countResult``. Recall is
    TP/(TP + FN) and precision is TP/(TP + FP); each falls back to 0 when its
    denominator is 0. Values are printed and shown as bar charts.
    """
    def ratio(numerator, denominator):
        # Guarded division shared by recall and precision
        return numerator / denominator if denominator != 0 else 0

    tools = ["Spark NLP", "Data Profiler"]

    print("Count result of SNLP:")
    tp_snlp, fp_snlp, _, fn_snlp = countResult(result_snlp, actual_class)
    print("Count result of DP:")
    tp_dp, fp_dp, _, fn_dp = countResult(result_dp, actual_class)

    # Recall = TP/(TP + FN)
    recall = [ratio(tp_snlp, tp_snlp + fn_snlp), ratio(tp_dp, tp_dp + fn_dp)]
    print("Recall Spark NLP:", recall[0])
    print("Recall Data Profiler:", recall[1])
    plt.bar(tools, recall)
    plt.ylabel('Recall')
    plt.show()

    # Precision = TP/(TP + FP)
    precision = [ratio(tp_snlp, tp_snlp + fp_snlp), ratio(tp_dp, tp_dp + fp_dp)]
    print("Precision Spark NLP:", precision[0])
    print("Precision Data Profiler:", precision[1])
    plt.bar(tools, precision)
    plt.ylabel('Precision')
    plt.show()

## nom

In [None]:
# Mixed data with PER as the positive class: rows whose 'class' is 'PER' are the actual positives
result_snlp = runSparkNLP(fusion_data_snlp, 'PER')
result_dp =  runDataProfiler(fusion_data['data'], 'PERSON')
actual_class = [d == 'PER' for d in fusion_data['class']]
comparaison2(result_snlp, result_dp, actual_class)

## adresse

In [None]:
# Mixed data with LOC as the positive class: rows whose 'class' is 'LOC' are
# the actual positives. Data Profiler's label for addresses is 'ADDRESS'
# (no trailing E); a misspelled label matches no prediction.
result_snlp = runSparkNLP(fusion_data_snlp, 'LOC')
result_dp =  runDataProfiler(fusion_data['data'], 'ADDRESS')
actual_class = [d == 'LOC' for d in fusion_data['class']]
comparaison2(result_snlp, result_dp, actual_class)

# Spark NLP + Data Profiler

In [None]:
def get_structured_results(results):
    """Build a two-column summary from a profiler report.

    Args:
        results: mapping with a ``'data_stats'`` iterable whose entries expose
            ``'column_name'`` and ``'data_label'``.

    Returns:
        pandas DataFrame with columns ``'Column'`` and ``'Pred DP'``, one row
        per entry of ``results['data_stats']``.
    """
    # Single pass over the report, reading both fields of each entry
    pairs = [
        (col_report['column_name'], col_report['data_label'])
        for col_report in results['data_stats']
    ]
    names = [name for name, _ in pairs]
    labels = [label for _, label in pairs]
    return pd.DataFrame({'Column' : names, 'Pred DP' : labels})

In [None]:
# Load the CSV through Data Profiler and preview the head of the loaded data
data = dp.Data(path)
df_data = data.data
df_data.head()

In [None]:
# Profile the loaded data and request the report in 'compact' output format
profiler = dp.Profiler(data)
results = profiler.report(report_options={'output_format':'compact'})

In [None]:
# Summarise the report as one row per column with Data Profiler's predicted label
df_result = get_structured_results(results)
print(df_result)

### Faut-il appliquer Spark NLP uniquement sur les colonnes UNKNOWN ou sur toutes ?

In [None]:
# Names of the columns whose Data Profiler prediction is 'UNKNOWN'
unknown_cols = [
    column_name
    for column_name, label in zip(df_result['Column'], df_result['Pred DP'])
    if label == 'UNKNOWN'
]
print(unknown_cols)

In [None]:
# Print each Data Profiler prediction on its own line
for i in df_result['Pred DP']:
    print(i)

In [None]:
def getResult2(column):
    """Return the label ``getResultSNLP2`` derives for one CSV column.

    The column is read from ``path`` with Spark, renamed ``text``, run
    through ``pipeline``, and its collected ``ner.result`` rows are passed to
    ``getResultSNLP2``.
    """
    csv_reader = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
    )
    text_df = csv_reader.load(path).select(column).toDF("text")
    ner_rows = (
        pipeline.transform(text_df)
        .selectExpr("ner.result AS ner")
        .collect()
    )
    return getResultSNLP2(ner_rows)

In [None]:
def getResultSNLP2(list_ner):
    """Return the dominant entity type of a column from its per-row NER tags.

    Each row is assigned the entity ('PER', 'LOC', 'MISC' or 'ORG') found in
    the largest number of its tags, or "UNKNOWN" when that entity covers less
    than 2/3 of the tags or when the row has no tag at all. The label
    assigned to the most rows is returned.

    Args:
        list_ner: iterable of row-like objects exposing a ``ner`` attribute,
            a sequence of tag strings.

    Returns:
        str: the most frequent row label; ties go to the label seen first.
        "UNKNOWN" when ``list_ner`` is empty.
    """
    from collections import Counter

    entity_types = ('PER', 'LOC', 'MISC', 'ORG')
    row_labels = Counter()
    for row in list_ner:
        tags = row.ner
        label = "UNKNOWN"
        if tags:  # a row without tags would otherwise divide by zero
            count_entity = {
                key: sum(1 for tag in tags if key in tag)
                for key in entity_types
            }
            max_key = max(count_entity, key = count_entity.get)
            if count_entity[max_key] / len(tags) >= 2/3:
                label = max_key
        row_labels[label] += 1

    if not row_labels:
        return "UNKNOWN"
    # Counter keeps first-seen order, so max() breaks ties like the list-based vote did.
    return max(row_labels, key = row_labels.get)

In [None]:
#df_result['Pred SNLP'] = ['UNKNOWN']*df_result.shape[0]
#df_result

In [None]:
# Replace the 'UNKNOWN' prediction of each listed column with the label
# returned by getResult2. The row selector is a boolean mask, so .loc is the
# right accessor; .at only addresses a single cell by row/column label.
for col in unknown_cols:
    df_result.loc[df_result['Column'] == col, 'Pred DP'] = getResult2(col)
print(df_result)

In [None]:
# Release the Spark session at the end of the notebook. stop() is called on
# the session object returned by sparknlp.start().
# NOTE(review): the sparknlp module does not appear to expose stop(), so the
# former sparknlp.stop() call only raised an error that was silently ignored.
try:
    spark.stop()
except Exception as err:
    # Best effort: report the failure instead of hiding it.
    print("Could not stop the Spark session:", err)