In [70]:
import pandas as pd
import numpy as np
import zipfile, requests, io

from evidently import ColumnMapping
from evidently.report import Report
from evidently.test_suite import TestSuite

from evidently.metric_preset import TextOverviewPreset

from evidently.metrics import TextDescriptorsDriftMetric
from evidently.metrics import TextDescriptorsDistribution
from evidently.metrics import TextDescriptorsCorrelationMetric
from evidently.metrics import ColumnDriftMetric

from evidently.descriptors import TextLength, TriggerWordsPresence, OOV, NonLetterCharacterPercentage, SentenceCount, WordCount, Sentiment, RegExp
# `TextLength` is imported twice below under the same name (descriptor and
# feature); the later import wins. These aliases give each one a stable name.
from evidently.descriptors import TextLength as TextLengthDescriptor

from evidently.features.text_length_feature import TextLength
from evidently.features.text_length_feature import TextLength as TextLengthFeature
from evidently.features.OOV_words_percentage_feature import OOVWordsPercentage

# Prepare legal acts dataset

In [60]:
# Download the zipped legal acts dataset.
# A timeout keeps the cell from hanging forever, and raise_for_status() makes
# an HTTP error fail here instead of surfacing later as an unreadable archive.
LEGAL_ACTS_URL = 'https://drive.usercontent.google.com/u/0/uc?id=11xy6a6cbteD2TG9r3CU16ysO50xrqZf2&export=download'
legal_acts_response = requests.get(LEGAL_ACTS_URL, timeout=60)
legal_acts_response.raise_for_status()
legal_acts_content = legal_acts_response.content

In [61]:
# Read `data.json` out of the downloaded archive into a DataFrame.
archive_buffer = io.BytesIO(legal_acts_content)
with zipfile.ZipFile(archive_buffer) as arc:
    with arc.open("data.json") as json_member:
        legal_acts_data = pd.read_json(json_member)

# Derive two columns:
#   labels - each row's label collection joined into one comma-separated string
#   year   - characters 5..8 of `id`, parsed as an integer
legal_acts_data = legal_acts_data.assign(
    labels=lambda frame: frame['labels'].apply(
        lambda row_labels: ','.join(str(label) for label in row_labels)
    ),
    year=lambda frame: frame['id'].str.slice(5, 9).astype(int),
)

## Split the dataset into the reference and current sets to compare

In [62]:
# Split by year into the two datasets that the reports compare.
# `.copy()` makes each split an independent frame, so adding columns to one of
# them later does not raise pandas' SettingWithCopyWarning.
# NOTE(review): rows with year == 2000 fall into neither split - confirm this
# gap is intentional.
reference = legal_acts_data[legal_acts_data['year'] < 2000].copy()
current = legal_acts_data[legal_acts_data['year'] > 2000].copy()

# Map the schema: `labels` is the target and `text` is the only text feature.
# No prediction column is mapped, and there are no categorical or numerical
# features.
column_mapping_legal = ColumnMapping(
    target="labels",
    text_features=['text'],
    categorical_features=[],
    numerical_features=[],
)

# Text Metrics

In [63]:
# Text metrics on the `text` column; no custom descriptors are passed.
text_specific_metrics_report = Report(
    metrics=[
        metric_cls(column_name="text")
        for metric_cls in (
            TextDescriptorsDriftMetric,
            TextDescriptorsDistribution,
            TextDescriptorsCorrelationMetric,
        )
    ]
)

text_specific_metrics_report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping_legal,
)
text_specific_metrics_report.save_html("reports/text-metrics-legal_acts.html")

In [64]:
# Descriptor drift on the `text` column with an explicit set of descriptors.
# The dict keys are the display names given to each descriptor.
# `TextLengthDescriptor` is used because the bare name `TextLength` is
# imported twice at the top of the notebook and the later import (the feature
# class) wins on a fresh kernel.
report = Report(metrics=[
    TextDescriptorsDriftMetric(column_name="text", descriptors={
        "Legal Acts Length": TextLengthDescriptor(),
        "Legal acts about governance": TriggerWordsPresence(words_list=['governance']),
        "Legal acts about sport": TriggerWordsPresence(words_list=['sport', 'sports']),
        "Legal Acts Sentence Count": SentenceCount(),
        "Legal Acts Word Count": WordCount(),
        "Legal Acts Sentiment": Sentiment(),
        "Legal Acts questions": RegExp(reg_exp=r'.*\?.*'),
        "Legal Acts OOV": OOV(),
        "Legal Acts Non Letter %": NonLetterCharacterPercentage(),
    })
])

report.run(reference_data=reference, current_data=current, column_mapping=column_mapping_legal)
report.save_html("reports/text_desc_drift_metrics.html")

# Text Overview Preset

In [65]:
# Text overview preset on the `text` column; no custom descriptors are passed.
text_overview_report_legal = Report(metrics=[
    TextOverviewPreset(column_name='text')
])
text_overview_report_legal.run(reference_data=reference, current_data=current, column_mapping=column_mapping_legal)
# Save the report built in this cell. The original saved `report`, which is a
# different Report object created in another cell.
# NOTE(review): this is the only report not written under "reports/" -
# confirm whether the path should be "reports/text_overview_report.html".
text_overview_report_legal.save_html("text_overview_report.html")

In [66]:
# Text overview preset on the `text` column with an explicit set of
# descriptors. The dict keys are the display names given to each descriptor.
# `TextLengthDescriptor` is used because the bare name `TextLength` is
# imported twice at the top of the notebook and the later import (the feature
# class) wins on a fresh kernel.
text_overview_report_legal = Report(metrics=[
    TextOverviewPreset(column_name="text", descriptors={
        "Legal Act OOV": OOV(),
        "Legal Act Non Letter %": NonLetterCharacterPercentage(),  # label typo "Legat" fixed
        "Legal Act Length": TextLengthDescriptor(),
        "Legal Acts about Sport": TriggerWordsPresence(words_list=['sport']),
        "Legal Act Sentence Count": SentenceCount(),
        "Legal Act Word Count": WordCount(),
        "Legal Act Sentiment": Sentiment(),
        "Legal Act questions": RegExp(reg_exp=r'.*\?.*'),
    })
])

text_overview_report_legal.run(reference_data=reference, current_data=current, column_mapping=column_mapping_legal)
text_overview_report_legal.save_html("reports/text_overview_descriptors.html")

# Column Drift Metric for Text Descriptors

In [68]:
# Column drift computed on descriptors bound to the `text` column through
# `.for_column("text")`.
# `TextLengthDescriptor` is used because the bare name `TextLength` is
# imported twice at the top of the notebook and the later import (the feature
# class) wins on a fresh kernel.
drift_report_legal = Report(metrics=[
    ColumnDriftMetric(
        column_name=TextLengthDescriptor(display_name="TextLength").for_column("text"),
    ),
    ColumnDriftMetric(
        column_name=TriggerWordsPresence(
            words_list=['immigrazione'],
            display_name="Legal Acts about immigrazione",
        ).for_column("text"),
    ),
])

drift_report_legal.run(reference_data=reference, current_data=current, column_mapping=column_mapping_legal)
drift_report_legal.save_html('reports/text_drift_report_legal.html')

In [71]:
# Generate the text-length and OOV features for the `text` column of `current`
# by calling the feature classes directly.
# `TextLengthFeature` names the feature class explicitly, so this cell does not
# depend on which of the two `TextLength` imports was executed last.
text_feature = TextLengthFeature(column_name='text').generate_feature(data=current, data_definition=None)
oov_feature = OOVWordsPercentage(column_name='text').generate_feature(data=current, data_definition=None)

In [72]:
# Attach the generated feature values to `current` as two new columns.
# `.assign` returns a new frame rather than writing into `current` in place,
# which avoids pandas' SettingWithCopyWarning when `current` is a filtered
# slice and keeps the cell safe to re-run.
current = current.assign(
    text_length=text_feature.values,
    oov_share=oov_feature.values,
)

In [77]:
# Show the rows of `current` whose `oov_share` is above 80.
current.loc[current['oov_share'].gt(80)]

Unnamed: 0,text,id,labels,year,text_length,oov_share
5,Ratifica ed esecuzione dell'Accordo tra il Gov...,ipzs-20210525_21G00083,A1810,2021,203,90.322581
11,Ratifica ed esecuzione dell'Accordo fra il Gov...,ipzs-20210519_21G00075,A1810,2021,204,89.655172
15,Ratifica ed esecuzione dell'Accordo di coopera...,ipzs-20210517_21G00073,A1810,2021,205,86.666667
24,Regolamento recante attuazione dell'articolo 1...,ipzs-20210423_21G00060,A3565,2021,182,84.000000
27,Regolamento recante modifiche al decreto minis...,ipzs-20210421_21G00061,A4320,2021,161,81.818182
...,...,...,...,...,...,...
5106,Ratifica ed esecuzione dell'Accordo tra il Gov...,ipzs-20010112_001G0007,A1810,2001,236,81.081081
5108,Regolamento recante disposizioni sulla composi...,ipzs-20010111_001G0003,A5170,2001,253,82.857143
5112,Approvazione del nuovo regolamento di esecuzio...,ipzs-20010108_000G0447,A6040,2001,136,88.888889
5114,Norme sull'organizzazione e sul personale del ...,ipzs-20010108_001G0004,A3460,2001,65,87.500000
