In [29]:
import io
import json
import zipfile
from pathlib import Path

import pandas as pd
import requests

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
from evidently.metrics import TextDescriptorsDriftMetric

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset
from evidently.tests import *

from evidently.features.text_length_feature import TextLength
from evidently.features.OOV_words_percentage_feature import OOVWordsPercentage

# Prepare legal acts dataset

In [30]:
# Download the zipped legal acts dataset and keep the raw bytes in memory.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be passed on as if it were the archive.
LEGAL_ACTS_URL = 'https://drive.usercontent.google.com/u/0/uc?id=11xy6a6cbteD2TG9r3CU16ysO50xrqZf2&export=download'
legal_acts_response = requests.get(LEGAL_ACTS_URL, timeout=60)
legal_acts_response.raise_for_status()
legal_acts_content = legal_acts_response.content

In [31]:
# Open the downloaded archive straight from memory and parse its "data.json"
# member. Both the archive and the member handle are context-managed so they are
# closed as soon as the frame has been read.
with zipfile.ZipFile(io.BytesIO(legal_acts_content)) as arc, arc.open("data.json") as data_file:
    legal_acts_data = pd.read_json(data_file)

# Collapse each row's iterable of labels into a single comma-separated string.
legal_acts_data['labels'] = legal_acts_data['labels'].apply(lambda x: ','.join(map(str, x)))
# Characters 5..8 of `id` are parsed as an integer year.
# NOTE(review): assumes every id embeds a 4-digit year at that offset — confirm
# against the dataset's id format.
legal_acts_data['year'] = legal_acts_data['id'].str.slice(5, 9).astype(int)

In [32]:
# Chronological split: acts dated before SPLIT_YEAR form the reference set and
# acts from SPLIT_YEAR onward form the current set. The boundary year is
# included in `current` so that no rows are silently dropped from both subsets
# (the previous `< 2000` / `> 2000` pair excluded year 2000 entirely).
SPLIT_YEAR = 2000
reference = legal_acts_data[legal_acts_data['year'] < SPLIT_YEAR]
current = legal_acts_data[legal_acts_data['year'] >= SPLIT_YEAR]

# Schema description handed to Evidently: "labels" is the target and "text" is
# declared as a raw-text feature. The categorical and numerical feature lists
# are explicitly set to empty lists.
# No prediction column (e.g. "predicted_labels") is mapped.
column_mapping_legal = ColumnMapping(
    target="labels",
    text_features=['text'],
    categorical_features=[],
    numerical_features=[],
)

In [33]:

def build_data_drift_report(
    reference_data: pd.DataFrame,
    current_data: pd.DataFrame,
    column_mapping: ColumnMapping,
    drift_share: float = 0.4,
) -> Report:
    """
    Build an Evidently data drift report and run it on the given datasets.

    Args:
        reference_data: Baseline dataset the current data is compared against.
        current_data: Dataset that is checked for drift.
        column_mapping: Column schema forwarded unchanged to ``Report.run``.
        drift_share: Forwarded to ``DataDriftPreset(drift_share=...)``.
            Presumably the share of drifted columns above which the whole
            dataset is reported as drifted — confirm against the Evidently docs.

    Returns:
        The ``Report`` object after ``run`` has been called on it; callers can
        export it (e.g. ``save_html``) or inspect its results.
    """
    data_drift_report = Report(metrics=[DataDriftPreset(drift_share=drift_share)])
    data_drift_report.run(
        reference_data=reference_data,
        current_data=current_data,
        column_mapping=column_mapping,
    )
    return data_drift_report

In [35]:
# Run the drift report on the two subsets using the legal acts column mapping.
report = build_data_drift_report(reference, current, column_mapping_legal)

# Make sure the output folder exists before writing, so the cell also works on a
# fresh checkout where "reports/" has not been created yet.
report_path = Path("reports") / "data_drift.html"
report_path.parent.mkdir(parents=True, exist_ok=True)
report.save_html(str(report_path))
# Tip: report.json() returns the same results as a JSON string.

# Test suites

In [42]:
# Data-integrity and drift checks comparing the current subset to the reference.
tests = TestSuite(tests=[
    TestNumberOfColumnsWithMissingValues(),
    TestNumberOfRowsWithMissingValues(),
    TestNumberOfConstantColumns(),
    TestNumberOfDuplicatedRows(),
    TestNumberOfDuplicatedColumns(),
    TestColumnsType(),
    TestNumberOfDriftedColumns(),
])

# Use the same column mapping as the drift report so the suite interprets the
# columns (target, text feature) the same way the report does.
tests.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping_legal,
)
tests_result = tests.as_dict()
tests_result

{'tests': [{'name': 'The Number of Columns With Missing Values',
   'description': 'The number of columns with missing values is 0. The test threshold is lte=0.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'lte': 0}, 'value': 0.0}},
  {'name': 'The Number Of Rows With Missing Values',
   'description': 'The number of rows with missing values is 0. The test threshold is lte=0 ± 1e-12.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'lte': {'value': 0.0,
      'relative': 0.1,
      'absolute': 1e-12}},
    'value': 0.0}},
  {'name': 'Number of Constant Columns',
   'description': 'The number of constant columns is 0. The test threshold is lte=0.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'lte': 0}, 'value': 0.0}},
  {'name': 'Number of Duplicate Rows',
   'description': 'The number of duplicate rows is 0. The test threshold is eq=0 ± 1e-12.',
   'status': 'SUCC

In [45]:
# Show only the aggregate pass/fail counts from the test suite results.
tests_result["summary"]

{'all_passed': False,
 'total_tests': 7,
 'success_tests': 6,
 'failed_tests': 1,
 'by_status': {'SUCCESS': 6, 'FAIL': 1}}