# Demonstration of DataScope

This notebook walks us through a simple scenario of using `ease.ml/datascope` to do importance-based data debugging on a simple machine learning workflow.

In [1]:
# Essential imports.
import copy

import numpy as np
import pandas as pd
from datascope.importance.common import SklearnModelAccuracy
from datascope.importance.common import SklearnModelUtility
from datascope.importance.shapley import ShapleyImportance, ImportanceMethod
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder

## Data Loading and Preparation

In [2]:
# Download OpenML dataset 1590 (the census-income table previewed below) as pandas objects.
data = fetch_openml(as_frame=True, data_id=1590)
# Display the combined frame as this cell's output.
data.frame

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
48838,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
48839,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
48840,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K


In [3]:
# Encode the categorical features and the target label as numbers.
# Feature scaling is intentionally not done here: the modelling pipeline
# defined later in this notebook has its own StandardScaler step.
categorical_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
numerical_cols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
X_frame, y_frame = data["data"].copy(), data["target"].copy()
for col in categorical_cols:
    # Assign the raw 1-D array so values are placed by position. Wrapping it in
    # a fresh pd.Series would make pandas align on index labels and silently
    # produce NaN wherever X_frame's index is not the default 0..n-1.
    X_frame[col] = OrdinalEncoder().fit_transform(X_frame[[col]]).ravel()
# LabelEncoder already returns a 1-D array, so no flattening is needed.
y_frame = pd.Series(LabelEncoder().fit_transform(y_frame))
# nan_to_num replaces any NaN left in the frame (e.g. the missing workclass /
# occupation entries visible in the preview above) with 0.0, so X is fully finite.
# NOTE(review): 0.0 is presumably also a valid ordinal code, which would make
# missing values indistinguishable from the first category — confirm this is intended.
X, y = np.nan_to_num(X_frame.to_numpy()), y_frame.to_numpy()

In [4]:
# Carve a small, reproducible train/test subset out of the full dataset.
SEED = 13
N_TRAIN, N_TEST = 1000, 200
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=N_TEST,
    train_size=N_TRAIN,
    random_state=SEED,
)

In [5]:
# Emulate a dirty dataset by corrupting training labels. A seeded boolean mask
# is drawn over the training examples (each entry is False or True with equal
# probability, so about half of them are selected) and the binary label is
# flipped wherever the mask is True. Features are copied unchanged.
X_train_dirty = copy.deepcopy(X_train)
y_train_dirty = copy.deepcopy(y_train)
dirty_idx = np.random.RandomState(seed=SEED).choice(a=[False, True], size=N_TRAIN)
y_train_dirty[dirty_idx] = 1 - y_train[dirty_idx]

## Run an ML Pipeline and Evaluate

In [6]:
# Baseline: train the full pipeline on the corrupted labels and measure its
# accuracy on the held-out test set (whose labels were not corrupted).
# LogisticRegression and accuracy_score are imported in the notebook's import cell.
model = LogisticRegression(solver="liblinear", max_iter=5000, random_state=666)
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
    ("model", model),
])

pipeline.fit(X_train_dirty, y_train_dirty)
y_pred = pipeline.predict(X_test)
accuracy_dirty = accuracy_score(y_test, y_pred)
print("Pipeline accuracy in the beginning:", accuracy_dirty)

Pipeline accuracy in the beginning: 0.565


## Compute Shapley-based Importance

In [7]:
# Compute an importance score for the training data with respect to test accuracy.
# SklearnModelAccuracy, ShapleyImportance and ImportanceMethod are imported in
# the notebook's import cell.
# `utility` wraps the pipeline defined above; `importance` is configured to use
# the NEIGHBOR importance method with that utility.
utility = SklearnModelAccuracy(pipeline)
importance = ShapleyImportance(method=ImportanceMethod.NEIGHBOR, utility=utility)
# Fit on the dirty training set, then score against the test set.
# NOTE(review): `importances` presumably holds one value per training example
# (the repair step below indexes the training labels with its argsort) — confirm.
importances = importance.fit(X_train_dirty, y_train_dirty).score(X_test, y_test)

## Fix the 10% of Data Examples with the Lowest Importance

In [8]:
# np.argsort orders ascending, so the leading indices belong to the training
# examples with the lowest importance scores; select the first 10% of them.
n_repair = int(0.1 * N_TRAIN)
target_idx = np.argsort(importances)[:n_repair]
# "Repairing" restores the original label of each selected example; the
# features are copied over unchanged.
X_train_repaired = np.copy(X_train_dirty)
y_train_repaired = np.copy(y_train_dirty)
y_train_repaired[target_idx] = y_train[target_idx]

## Run the Pipeline on Repaired Data and Evaluate

In [9]:
# Refit the same pipeline object on the partially repaired labels, then
# evaluate it on the test set again for comparison with the baseline.
y_pred = pipeline.fit(X_train_repaired, y_train_repaired).predict(X_test)
accuracy_repaired = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Pipeline accuracy after repairing:", accuracy_repaired)

Pipeline accuracy after repairing: 0.775
