# CleanLab
cleanlab automatically detects data and label issues in ML datasets.

In [78]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import joblib
import ast
import json

from cleanlab.datalab.datalab import Datalab
from cleanlab.classification import CleanLearning

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
## consider adding visualization of the data here

In [2]:
df = pd.read_csv("../data/cleanlab_demo3.csv")
df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_85,feature_86,feature_87,feature_88,feature_89,feature_90,feature_91,feature_92,feature_93,label
0,0.046525,0.070970,0.059846,0.303777,0.105798,0.247639,0.305501,0.332181,0.207858,0.111704,...,0.45,0.70,0.261911,0.680,1.00,3.05,0.030758,0.268511,14.25,2.0
1,0.045960,0.071916,0.060067,0.304967,0.107050,0.251919,0.309712,0.335264,0.207988,0.111795,...,0.65,0.65,0.262012,0.695,0.85,4.05,0.480713,0.201347,23.95,2.0
2,0.046250,0.070420,0.059739,0.289906,0.106706,0.239780,0.289128,0.318570,0.206722,0.111093,...,0.70,0.75,0.415425,0.645,0.95,4.10,0.578118,0.192824,24.90,2.0
3,0.043542,0.065530,0.055383,0.267246,0.101976,0.225200,0.270993,0.302373,0.197826,0.105371,...,0.80,0.80,0.348029,0.620,1.00,6.00,0.005655,0.066262,48.40,2.0
4,0.043664,0.066476,0.053536,0.289464,0.100282,0.236721,0.288930,0.322209,0.194232,0.101556,...,1.00,0.80,0.000000,0.620,1.00,5.65,0.141905,0.092988,45.85,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194231,0.052728,0.084001,0.074922,0.262089,0.132914,0.227123,0.256580,0.277058,0.250790,0.172869,...,2.00,1.15,0.716885,0.505,1.95,0.65,0.867164,0.705000,0.95,2.0
194232,0.055650,0.088235,0.079782,0.263119,0.131327,0.229335,0.259770,0.279988,0.246128,0.169314,...,2.20,1.25,0.543431,0.475,2.25,0.70,0.828825,0.680000,1.00,0.0
194233,0.053246,0.082940,0.076005,0.263264,0.121241,0.229419,0.263981,0.286976,0.234096,0.155711,...,1.70,1.20,0.700861,0.480,2.00,0.95,0.565562,0.637941,2.15,2.0
194234,0.049645,0.077111,0.069337,0.265660,0.114656,0.231212,0.267659,0.292912,0.225048,0.144999,...,1.35,1.05,0.800739,0.545,1.75,1.05,0.702127,0.597941,2.35,2.0


## Find common issues in data
Use cleanlab to detect issues in the dataset (label errors, outliers, near duplicates). An experiment is performed with a Logistic Regression model as a gut check. Then the full assessment is performed with the current plantations model.

### Test with general classifier (gut check)

In [10]:
# generate probabilities
feats = df.loc[:, df.columns!='label']
model = LogisticRegression()
pred_probs = cross_val_predict(estimator=model, 
                               X=feats, 
                               y=df.label, 
                               cv=5, 
                               method="predict_proba")

lab = Datalab(data=df, label_name="label")
lab.find_issues(features=feats, pred_probs=pred_probs)
lab.report()

Finding null issues ...
Finding label issues ...
Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...





Audit complete. 76728 issues found in the dataset.
Dataset Information: num_examples: 194236, num_classes: 4

Here is a summary of various issues found in your data:

           issue_type  num_issues
                label       47909
       near_duplicate       24670
              outlier        4070
underperforming_group          78
              non_iid           1

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 47909
Overall dataset quality in terms of this issue: 0.7866

Examples representing mos

### Test with trained Catboost Classifier
Trained catboost classifier is loaded and features are filtered to the top 40 feats used in training.

In [16]:
cat = joblib.load("../models/model.joblib")
top_feats = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 18, 20, 22, 24, 25, 32, 34, 42, 50, 52, 60, 64, 65, 70, 71, 73, 74, 75, 76, 77, 81, 85, 89, 90, 93]
feat_cols = ['feature_' + str(i) for i in top_feats]
feats = df.loc[:, df.columns.isin(feat_cols)]

In [17]:
# generate probabilities
pred_probs = cross_val_predict(estimator=cat, 
                               X=feats, 
                               y=df.label, 
                               cv=5, 
                               method="predict_proba")

lab = Datalab(data=df, label_name="label")
lab.find_issues(features=feats, pred_probs=pred_probs)
lab.report()

Finding null issues ...
Finding label issues ...
Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...





Audit complete. 81510 issues found in the dataset.
Dataset Information: num_examples: 194236, num_classes: 4

Here is a summary of various issues found in your data:

    issue_type  num_issues
         label       35324
near_duplicate       33413
       outlier       12772
       non_iid           1

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 35324
Overall dataset quality in terms of this issue: 0.8548

Examples representing most severe instances of this issue:
        is_label_issue  label_score

## Analysis of Results
Notes
- A numeric quality score (between 0 and 1) estimating how severe this issue is exhibited in each example from a dataset. Examples with higher scores are less likely to suffer from this issue.

In [19]:
n_samples = 991
plot_ids = np.repeat([f'plot_{i+1}' for i in range(n_samples)], 196)
df['plot_id'] = plot_ids
df.head()

### Label issues

In [43]:
label_assessment = lab.get_issues("label").sort_values("label_score")
dup_assessment = lab.get_issues("near_duplicate").sort_values("near_duplicate_score")
outlier_assessment = lab.get_issues("outlier").sort_values("outlier_score")
non_iid_assessment = lab.get_issues("non_iid").sort_values("non_iid_score")
for var_name in ["label_assessment", "dup_assessment", "outlier_assessment", "non_iid_assessment"]:
    globals()[var_name].to_csv(f"../data/cleanlab/{var_name}.csv", index=True)

In [50]:
label_issues = label_assessment[label_assessment.is_label_issue == True]
dup_issues = dup_assessment[dup_assessment.is_near_duplicate_issue == True]
outlier_issues = outlier_assessment[outlier_assessment.is_outlier_issue == True]
non_iid_issues = non_iid_assessment[non_iid_assessment.is_non_iid_issue == True]

In [26]:
# identify plots for the top label issues in the report
print(df.iloc[7079][['plot_id']])
print(df.iloc[128128][['plot_id']])
print(df.iloc[7080][['plot_id']])
print(df.iloc[7121][['plot_id']])
print(df.iloc[128161][['plot_id']])

plot_id    plot_37
Name: 7079, dtype: object
plot_id    plot_654
Name: 128128, dtype: object
plot_id    plot_37
Name: 7080, dtype: object
plot_id    plot_37
Name: 7121, dtype: object
plot_id    plot_654
Name: 128161, dtype: object


In [52]:
# map the plot id from the original df to label issues
label_issues['plot_id'] = label_issues.index.map(df.set_index(df.index)['plot_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_issues['plot_id'] = label_issues.index.map(df.set_index(df.index)['plot_id'])


In [61]:
counts = label_issues.plot_id.value_counts()
full_plots = counts[counts == 196]
partial_plots = counts[(counts >= 98) & (counts < 196)]
len(full_plots), len(partial_plots)

(26, 117)

In [57]:
full_plots

plot_id
plot_676    196
plot_697    196
plot_759    196
plot_709    196
plot_696    196
plot_757    196
plot_695    196
plot_731    196
plot_766    196
plot_739    196
plot_736    196
plot_734    196
plot_753    196
plot_698    196
plot_671    196
plot_310    196
plot_816    196
plot_738    196
plot_725    196
plot_699    196
plot_135    196
plot_755    196
plot_733    196
plot_678    196
plot_713    196
plot_708    196
Name: count, dtype: int64

In [62]:
partial_plots

plot_id
plot_675    195
plot_751    195
plot_720    195
plot_741    195
plot_703    195
           ... 
plot_146    105
plot_125    105
plot_346    104
plot_367    100
plot_152     98
Name: count, Length: 117, dtype: int64

### Duplication Issues

In [64]:
dup_issues['plot_id'] = dup_issues.index.map(df.set_index(df.index)['plot_id'])
dup_issues.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_issues['plot_id'] = dup_issues.index.map(df.set_index(df.index)['plot_id'])


Unnamed: 0,is_near_duplicate_issue,near_duplicate_score,near_duplicate_sets,distance_to_nearest_neighbor,plot_id
60532,True,0.000152,"[60531, 60533, 60523, 60510, 60537, 60545, 605...",2.1213e-07,plot_309
60531,True,0.000152,"[60532, 60510, 60523, 60505, 60533, 60536, 605...",2.1213e-07,plot_309
60522,True,0.000236,"[60536, 60505, 60537, 60491, 60523, 60519, 605...",3.297314e-07,plot_309
60536,True,0.000236,"[60522, 60537, 60491, 60523, 60550, 60519, 605...",3.297314e-07,plot_309
60537,True,0.000279,"[60523, 60536, 60491, 60533, 60519, 60522, 605...",3.896179e-07,plot_309


In [77]:
# create intersection of label and duplicate issues, count plots
dup_label_comb = pd.merge(dup_issues, label_issues, left_index=True, right_index=True, how='inner')
counts_dl = dup_label_comb.plot_id_x.value_counts()
partial_plots_dl = counts_dl[(counts >= 150) & (counts < 196)]
len(partial_plots_dl)

39

Conclusions
- There are no full plots (196 pixels) flagged with label AND duplicate issues. There are 76 plots with between 98 - 196 pixels flagged. There are 39 plots with 150-196 pixels flagged.

### Outlier Issues

In [27]:
# identify plots for the top outliers in the report
print(df.iloc[127289][['plot_id']])
print(df.iloc[26536][['plot_id']])
print(df.iloc[26537][['plot_id']])
print(df.iloc[11968][['plot_id']])
print(df.iloc[127288][['plot_id']])

plot_id    plot_650
Name: 127289, dtype: object
plot_id    plot_136
Name: 26536, dtype: object
plot_id    plot_136
Name: 26537, dtype: object
plot_id    plot_62
Name: 11968, dtype: object
plot_id    plot_650
Name: 127288, dtype: object


In [None]:
## Figure out appropriate mapping to CEO survey

In [80]:
# plot ids are dropped if the labels are unknown
with open("../data/cleanlab/plot_ids.json", "r") as file:
    int_list = json.load(file)

# Verify the list was loaded correctly
print(int_list)

In [None]:
df = pd.read_csv("../data/ceo-plantations-train-v08"