In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [10]:
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
        "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
        "hours-per-week", "native-country", "income"]
data = pd.read_csv('/Users/swithana/git/icicle_model_card/tests/data/adult/adult.data', names=columns)

In [11]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [12]:
# Drop rows with missing values
data = data.dropna()

# Encode categorical features using LabelEncoder
label_encoders = {}
for column in ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Split features and target variable
X = data.drop('income', axis=1)
y = data['income']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
5514,33,2,198183,9,13,4,10,1,4,0,0,0,50,39
19777,36,4,86459,8,11,2,4,0,4,1,0,1887,50,39
10781,58,6,203039,6,5,5,3,1,4,1,0,0,40,39
32240,21,4,180190,8,11,2,5,0,4,1,0,0,46,39
9876,27,4,279872,15,10,0,8,1,4,1,0,0,40,39


In [13]:
# Initialize and train a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [14]:
# Make predictions on the test set
predictions = clf.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.86
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      4942
           1       0.75      0.64      0.69      1571

    accuracy                           0.86      6513
   macro avg       0.82      0.79      0.80      6513
weighted avg       0.86      0.86      0.86      6513



## Model Card Generation

In [16]:
from icicle_model_card.icicle_model_card import ModelCard, AIModel, BiasAnalysis, ExplainabilityAnalysis, validate_mc, Metric
import json

In [17]:
mc = ModelCard(
            name="UCI Adult Data Analysis via Random Forest",
            version="0.1",
            short_description="UCI Adult Data analysis using SKLearn and Random Forest",
            full_description="Using a Random Forest to train on UCI Adult Data Analysis",
            keywords="uci adult, sklearn, random_forest, explainability, fairness, fairlearn, shap",
            author="Sachith Withana"
        )

mc.input_data = 'https://archive.ics.uci.edu/dataset/2/adult'

In [23]:
model_metrics = []
model_metrics.append(Metric("Test loss", 0.7))
model_metrics.append(Metric("Test Accuracy", accuracy))

ai_model = AIModel(
            name="UCI Adult Random Forest model",
            version="0.1",
            description="Census classification problem using Random Forest",
            owner="Sachith Withana",
            location="github.com/swsachith/ai-model/random-forest",
            licence="BSD-3 Clause",
            model_structure = None
        )
ai_model.metrics = model_metrics
mc.ai_model = ai_model

In [24]:
mc.populate_bias(X_test, y_test, predictions, "gender", X_test['sex'], clf)

  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)


In [25]:
print(mc)

{
  "name": "UCI Adult Data Analysis",
  "version": "0.1",
  "short_description": "UCI Adult Data analysis using Tensorflow",
  "full_description": "Using a tensorflow trained neural network to analyse fairness and explainability in the UCI Adult Dataset",
  "keywords": "uci adult, tensorflow, explainability, fairness, fairlearn, shap",
  "author": "Sachith Withana",
  "input_data": "https://archive.ics.uci.edu/dataset/2/adult",
  "output_data": "",
  "ai_model": {
    "name": "UCI Adult tensorflow model",
    "version": "0.1",
    "description": "Census classification problem using Neural Network",
    "owner": "Sachith Withana",
    "location": "github.com/swsachith/ai-model",
    "licence": "BSD-3 Clause",
    "model_structure": null,
    "metrics": [
      {
        "key": "Test loss",
        "value": 0.5
      },
      {
        "key": "Test Accuracy",
        "value": 0.8616612927990174
      }
    ]
  },
  "bias_analysis": {
    "demographic_parity_diff": 0.18807341711946762,
 