# AI Engineer Roadmap
## AI Jobs
### https://zazencodes.com/courses/ai-engineer-roadmap#ai-jobs

In [25]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import Reweighing

# Step 2: Create synthetic dataset with loan application features
np.random.seed(42)  # Setting seed for reproducibility
data = pd.DataFrame({
    'income': np.random.normal(50000, 20000, 1000),
    'credit_score': np.random.normal(700, 50, 1000),
    'age': np.random.randint(18, 80, 1000),
    'gender': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] * 100, # Female=0, Male=1
    'approved': [1, 0, 0, 0, 0, 1, 1, 1, 0, 0] * 100, # Approved=1, Not approved=0
})

# Calculate approval rates by gender
approval_by_gender = data.groupby('gender')['approved'].mean()
print("\nApproval Rates by Gender:")
print(f"Female (0): {approval_by_gender[0]:.2f}")
print(f"Male (1): {approval_by_gender[1]:.2f}")
print(f"Difference: {abs(approval_by_gender[1] - approval_by_gender[0]):.2f}")

# Step 3: Convert to AIF360 dataset format for bias analysis
protected_attribute = 'gender'
label = 'approved'
features = ['income', 'credit_score', 'age']

binary_data = BinaryLabelDataset(
    df=data,
    label_names=[label],
    protected_attribute_names=[protected_attribute],
    favorable_label=1,
    unfavorable_label=0
)

# Step 4: Measure initial bias using available metrics
metrics = BinaryLabelDatasetMetric(binary_data,
                                 unprivileged_groups=[{'gender': 0}],
                                 privileged_groups=[{'gender': 1}])

# Initial Bias Metrics
initial_disparate_impact = metrics.disparate_impact()
initial_statistical_parity_difference = metrics.statistical_parity_difference()

# Step 5: Train model on original data to establish baseline
X_orig = data[features].values
y_orig = data[label].values

X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X_orig, y_orig, test_size=0.2, random_state=42
)

model_original = RandomForestClassifier(random_state=42)
model_original.fit(X_train_orig, y_train_orig)

# Evaluate original model
y_pred_orig = model_original.predict(X_test_orig)
accuracy_orig = accuracy_score(y_test_orig, y_pred_orig)

# Create a new DataFrame for original model test data
test_indices_orig = np.arange(len(data))[-len(X_test_orig):]
test_data_orig = pd.DataFrame({
    'gender': data.iloc[test_indices_orig]['gender'].values,
    'actual': y_test_orig,
    'predicted': y_pred_orig
})

# Calculate approval rates by gender in original model
orig_approval_by_gender = test_data_orig.groupby('gender')['predicted'].mean()
print("\nOriginal Model Approval Rates by Gender:")
print(f"Female (0): {orig_approval_by_gender[0]:.4f}")
print(f"Male (1): {orig_approval_by_gender[1]:.4f}")
print(f"Difference: {abs(orig_approval_by_gender[1] - orig_approval_by_gender[0]):.4f}")

# Step 6: Apply reweighing to mitigate bias
rw = Reweighing(unprivileged_groups=[{'gender': 0}],
                privileged_groups=[{'gender': 1}])
transformed_data = rw.fit_transform(binary_data)

# Step 7: Measure bias after transformation
metrics_transformed = BinaryLabelDatasetMetric(transformed_data,
                                             unprivileged_groups=[{'gender': 0}],
                                             privileged_groups=[{'gender': 1}])

# Bias Metrics After Reweighing
final_disparate_impact = metrics_transformed.disparate_impact()
final_statistical_parity_difference = metrics_transformed.statistical_parity_difference()

# Step 8: Prepare data for model training with reweighting
X = transformed_data.features
y = transformed_data.labels.ravel()
sample_weights = transformed_data.instance_weights.ravel()

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, sample_weights, test_size=0.2, random_state=42
)

# Step 9: Train Random Forest classifier with instance weights
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train, sample_weight=w_train)

# Step 10: Evaluate model accuracy and fairness after reweighting
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Save indices for test set to match with original data
test_indices = np.arange(len(X))[len(X_train):]

# Create a DataFrame with predictions on test data to analyze fairness
test_data = pd.DataFrame({
    'gender': transformed_data.protected_attributes[test_indices].ravel(),
    'actual': y_test,
    'predicted': y_pred
})

# Calculate approval rates by gender for reweighted model
approval_by_gender_rw = test_data.groupby('gender')['predicted'].mean()
print("\nReweighted Model Approval Rates by Gender:")
print(f"Female (0): {approval_by_gender_rw[0]:.4f}")
print(f"Male (1): {approval_by_gender_rw[1]:.4f}")
print(f"Difference: {abs(approval_by_gender_rw[1] - approval_by_gender_rw[0]):.4f}")

# Step 12: Summary
print("\nSummary:")
print("+" + "-"*50 + "+" + "-"*12 + "+")
print("|{:^50}|{:^12}|".format("Metric", "Value"))
print("+" + "-"*50 + "+" + "-"*12 + "+")
print("|{:<50}|{:>11.4f}|".format("Model Accuracy (original)", accuracy_orig))
print("|{:<50}|{:>11.4f}|".format("Model Accuracy (re-weighted)", accuracy))
print("|{:<50}|{:>11.4f}|".format("Disparate Impact (original)", initial_disparate_impact))
print("|{:<50}|{:>11.4f}|".format("Disparate Impact (re-weighted)", final_disparate_impact))
print("|{:<50}|{:>11.4f}|".format("Statistical Parity Difference (original)", initial_statistical_parity_difference))
print("|{:<50}|{:>11.4f}|".format("Statistical Parity Difference (re-weighted)", final_statistical_parity_difference))
print("|{:<50}|{:>11.4f}|".format("Gender Approval Difference (original)", abs(orig_approval_by_gender[1] - orig_approval_by_gender[0])))
print("|{:<50}|{:>11.4f}|".format("Gender Approval Difference (re-weighted)", abs(approval_by_gender_rw[1] - approval_by_gender_rw[0])))
print("+" + "-"*50 + "+" + "-"*12 + "+")



Approval Rates by Gender:
Female (0): 0.20
Male (1): 0.60
Difference: 0.40

Original Model Approval Rates by Gender:
Female (0): 0.2500
Male (1): 0.3600
Difference: 0.1100

Reweighted Model Approval Rates by Gender:
Female (0): 0.3700
Male (1): 0.3900
Difference: 0.0200

Summary:
+--------------------------------------------------+------------+
|                      Metric                      |   Value    |
+--------------------------------------------------+------------+
|Model Accuracy (original)                         |     0.5250|
|Model Accuracy (re-weighted)                      |     0.5500|
|Disparate Impact (original)                       |     0.3333|
|Disparate Impact (re-weighted)                    |     1.0000|
|Statistical Parity Difference (original)          |    -0.4000|
|Statistical Parity Difference (re-weighted)       |     0.0000|
|Gender Approval Difference (original)             |     0.1100|
|Gender Approval Difference (re-weighted)          |     0.0200|


In [52]:
# Step 1: Import required libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import shap

np.random.seed(137)

# Step 2: Create synthetic patient dataset with medical features
data = pd.DataFrame({
    'age': np.random.normal(50, 15, 1000),              # Mean age 50, std dev 15
    'blood_pressure': np.random.normal(120, 20, 1000),  # Mean BP 120, std dev 20
    'glucose': np.random.normal(100, 25, 1000),         # Mean glucose 100, std dev 25
    'heart_rate': np.random.normal(75, 12, 1000),       # Mean HR 75, std dev 12
    'cholesterol': np.random.normal(200, 40, 1000)      # Mean cholesterol 200, std dev 40
})

# Step 3: Generate disease labels based on glucose and blood pressure thresholds
labels = (data['glucose'] > 126) & (data['blood_pressure'] > 140)  # Medical criteria
data['disease'] = labels.astype(int)  # Convert boolean to 0/1

# Step 4: Prepare features and target for model training
features = ['age', 'blood_pressure', 'glucose', 'heart_rate', 'cholesterol']
X = data[features]
y = data['disease']

# Step 5: Train a Random Forest classifier
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Step 6: Create SHAP explainer for the trained model
explainer = shap.TreeExplainer(model)

# Step 7: Get SHAP values for first patient's prediction
patient = X.iloc[[0]]  # Select first patient
patient_outcome = y.iloc[0]
shap_values = explainer.shap_values(patient)

# Step 8: Print feature importance values for the prediction
for idx, feature in enumerate(features):
    print(f"{feature}: {shap_values[0][idx][patient_outcome]:.3f}")


age: -0.002
blood_pressure: -0.042
glucose: 0.054
heart_rate: 0.005
cholesterol: 0.001


In [53]:
patient

Unnamed: 0,age,blood_pressure,glucose,heart_rate,cholesterol
0,66.361632,169.690375,74.759275,77.575502,229.881296


In [54]:
patient_outcome

0

In [55]:
shap_values

array([[[-0.00197203,  0.00197203],
        [-0.04155179,  0.04155179],
        [ 0.05378676, -0.05378676],
        [ 0.00458327, -0.00458327],
        [ 0.0007238 , -0.0007238 ]]])