In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Adult dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
data = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values="?")

# Ethical consideration: Assess potential biases in the dataset
print(data['race'].value_counts(normalize=True))
print(data['sex'].value_counts(normalize=True))



race
White                 0.854274
Black                 0.095943
Asian-Pac-Islander    0.031909
Amer-Indian-Eskimo    0.009551
Other                 0.008323
Name: proportion, dtype: float64
sex
Male      0.669205
Female    0.330795
Name: proportion, dtype: float64


In [5]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Using cached imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Using cached imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.ensemble import BalancedRandomForestClassifier

# Load the Adult dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
data = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values="?")

# Ethical consideration: Assess potential biases in the dataset
print(data['race'].value_counts(normalize=True))
print(data['sex'].value_counts(normalize=True))

# Data preprocessing
data = data.dropna()  # Ethical consideration: Document the impact of dropping missing values

# Feature engineering
data['income'] = (data['income'] == '>50K').astype(int)

# Split the data into features (X) and target (y)
X = data.drop('income', axis=1)
y = data['income']

# One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=[
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country"
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize and train the Balanced Random Forest model
model = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Balanced Random Forest Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['<=50K', '>50K'], yticklabels=['<=50K', '>50K'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix - Balanced Random Forest')
plt.show()


race
White                 0.854274
Black                 0.095943
Asian-Pac-Islander    0.031909
Amer-Indian-Eskimo    0.009551
Other                 0.008323
Name: proportion, dtype: float64
sex
Male      0.669205
Female    0.330795
Name: proportion, dtype: float64


  warn(
  warn(
  warn(


Balanced Random Forest Accuracy: 0.8129

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.80      0.86      4503
           1       0.59      0.85      0.70      1530

    accuracy                           0.81      6033
   macro avg       0.77      0.82      0.78      6033
weighted avg       0.85      0.81      0.82      6033



ValueError: object __array__ method not producing an array

ValueError: object __array__ method not producing an array

<Figure size 640x480 with 2 Axes>