Data Source: https://archive.ics.uci.edu/dataset/2/adult

In [1]:
# Importing necessary libraries for data analysis
import pandas as pd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# reading the data file
inc = pd.read_fwf("adult.data", header=None,names=['Age', 'Other data'])
print(inc.shape)
inc.head()

(32561, 2)


Unnamed: 0,Age,Other data
0,39,"State-gov, 77516, Bachelors, 13, Never-married..."
1,50,"Self-emp-not-inc, 83311, Bachelors, 13, Marrie..."
2,38,"Private, 215646, HS-grad, 9, Divorced, Handler..."
3,53,"Private, 234721, 11th, 7, Married-civ-spouse, ..."
4,28,"Private, 338409, Bachelors, 13, Married-civ-sp..."


### Cleaning Dataset

In [3]:
# Split the 'Other data' column on commas
new_cols = inc['Other data'].str.split(',', expand=True)

# Remove leading/trailing whitespace from each new column
new_cols = new_cols.apply(lambda x: x.str.strip())

# Rename the new columns
new_cols.columns = ['Workclass', 'FNLWGT', 'Education', 'Education-Num',
                    'Marital-Status', 'Occupation', 'Relationship', 'Race',
                    'Sex', 'Capital-Gain', 'Capital-Loss', 'Hours-per-week',
                    'Native-Country', 'Income'][:new_cols.shape[1]]  # slice in case fewer columns

# Combine with 'Age' column
inc_cleaned = pd.concat([inc['Age'], new_cols], axis=1)
# Remove commas and convert to integer
inc_cleaned['Age'] = inc_cleaned['Age'].str.replace(',', '')
inc_cleaned.head(3)

Unnamed: 0,Age,Workclass,FNLWGT,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Native-Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [4]:
# Checking for missing values
inc_cleaned.isnull().sum()

Age                 0
Workclass           0
FNLWGT              0
Education           0
Education-Num       0
Marital-Status      0
Occupation          0
Relationship        0
Race                0
Sex                 0
Capital-Gain        0
Capital-Loss        0
Hours-per-week      0
Native-Country      1
Income            121
dtype: int64

In [5]:
inc_cleaned.dropna(inplace = True)
inc_cleaned.shape

(32440, 15)

### Feature Engineering

In [6]:
# Convert numeric columns to int type
my_list = ["Age", "Education-Num", "Capital-Gain", 'Capital-Loss', 'Hours-per-week']

for col in my_list:
    inc_cleaned[col] = inc_cleaned[col].astype(int)
type(inc_cleaned.iloc[1,0])

numpy.int64

In [7]:
# brief look at the dataset so far
inc_cleaned.head(3)

Unnamed: 0,Age,Workclass,FNLWGT,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Native-Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [8]:
# Label encoding encoding
from sklearn.preprocessing import OrdinalEncoder

# Exclude 'Income' from encoding
cat_cols = inc_cleaned.select_dtypes(include='object').columns.drop('Income')

# Copy to avoid modifying original
inc_lencoded = inc_cleaned.copy()

# Apply encoding only to categorical features (excluding Income)
encoder = OrdinalEncoder()
inc_lencoded[cat_cols] = encoder.fit_transform(inc_lencoded[cat_cols])

In [9]:
# Inspecting target feature
inc_lencoded['Income'].value_counts()

Income
<=50K    24179
>50K      7616
<=50       146
<=5        119
            92
<=          72
>50         70
<           59
>           45
>5          42
Name: count, dtype: int64

In [10]:
# Keep only rows where 'Income' is exactly '>50K' or '<=50K'
inc_lencoded = inc_lencoded[inc_lencoded['Income'].isin(['>50K', '<=50K'])]
inc_lencoded['Income'] = (inc_lencoded['Income'] == '>50K').astype(int)

inc_lencoded.shape

(31795, 15)

### Model Building

In [11]:
# Import necessary libraries for ML
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Prepare predictors (X) and target (y)
X = inc_lencoded.drop('Income', axis=1)
y = inc_lencoded['Income']
X_np = X.values
y_np = y.values

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state=42)

#### Manually coded random forest

In [13]:
from sklearn.utils import resample
from collections import Counter
'''
BOOTSTRAPPING
'''
# Generate a bootstrap sample from the dataset.
def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    indices = np.random.choice(n_samples, size=n_samples, replace=True)
    return X[indices], y[indices]

'''
BUILDING A DECISION TREE
'''

# Function to compute the Gini impurity
def gini_impurity(arr):
    classes, counts = np.unique(arr, return_counts=True)
    probabilities = counts / counts.sum()
    gi = 1 - np.sum(probabilities ** 2)
    return gi

# Function to compute the weighted average Gini impurity
def weighted_avg_gi(y_left, y_right, n):
    gini_left = gini_impurity(y_left)
    gini_right = gini_impurity(y_right)

    n_left = len(y_left)
    n_right = len(y_right)
    weighted_gini = (n_left / n) * gini_left + (n_right / n) * gini_right
    
    return weighted_gini

# Find the best split point for a continuous feature column X_col with respect to the label vector y, 
# using the Gini impurity criterion.

def best_gini_split(X_col, y):
    # Sort feature values and corresponding labels
    sorted_idx = np.argsort(X_col)
    X_sorted = X_col[sorted_idx]
    y_sorted = y[sorted_idx]
    
    # Possible split points are midpoints between unique values
    unique_vals = np.unique(X_sorted)
    if len(unique_vals) == 1:
        return None, None  # No split possible

    thresholds = (unique_vals[:-1] + unique_vals[1:]) / 2

    best_gini = float('inf')
    best_threshold = None

    for threshold in thresholds:
        left_mask = X_sorted <= threshold
        right_mask = X_sorted > threshold

        y_left = y_sorted[left_mask]
        y_right = y_sorted[right_mask]

        if len(y_left) == 0 or len(y_right) == 0:
            continue  # Skip invalid splits

        gini_left = gini_impurity(y_left)
        gini_right = gini_impurity(y_right)

        # Weighted average Gini impurity
        n = len(y_sorted)
        weighted_gini = weighted_avg_gi(y_left, y_right, n)

        if weighted_gini < best_gini:
            best_gini = weighted_gini
            best_threshold = threshold

    return best_threshold, best_gini

def build_decision_tree(X, y, max_depth=None, min_samples_leaf=1, depth=0, max_features=None):
    n_samples, n_features = X.shape
    
    current_gini = gini_impurity(y)
    majority_class = np.bincount(y).argmax()

    # Stopping conditions
    if (max_depth is not None and depth >= max_depth) or \
       (n_samples <= min_samples_leaf) or (current_gini == 0):
        return {'type': 'leaf', 'class': majority_class}

    # Determine number of features to consider at this split
    if max_features is None:
        features_to_consider = np.arange(n_features)
    elif max_features == 'sqrt':
        k = max(1, int(np.sqrt(n_features)))
        features_to_consider = np.random.choice(n_features, size=k, replace=False)
    elif isinstance(max_features, int):
        k = max(1, min(max_features, n_features))
        features_to_consider = np.random.choice(n_features, size=k, replace=False)
    else:
        raise ValueError("max_features must be None, 'sqrt', or an int")

    # Find best split among chosen features
    best_feature = None
    best_threshold = None
    best_gini = current_gini
    for feature in features_to_consider:
        X_col = X[:, feature]
        threshold, split_gini = best_gini_split(X_col, y)

        if threshold is not None and split_gini < best_gini:
            best_gini = split_gini
            best_feature = feature
            best_threshold = threshold

    if best_feature is None or best_gini >= current_gini:
        return {'type': 'leaf', 'class': majority_class}

    # Recursive splitting
    left_mask = X[:, best_feature] <= best_threshold
    right_mask = ~left_mask

    left_subtree = build_decision_tree(
        X[left_mask], y[left_mask],
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        depth=depth + 1,
        max_features=max_features
    )
    right_subtree = build_decision_tree(
        X[right_mask], y[right_mask],
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        depth=depth + 1,
        max_features=max_features
    )

    return {
        'type': 'node',
        'feature': best_feature,
        'threshold': best_threshold,
        'left': left_subtree,
        'right': right_subtree
    }

# Predict class labels for all samples in X using the decision tree.
def predict_tree(tree, x):
    while tree['type'] != 'leaf':
        feature = tree['feature']
        threshold = tree['threshold']
        if x[feature] <= threshold:
            tree = tree['left']
        else:
            tree = tree['right']
    return tree['class']

In [14]:
'''
BUILDING RANDOM FOREST
'''
# Builds a random forest using n_trees individual decision trees.
def build_random_forest(X, y, n_trees=10, max_depth=None, max_features='sqrt'):
    forest = []
    for _ in range(n_trees):
        X_sample, y_sample = bootstrap_sample(X, y)
        tree = build_decision_tree(X_sample, y_sample, max_depth=max_depth, max_features=max_features)
        forest.append(tree)
    return forest


In [15]:
manual_rf = build_random_forest(X_train, y_train, n_trees=10, max_depth= None, max_features='sqrt')

#### Random forest coded using sklearn

In [30]:
# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=10, bootstrap = True, max_features = 'sqrt', random_state=42)

# Fit the model
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, max_features='sqrt', n_estimators=10,
                       random_state=42)

### Predictions

#### Manually coded random forest predictions

In [17]:
#     Predicts using all trees in the forest and returns the majority vote.
def predict_forest(trees, X):
    # Get predictions from each tree
    tree_preds = np.array([[predict_tree(tree, x) for tree in trees] for x in X])
    
    # Majority vote across rows
    y_pred = []
    for row in tree_preds:
        majority_vote = Counter(row).most_common(1)[0][0]
        y_pred.append(majority_vote)
    
    return np.array(y_pred)



In [18]:
y_pred_manual = predict_forest(manual_rf, X_test)

#### Random forest  predictions using sklearn

In [31]:
# Make predictions
y_pred = rf.predict(X_test)

### Model Evaluations

In [35]:
Counter(y_test)

Counter({0: 4835, 1: 1524})

#### Evaluating manually coded random forest predictions

In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy = accuracy_score(y_test, y_pred_manual)
print(f"Accuracy: {accuracy:.2f}")

print(confusion_matrix(y_test, y_pred_manual))
print(classification_report(y_test, y_pred_manual))

Accuracy: 0.85
[[4492  343]
 [ 596  928]]
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      4835
           1       0.73      0.61      0.66      1524

    accuracy                           0.85      6359
   macro avg       0.81      0.77      0.78      6359
weighted avg       0.85      0.85      0.85      6359



#### Evaluating sklearn random forest predictions

In [32]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.85
[[4617  218]
 [ 706  818]]
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      4835
           1       0.79      0.54      0.64      1524

    accuracy                           0.85      6359
   macro avg       0.83      0.75      0.77      6359
weighted avg       0.85      0.85      0.84      6359

