# Adult Income Dataset - Random Forest Classification

This notebook trains and evaluates a Random Forest classifier on the UCI Adult Income dataset, read from LIBSVM-format files (`a9a` for training, `a9a.t` for testing). It covers data loading, light preprocessing (sparse-to-dense conversion), hyperparameter tuning with 5-fold cross-validation, and test-set evaluation to predict whether an individual's income exceeds $50K/year.

In [None]:
# Import necessary libraries for data loading, model training, and evaluation
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Load the training split from its LIBSVM-format file
X_train, y_train = load_svmlight_file('../data/a9a')

# Load the test split with the SAME number of columns as the training matrix.
# load_svmlight_file infers the feature count from the highest feature index
# present in each file, so loading the two files independently can produce
# matrices of different widths (the a9a test split is commonly reported to come
# out one column short), which makes predict() on X_test fail.
X_test, y_test = load_svmlight_file('../data/a9a.t', n_features=X_train.shape[1])

# Convert the sparse matrices returned by the loader to dense arrays
X_train = X_train.toarray()
X_test = X_test.toarray()

In [None]:
# Candidate hyperparameter values explored for the Random Forest
param_grid = dict(
    n_estimators=[50, 100],   # number of trees in the forest
    max_depth=[5, 10, None],  # maximum tree depth (None = no explicit limit)
)

In [None]:
# Tune the Random Forest over every combination in param_grid, scoring each
# candidate with 5-fold cross-validation on the training data.
# The fixed random_state keeps forest construction repeatable across runs.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=base_forest, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination selected by the grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Take the best estimator found by the grid search and use it to predict
# labels for the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X=X_test)

In [None]:
# Compare the predictions with the true test labels and report the accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Set Accuracy:", accuracy)