In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split


# Load the dataset
#url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
url = 'adult.data'
df = pd.read_csv(url, header=None)


# Replace missing values with NaN
df = df.replace(' ?', np.nan)

# Drop rows with missing values
df = df.dropna()

# Convert categorical features into numerical features
df[1] = pd.factorize(df[1])[0]
df[3] = pd.factorize(df[3])[0]
df[5] = pd.factorize(df[5])[0]
df[6] = pd.factorize(df[6])[0]
df[7] = pd.factorize(df[7])[0]
df[8] = pd.factorize(df[8])[0]
df[9] = pd.factorize(df[9])[0]
df[13] = pd.factorize(df[13])[0]

# Split the data into training and testing sets
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Decision Tree classifier
#dt = DecisionTreeClassifier(criterion="entropy", max_depth=3)
#dt = DecisionTreeClassifier()
#dt.fit(X_train, y_train)

# Train the Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

# Train the Support Vector Machine Classifier
svc = SVC(kernel='rbf', gamma='auto', random_state=42)
svc.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_rfc = rfc.predict(X_test)
#y_pred_dt = dt.predict(X_test)
y_pred_svc = svc.predict(X_test)

# Calculate accuracy score
accuracy_rfc = accuracy_score(y_test, y_pred_rfc)
#accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_svc = accuracy_score(y_test, y_pred_svc)

# Calculate confusion matrix
cm_rfc = confusion_matrix(y_test, y_pred_rfc)
#cm_dt = confusion_matrix(y_test, y_pred_dt)
cm_svc = confusion_matrix(y_test, y_pred_svc)

# Generate classification report
cr_rfc = classification_report(y_test, y_pred_rfc)
#cr_dt = classification_report(y_test, y_pred_dt)
cr_svc = classification_report(y_test, y_pred_svc)

# Print results
print("Random Forest Classifier:")
print("Accuracy score:", accuracy_rfc)
print("Confusion matrix:\n", cm_rfc)
print("Classification report:\n", cr_rfc)

#print("Decision Tree Classifier:")
#print("Accuracy score:", accuracy_dt)
#print("Confusion matrix:\n", cm_dt)
#print("Classification report:\n", cr_dt)

print("Support Vector Machine Classifier:")
print("Accuracy score:", accuracy_svc)
print("Confusion matrix:\n", cm_svc)
print("Classification report:\n", cr_svc)





Random Forest Classifier:
Accuracy score: 0.8547905845949829
Confusion matrix:
 [[6286  481]
 [ 833 1449]]
Classification report:
               precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91      6767
        >50K       0.75      0.63      0.69      2282

    accuracy                           0.85      9049
   macro avg       0.82      0.78      0.80      9049
weighted avg       0.85      0.85      0.85      9049

Support Vector Machine Classifier:
Accuracy score: 0.747375400596751
Confusion matrix:
 [[6762    5]
 [2281    1]]
Classification report:
               precision    recall  f1-score   support

       <=50K       0.75      1.00      0.86      6767
        >50K       0.17      0.00      0.00      2282

    accuracy                           0.75      9049
   macro avg       0.46      0.50      0.43      9049
weighted avg       0.60      0.75      0.64      9049

