### Adults data set

In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the Adult dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
adult = pd.read_csv(url, header=None, index_col=False,
                    names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                           'hours-per-week', 'native-country', 'income'])

adult.head(100)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,Local-gov,115585,Some-college,10,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States,<=50K
96,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,>50K
97,37,Private,202683,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,>50K
98,48,Private,171095,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,England,<=50K


In [11]:
# Convert categorical variables to numeric
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                       'native-country', 'income']

for column in categorical_columns:
    adult[column] = adult[column].astype('category').cat.codes

In [12]:
adult.describe()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,3.868892,189778.4,10.29821,10.080679,2.611836,6.57274,1.446362,3.665858,0.669205,1077.648844,87.30383,40.437456,36.718866,0.24081
std,13.640433,1.45596,105550.0,3.870264,2.57272,1.506222,4.228857,1.606771,0.848806,0.470506,7385.292085,402.960219,12.347429,7.823782,0.427581
min,17.0,0.0,12285.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,28.0,4.0,117827.0,9.0,9.0,2.0,3.0,0.0,4.0,0.0,0.0,0.0,40.0,39.0,0.0
50%,37.0,4.0,178356.0,11.0,10.0,2.0,7.0,1.0,4.0,1.0,0.0,0.0,40.0,39.0,0.0
75%,48.0,4.0,237051.0,12.0,12.0,4.0,10.0,3.0,4.0,1.0,0.0,0.0,45.0,39.0,0.0
max,90.0,8.0,1484705.0,15.0,16.0,6.0,14.0,5.0,4.0,1.0,99999.0,4356.0,99.0,41.0,1.0


In [13]:
adult.head(100)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,2,115585,15,10,4,6,1,4,1,0,0,50,39,0
96,48,6,191277,10,16,2,10,0,4,1,0,1902,60,39,1
97,37,4,202683,15,10,2,12,0,4,1,0,0,48,39,1
98,48,4,171095,7,12,0,4,4,4,0,0,0,40,9,0


In [39]:
# Choose two features: age and education level (education-num)
X1 = adult[['capital-gain']]
X2 = adult[['capital-gain', 'capital-loss']]
X3 = adult[['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                           'hours-per-week', 'native-country']]
y = adult['income']

# Split the data into training and test sets
X1_train, X1_test, y_train1, y_test1 = train_test_split(X1, y, test_size=0.2, random_state=42)
X2_train, X2_test, y_train2, y_test2 = train_test_split(X2, y, test_size=0.2, random_state=42)
X3_train, X3_test, y_train3, y_test3 = train_test_split(X3, y, test_size=0.2, random_state=42)

# Create two logistic regression models
model1 = LogisticRegression(random_state=42)
model2 = LogisticRegression(random_state=42)
model3 = LogisticRegression(random_state=42)

# Train the models
model1.fit(X1_train, y_train1)
model2.fit(X2_train, y_train2)
model3.fit(X3_train, y_train3)

# Make predictions on the test sets
y_pred1 = model1.predict(X1_test)
y_pred2 = model2.predict(X2_test)
y_pred3 = model3.predict(X3_test)

# Calculate the accuracy of the models
accuracy1 = accuracy_score(y_test1, y_pred1)
accuracy2 = accuracy_score(y_test2, y_pred2)
accuracy3 = accuracy_score(y_test3, y_pred3)

print("Model 1 accuracy:", accuracy1)
print("Model 2 accuracy:", accuracy2)
print("Model 3 accuracy:", accuracy3)

Model 1 accuracy: 0.8000921234454169
Model 2 accuracy: 0.8136035621065562
Model 3 accuracy: 0.7894979272224781
