In [1]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

#Setting seeds for reproducibility
import random
torch.manual_seed(42)
random.seed(0)
np.random.seed(0)

## Preprocessing

In [2]:
# Load Data
file_path = "/Users/zahrakhatti/Desktop/Research/codes/classification/1. Classification/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
           'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

df = pd.read_csv(file_path, header=None, skipinitialspace=True, names=columns)
data = df.copy()
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data['workclass']=data['workclass'].replace('?',np.nan)
data['occupation']=data['occupation'].replace('?',np.nan)
data['native-country']=data['native-country'].replace('?',np.nan)

In [4]:
data.dropna(how='any',inplace=True)
# data = data.drop_duplicates()

In [5]:
data = data.drop(['education-num', 'capital-gain', 'capital-loss', 'fnlwgt'], axis=1)
data.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [6]:

data['income'] = data['income'].map({'>50K': 1, '<=50K': 0})

# Turning categorical to numerical 
label_encoder = preprocessing.LabelEncoder()
data['workclass'] = label_encoder.fit_transform(data['workclass'])
data['education'] = label_encoder.fit_transform(data['education'])
data['marital-status'] = label_encoder.fit_transform(data['marital-status'])
data['occupation'] = label_encoder.fit_transform(data['occupation'])
data['relationship'] = label_encoder.fit_transform(data['relationship'])
data['race'] = label_encoder.fit_transform(data['race'])
data['sex'] = label_encoder.fit_transform(data['sex'])
data['native-country'] = label_encoder.fit_transform(data['native-country'])
data['income'] = label_encoder.fit_transform(data['income'])

data.head()




Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,5,9,4,0,1,4,1,40,38,0
1,50,4,9,2,3,0,4,1,13,38,0
2,38,2,11,0,5,1,4,1,40,38,0
3,53,2,1,2,5,0,2,1,40,38,0
4,28,2,9,2,9,5,2,0,40,4,0


In [7]:
# Hyperparameters
hidden_size = 100
num_epochs = 5
learning_rate = 0.001
batch_size = 50
num_classes = 1

# Defining x and y

y = data['income']
x_before = data.drop('income', axis=1)

# Scaling all of the features
scaler = StandardScaler()
x = scaler.fit_transform(x_before)
x = pd.DataFrame(x, columns=x_before.columns)
x

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,0.042796,2.936000,-0.349865,0.947847,-1.479055,-0.261249,0.385048,0.692806,-0.077734,0.264924
1,0.880288,1.887682,-0.349865,-0.387275,-0.734545,-0.885737,0.385048,0.692806,-2.331531,0.264924
2,-0.033340,-0.208955,0.174763,-1.722396,-0.238206,-0.261249,0.385048,0.692806,-0.077734,0.264924
3,1.108695,-0.208955,-2.448375,-0.387275,-0.238206,-0.885737,-2.011035,0.692806,-0.077734,0.264924
4,-0.794697,-0.208955,-0.349865,-0.387275,0.754473,2.236703,-2.011035,-1.443405,-0.077734,-5.304034
...,...,...,...,...,...,...,...,...,...,...
30157,-0.870832,-0.208955,-0.874492,-0.387275,1.498983,2.236703,0.385048,-1.443405,-0.244682,0.264924
30158,0.118931,-0.208955,0.174763,-0.387275,0.009964,-0.885737,0.385048,0.692806,-0.077734,0.264924
30159,1.489374,-0.208955,0.174763,2.282969,-1.479055,1.612215,0.385048,-1.443405,-0.077734,0.264924
30160,-1.251511,-0.208955,0.174763,0.947847,-1.479055,0.987727,0.385048,0.692806,-1.747213,0.264924


## Defining features and labels
## Splitting the data to train and test and transforming it to tensor

In [8]:
input_size = x.shape[1]
features = x.values.astype(float)

# Splitting the dataset into training and testing sets
features_train, features_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
train_tensor_x = torch.tensor(features_train).float()
train_tensor_y = torch.tensor(y_train.values.squeeze()).float() 
test_tensor_x = torch.tensor(features_test).float()
test_tensor_y = torch.tensor(y_test.values.squeeze()).float() 

train_dataset = torch.utils.data.TensorDataset(train_tensor_x, train_tensor_y)
test_dataset = torch.utils.data.TensorDataset(test_tensor_x, test_tensor_y)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
input_size

10

In [9]:
#iterator = iter(train_loader)
#x, y = next(iterator)
#print(x.shape, y.shape)

In [10]:
# Neural Network Model

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.w1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.w2 = nn.Linear(hidden_size, num_classes)
        self.tanh = nn.Tanh() 

    def forward(self, x):
        h = self.w1(x)
        sigma_h = self.relu(h)
        y_hat = self.w2(sigma_h)
        out = self.tanh(y_hat)  
        return out



# Model, Loss, and Optimizer
model = NeuralNet(input_size,hidden_size, num_classes)
criterion = nn.BCEWithLogitsLoss() 


optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#for p in model.parameters():
   # print(p)
s_mean = x['sex'].mean()
sex_column_index = x.columns.get_loc('sex')

# Training Loop
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):

        # Forward pass
        Y_hat = model(features)
        loss = criterion(Y_hat, labels.unsqueeze(1))
        s = features[:, sex_column_index]




        # Calculating covariance
        n = s.shape[0]
        covariance = torch.sum((s - s_mean) * (Y_hat)) / n
        loss += 1* torch.norm(covariance)**2



        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{n_total_steps}], Loss: {loss.item():.4f}')
            
    with torch.no_grad():
        correct = 0
        total = 0
        threshold = 0

        for inputs, labels in test_loader:
            outputs = model(inputs)
                
            # Applying threshold dynamically based on the specified threshold value
            predicted = torch.where(outputs >= threshold, torch.tensor(1), torch.tensor(0))
            total += labels.size(0)
            correct += (predicted == labels.unsqueeze(1).float()).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')
    print(torch.norm(covariance)**2)

Epoch [1/5], Step [100/483], Loss: 0.7520
Epoch [1/5], Step [200/483], Loss: 0.7014
Epoch [1/5], Step [300/483], Loss: 4.1565
Epoch [1/5], Step [400/483], Loss: 0.6974
Test Accuracy: 52.89%
tensor(1.1947e-05, grad_fn=<PowBackward0>)
Epoch [2/5], Step [100/483], Loss: 0.6890
Epoch [2/5], Step [200/483], Loss: 0.6833
Epoch [2/5], Step [300/483], Loss: 1.1802
Epoch [2/5], Step [400/483], Loss: 0.6922
Test Accuracy: 56.01%
tensor(0.0003, grad_fn=<PowBackward0>)
Epoch [3/5], Step [100/483], Loss: 0.6820
Epoch [3/5], Step [200/483], Loss: 0.6822
Epoch [3/5], Step [300/483], Loss: 0.8486
Epoch [3/5], Step [400/483], Loss: 0.6882
Test Accuracy: 60.63%
tensor(1.7668e-05, grad_fn=<PowBackward0>)
Epoch [4/5], Step [100/483], Loss: 0.6818
Epoch [4/5], Step [200/483], Loss: 0.6783
Epoch [4/5], Step [300/483], Loss: 1.1208
Epoch [4/5], Step [400/483], Loss: 0.6854
Test Accuracy: 62.74%
tensor(3.2231e-05, grad_fn=<PowBackward0>)
Epoch [5/5], Step [100/483], Loss: 0.6810
Epoch [5/5], Step [200/483], L

In [11]:
with torch.no_grad():
    correct = 0
    total = 0
    correct_male = 0
    total_male = 0
    correct_female = 0
    total_female = 0
    threshold = 0 

    for inputs, labels in test_loader:
        outputs = model(inputs)

        # Applying threshold dynamically based on the specified threshold value
        predicted = torch.where(outputs >= threshold, torch.tensor(1), torch.tensor(0))

        total += labels.size(0)
        correct += (predicted == labels.unsqueeze(1).float()).sum().item()

        sex = inputs[:, sex_column_index]

        # Calculate accuracy for male and female separately
        for i in range(len(sex)):
            if sex[i] >0 :  
                total_male += 1
                if predicted[i] == labels[i]:
                    correct_male += 1
            elif sex[i] <0:  
                total_female += 1
                if predicted[i] == labels[i]:
                    correct_female += 1

accuracy = 100 * correct / total
accuracy_male = 100 * correct_male / total_male if total_male > 0 else 0
accuracy_female = 100 * correct_female / total_female if total_female > 0 else 0

print(f'Test Accuracy: {accuracy:.2f}%')
print(f'Male Accuracy: {accuracy_male:.2f}%')
print(f'Female Accuracy: {accuracy_female:.2f}%')
print(total)
print(total_male + total_female)

Test Accuracy: 64.46%
Male Accuracy: 62.72%
Female Accuracy: 68.08%
6033
6033
