In [1]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
# Set seeds for reproducibility
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True  # If you are using GPU
torch.backends.cudnn.benchmark = False
import torchvision



## Preprocessing

In [2]:
# Load Data
file_path = "/Users/zahrakhatti/Desktop/Research/codes/classification/1. Classification/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
           'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

df = pd.read_csv(file_path, header=None, skipinitialspace=True, names=columns)
data = df.copy()
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data['workclass']=data['workclass'].replace('?',np.nan)
data['occupation']=data['occupation'].replace('?',np.nan)
data['native-country']=data['native-country'].replace('?',np.nan)

In [4]:
data.dropna(how='any',inplace=True)
data = data.drop_duplicates()

In [5]:
data = data.drop(['education-num', 'capital-gain', 'capital-loss', 'fnlwgt'], axis=1)

In [6]:

data['income'] = data['income'].map({'>50K': 1, '<=50K': 0})
# Turning categorical to numerical 
label_encoder = preprocessing.LabelEncoder()
data['workclass'] = label_encoder.fit_transform(data['workclass'])
data['education'] = label_encoder.fit_transform(data['education'])
data['marital-status'] = label_encoder.fit_transform(data['marital-status'])
data['occupation'] = label_encoder.fit_transform(data['occupation'])
data['relationship'] = label_encoder.fit_transform(data['relationship'])
data['race'] = label_encoder.fit_transform(data['race'])
data['sex'] = label_encoder.fit_transform(data['sex'])
data['native-country'] = label_encoder.fit_transform(data['native-country'])
data['income'] = label_encoder.fit_transform(data['income'])

data.head()




Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,5,9,4,0,1,4,1,40,38,0
1,50,4,9,2,3,0,4,1,13,38,0
2,38,2,11,0,5,1,4,1,40,38,0
3,53,2,1,2,5,0,2,1,40,38,0
4,28,2,9,2,9,5,2,0,40,4,0


In [7]:
# Hyperparameters
hidden_size = 100
num_epochs = 5
learning_rate = 0.001
batch_size = 50
num_classes = 1

# Defining x and y

y = data['income']
x_before = data.drop('income', axis=1)
scaler = StandardScaler()
x = scaler.fit_transform(x_before)
x = pd.DataFrame(x, columns=x_before.columns)
x


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,0.042516,2.935011,-0.349701,0.948085,-1.478987,-0.261297,0.385108,0.692725,-0.078031,0.264666
1,0.880215,1.887014,-0.349701,-0.386977,-0.734570,-0.885672,0.385108,0.692725,-2.332060,0.264666
2,-0.033639,-0.208980,0.174959,-1.722039,-0.238292,-0.261297,0.385108,0.692725,-0.078031,0.264666
3,1.108678,-0.208980,-2.448340,-0.386977,-0.238292,-0.885672,-2.010755,0.692725,-0.078031,0.264666
4,-0.795183,-0.208980,-0.349701,-0.386977,0.754264,2.236200,-2.010755,-1.443574,-0.078031,-5.306510
...,...,...,...,...,...,...,...,...,...,...
30134,-0.871338,-0.208980,-0.874360,-0.386977,1.498681,2.236200,0.385108,-1.443574,-0.244996,0.264666
30135,0.118670,-0.208980,0.174959,-0.386977,0.009847,-0.885672,0.385108,0.692725,-0.078031,0.264666
30136,1.489450,-0.208980,0.174959,2.283147,-1.478987,1.611826,0.385108,-1.443574,-0.078031,0.264666
30137,-1.252110,-0.208980,0.174959,0.948085,-1.478987,0.987452,0.385108,0.692725,-1.747682,0.264666


### getting the sex column from the 'data' dataset

In [8]:
sex_column_index = data.columns.get_loc('sex')
x.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,0.042516,2.935011,-0.349701,0.948085,-1.478987,-0.261297,0.385108,0.692725,-0.078031,0.264666
1,0.880215,1.887014,-0.349701,-0.386977,-0.73457,-0.885672,0.385108,0.692725,-2.33206,0.264666
2,-0.033639,-0.20898,0.174959,-1.722039,-0.238292,-0.261297,0.385108,0.692725,-0.078031,0.264666
3,1.108678,-0.20898,-2.44834,-0.386977,-0.238292,-0.885672,-2.010755,0.692725,-0.078031,0.264666
4,-0.795183,-0.20898,-0.349701,-0.386977,0.754264,2.2362,-2.010755,-1.443574,-0.078031,-5.30651


## Defining features and labels
## Splitting the data to train and test and transforming it to tensor

In [11]:
# Splitting the dataset into training and testing sets
features_train, features_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
features_train.astype(float)
features_test.astype(float)
features_train_drop = features_train.drop('sex', axis=1).values.astype(float)
features_test_drop = features_test.drop('sex', axis=1).values.astype(float)

# Convert data to PyTorch tensors
train_tensor_x = torch.tensor(features_train).float()
train_tensor_y = torch.tensor(y_train.values.squeeze()).float() 
test_tensor_x = torch.tensor(features_test).float()
test_tensor_y = torch.tensor(y_test.values.squeeze()).float() 

train_dataset = torch.utils.data.TensorDataset(train_tensor_x, train_tensor_y)
test_dataset = torch.utils.data.TensorDataset(test_tensor_x, test_tensor_y)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
input_size = features_train_drop.shape[1]
input_size

ValueError: could not determine the shape of object type 'DataFrame'

In [None]:
# Neural Network Model

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.w1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.w2 = nn.Linear(hidden_size, num_classes)
        self.tanh = nn.Tanh() 

    def forward(self, x):
        h = self.w1(x)
        sigma_h = self.relu(h)
        y_hat = self.w2(sigma_h)
        out = self.tanh(y_hat)  
        return out

# Model, Loss, and Optimizer
model = NeuralNet(input_size,hidden_size, num_classes)
criterion = nn.BCEWithLogitsLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Training Loop
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        # Remove the 'sex' column from the features tensor
        features_modified = torch.cat((features[:, :sex_column_index], features[:, sex_column_index+1:]), dim=1)
        
        # Forward pass
        Y_hat = model(features_modified)
        loss = criterion(Y_hat.squeeze(), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{n_total_steps}], Loss: {loss.item():.4f}')
            
    with torch.no_grad():
        correct = 0
        total = 0
        threshold = 0

        for inputs, labels in test_loader:
            # Remove the 'sex' column from the inputs tensor
            inputs_modified = torch.cat((inputs[:, :sex_column_index], inputs[:, sex_column_index+1:]), dim=1)
            outputs = model(inputs_modified)

            # Applying threshold dynamically based on the specified threshold value
            predicted = torch.where(outputs >= threshold, torch.tensor(1), torch.tensor(0))

            total += labels.size(0)
            correct += (predicted.squeeze() == labels.float()).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')


Epoch [1/5], Step [100/483], Loss: 0.5454
Epoch [1/5], Step [200/483], Loss: 0.4394
Epoch [1/5], Step [300/483], Loss: 0.6008
Epoch [1/5], Step [400/483], Loss: 0.5735
Test Accuracy: 79.20%
Epoch [2/5], Step [100/483], Loss: 0.5195
Epoch [2/5], Step [200/483], Loss: 0.4103
Epoch [2/5], Step [300/483], Loss: 0.5810
Epoch [2/5], Step [400/483], Loss: 0.5672
Test Accuracy: 79.89%
Epoch [3/5], Step [100/483], Loss: 0.5164
Epoch [3/5], Step [200/483], Loss: 0.4037
Epoch [3/5], Step [300/483], Loss: 0.5796
Epoch [3/5], Step [400/483], Loss: 0.5627
Test Accuracy: 79.91%
Epoch [4/5], Step [100/483], Loss: 0.5138
Epoch [4/5], Step [200/483], Loss: 0.3995
Epoch [4/5], Step [300/483], Loss: 0.5799
Epoch [4/5], Step [400/483], Loss: 0.5589
Test Accuracy: 79.91%
Epoch [5/5], Step [100/483], Loss: 0.5129
Epoch [5/5], Step [200/483], Loss: 0.3964
Epoch [5/5], Step [300/483], Loss: 0.5794
Epoch [5/5], Step [400/483], Loss: 0.5558
Test Accuracy: 79.98%


In [None]:
with torch.no_grad():
    correct = 0
    total = 0
    correct_male = 0
    total_male = 0
    correct_female = 0
    total_female = 0
    threshold = 0 

    for inputs, labels in test_loader:
        outputs = model(inputs)

        # Applying threshold dynamically based on the specified threshold value
        predicted = torch.where(outputs >= threshold, torch.tensor(1), torch.tensor(0))

        total += labels.size(0)
        correct += (predicted == labels.unsqueeze(1).float()).sum().item()

        sex = inputs[:, sex_column_index]

        # Calculate accuracy for male and female separately
        for i in range(len(sex)):
            if sex[i] >0 :  
                total_male += 1
                if predicted[i] == labels[i]:
                    correct_male += 1
            elif sex[i] <0:  
                total_female += 1
                if predicted[i] == labels[i]:
                    correct_female += 1

accuracy = 100 * correct / total
accuracy_male = 100 * correct_male / total_male if total_male > 0 else 0
accuracy_female = 100 * correct_female / total_female if total_female > 0 else 0

print(f'Test Accuracy: {accuracy:.2f}%')
print(f'Male Accuracy: {accuracy_male:.2f}%')
print(f'Female Accuracy: {accuracy_female:.2f}%')

Test Accuracy: 79.98%
Male Accuracy: 72.63%
Female Accuracy: 83.29%


In [None]:
sex_list = []
predictions_list = []
labels_list = []

with torch.no_grad():
    for i, (inputs, labels) in enumerate(test_loader):
        outputs = model(inputs)
        # Applying threshold dynamically based on the specified threshold value
        predicted = torch.where(outputs >= threshold, torch.tensor(1), torch.tensor(0))
        
        # Convert 'sex' column to PyTorch tensor
        sex_tensor = torch.tensor(features_test['sex'].values)
        sex_tensor = torch.where(sex_tensor >= threshold, torch.tensor(1), torch.tensor(0))
        sex_list.extend(sex_tensor[:len(predicted)].numpy())
        predictions_list.extend(predicted.flatten())
        labels_list.extend(labels.flatten())

# Converting lists to arrays
sex_column = np.array(sex_list)
predictions_column = np.array(predictions_list)
labels_column = np.array(labels_list)

# Creating a DataFrame
result = pd.DataFrame({
    'Sex': sex_column,
    'Predicted': predictions_column,
    'Actual': labels_column
})


# Calculating accuracy
total = labels_column.size
correct = np.sum(predictions_column == labels_column)

# Calculating overall accuracy
accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')

# Calculating accuracy for male and female separately
correct_male = result[(result['Sex'] == 1) & (result['Predicted'] == result['Actual'])].shape[0]
total_male = result[result['Sex'] == 1].shape[0]

correct_female = result[(result['Sex'] == 0) & (result['Predicted'] == result['Actual'])].shape[0]
total_female = result[result['Sex'] == 0].shape[0]

accuracy_male = 100 * correct_male / total_male if total_male > 0 else 0
accuracy_female = 100 * correct_female / total_female if total_female > 0 else 0

print(f'Male Accuracy: {accuracy_male:.2f}%')
print(f'Female Accuracy: {accuracy_female:.2f}%')

# Displaying three column table
print(result)


Test Accuracy: 79.98%
Male Accuracy: 80.69%
Female Accuracy: 78.59%
      Sex  Predicted  Actual
0       0          0     0.0
1       1          0     0.0
2       1          0     0.0
3       0          0     0.0
4       0          0     0.0
...   ...        ...     ...
6023    0          0     0.0
6024    0          0     0.0
6025    0          0     0.0
6026    1          0     1.0
6027    0          1     1.0

[6028 rows x 3 columns]
