# loading and preprocessing data

In [4]:
# Option 1: Using OpenML via scikit-learn
from sklearn.datasets import fetch_openml
import pandas as pd

# Load dataset from OpenML
adult = fetch_openml(name="adult", version=2, as_frame=True)
df = adult.frame

print(df.head())
print(df.shape) 


   age  workclass  fnlwgt     education  education-num      marital-status  \
0   25    Private  226802          11th              7       Never-married   
1   38    Private   89814       HS-grad              9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm             12  Married-civ-spouse   
3   44    Private  160323  Some-college             10  Married-civ-spouse   
4   18        NaN  103497  Some-college             10       Never-married   

          occupation relationship   race     sex  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                NaN    Own-child  White  Female             0             0   

   hours-per-week native-country  class  
0       

In [5]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  int64   
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  int64   
 3   education       48842 non-null  category
 4   education-num   48842 non-null  int64   
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capital-gain    48842 non-null  int64   
 11  capital-loss    48842 non-null  int64   
 12  hours-per-week  48842 non-null  int64   
 13  native-country  47985 non-null  category
 14  class           48842 non-null  category
dtypes: category(9), int64(6)
memory usage: 2.7 MB


In [7]:
df = df.dropna()


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             45222 non-null  int64   
 1   workclass       45222 non-null  category
 2   fnlwgt          45222 non-null  int64   
 3   education       45222 non-null  category
 4   education-num   45222 non-null  int64   
 5   marital-status  45222 non-null  category
 6   occupation      45222 non-null  category
 7   relationship    45222 non-null  category
 8   race            45222 non-null  category
 9   sex             45222 non-null  category
 10  capital-gain    45222 non-null  int64   
 11  capital-loss    45222 non-null  int64   
 12  hours-per-week  45222 non-null  int64   
 13  native-country  45222 non-null  category
 14  class           45222 non-null  category
dtypes: category(9), int64(6)
memory usage: 2.8 MB


In [9]:
# Separate features and target
X = df.drop(columns="class")
y = df["class"]

In [10]:
cols = [col for col in X.columns if X[col].dtype == 'category']
cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for col in cols:
    X[col]=label_encoder.fit_transform(X[col])


In [12]:
cols_num=[col for col in X.columns if X[col].dtype == 'int64']

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
for col in cols_num:
    X[col]=scaler.fit_transform(X[[col]])

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

X_val,y_val,X_test,y_test=train_test_split(X_val,y_val,test_size=0.5, random_state=42,stratify=y_val)

In [21]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,-1.024983,-0.213443,0.350889,-2.439977,-1.221559,0.942936,0.007557,0.993798,-2.018744,0.693813,-0.146733,-0.21878,-0.078120,0.262999
1,-0.041455,-0.213443,-0.945878,0.179902,-0.438122,-0.390005,-0.489170,-0.884479,0.384110,0.693813,-0.146733,-0.21878,0.754701,0.262999
2,-0.798015,-1.257163,1.393592,-0.868050,0.737034,-0.390005,1.001011,-0.884479,0.384110,0.693813,-0.146733,-0.21878,-0.078120,0.262999
3,0.412481,-0.213443,-0.278420,1.227853,-0.046403,-0.390005,0.007557,-0.884479,-2.018744,0.693813,0.877467,-0.21878,-0.078120,0.262999
5,-0.344079,-0.213443,0.084802,-2.701964,-1.613277,0.942936,0.255921,-0.258387,0.384110,0.693813,-0.146733,-0.21878,-0.910942,0.262999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.873671,-0.213443,0.639611,-0.868050,0.737034,-0.390005,1.497739,2.245982,0.384110,-1.441310,-0.146733,-0.21878,-0.244684,0.262999
48838,0.109857,-0.213443,-0.334735,0.179902,-0.438122,-0.390005,0.007557,-0.884479,0.384110,0.693813,-0.146733,-0.21878,-0.078120,0.262999
48839,1.471665,-0.213443,-0.358060,0.179902,-0.438122,2.275877,-1.482624,1.619890,0.384110,-1.441310,-0.146733,-0.21878,-0.078120,0.262999
48840,-1.251951,-0.213443,0.111279,0.179902,-0.438122,0.942936,-1.482624,0.993798,0.384110,0.693813,-0.146733,-0.21878,-1.743763,0.262999


# optimizers

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam
import numpy as np

In [27]:
# Build a simple neural network
model = Sequential()

model.add(Dense(64, activation='relu', input_shape=(15,)))  # 20 features in the input

model.add(Dense(10, activation='softmax'))

# Compile the model using SGD 

model.compile(optimizer= SGD(learning_rate=0.01), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with the training data and validate on validation data
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=16)

# Output the result after training
print(f"Training completed after {len(history.epoch)} epochs.")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Invalid dtype: category

In [28]:
model = Sequential()

model.add(Dense(64, activation='relu', input_shape=(15,)))  # 20 features in the input

model.add(Dense(10, activation='softmax'))

# Compile the model using SGD with Momentum
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with the training data and validate on validation data
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=16)

# Output the result after training
print(f"Training completed after {len(history.epoch)} epochs.")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Invalid dtype: category

In [29]:
model = Sequential()

model.add(Dense(64, activation='relu', input_shape=(100,)))  # 100 features in the input

model.add(Dense(10, activation='softmax'))

# Compile the model using Adam optimizer

model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with the training data and validate on validation data
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50)

# Output the result after training
print(f"Training completed after {len(history.epoch)} epochs.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Invalid dtype: category