In [3]:
import pandas as pd

# Load the dataset from the UCI ML repository
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
df = pd.read_csv(url, header=None, names=columns, na_values=' ?')

# Display the first few rows
print(df.head())


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [4]:
# Check for missing values
print(df.isnull().sum())

# Display basic statistics
print(df.describe())

# Check the data types of the columns
print(df.dtypes)


age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
ma

In [10]:
# Drop rows with missing values
df.dropna(inplace=True)

In [8]:
# Apply One-Hot Encoding
df = pd.get_dummies(df, columns=['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])

# Display the dataframe after encoding
print(df.head())


   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   income  workclass_ Federal-gov  workclass_ Local-gov  workclass_ Private  \
0   <=50K                   False                 False               False   
1   <=50K                   False                 False               False   
2   <=50K                   False                 False                True   
3   <=50K                   False                 False                True   
4   <=50K                   False                 False                True   

   ...  native_country_ Portugal  native_country_ Puerto-Rico 

In [11]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
le = LabelEncoder()

# Apply Label Encoding to the 'income' column
df['income'] = le.fit_transform(df['income'])

# Display the dataframe after encoding
print(df.head())


   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   income  workclass_ Federal-gov  workclass_ Local-gov  workclass_ Private  \
0       0                   False                 False               False   
1       0                   False                 False               False   
2       0                   False                 False                True   
3       0                   False                 False                True   
4       0                   False                 False                True   

   ...  native_country_ Portugal  native_country_ Puerto-Rico 

In [13]:
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Standardize numerical columns
df[['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']] = scaler.fit_transform(df[['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']])

# Display the dataframe after standardization
print(df.head())


        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.042796 -1.062722       1.128918      0.146092     -0.218586   
1  0.880288 -1.007871       1.128918     -0.147445     -0.218586   
2 -0.033340  0.244693      -0.439738     -0.147445     -0.218586   
3  1.108695  0.425240      -1.224066     -0.147445     -0.218586   
4 -0.794697  1.406658       1.128918     -0.147445     -0.218586   

   hours_per_week  income  workclass_ Federal-gov  workclass_ Local-gov  \
0       -0.077734       0                   False                 False   
1       -2.331531       0                   False                 False   
2       -0.077734       0                   False                 False   
3       -0.077734       0                   False                 False   
4       -0.077734       0                   False                 False   

   workclass_ Private  ...  native_country_ Portugal  \
0               False  ...                     False   
1               False  ...  

In [14]:
from sklearn.model_selection import train_test_split

# Define the feature matrix X and the target variable y
X = df.drop('income', axis=1)
y = df['income']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (24129, 104)
X_test shape: (6033, 104)
y_train shape: (24129,)
y_test shape: (6033,)


In [15]:
# Check the first few rows of the training data
print(X_train.head())
print(y_train.head())

# Check if there are any missing values
print(X_train.isnull().sum())
print(X_test.isnull().sum())


            age    fnlwgt  education_num  capital_gain  capital_loss  \
19863  1.108695 -0.201179      -2.008395     -0.147445     -0.218586   
24342  0.804152 -1.258413       1.128918     -0.147445     -0.218586   
10027 -0.794697 -0.333388      -0.047574     -0.147445     -0.218586   
25710  1.641645 -0.014745      -1.616231     -0.147445     -0.218586   
13824  1.108695 -0.971459      -0.439738     -0.147445     -0.218586   

       hours_per_week  workclass_ Federal-gov  workclass_ Local-gov  \
19863        2.426484                   False                 False   
24342        2.426484                   False                 False   
10027       -0.077734                   False                 False   
25710       -0.077734                   False                 False   
13824       -0.077734                   False                 False   

       workclass_ Private  workclass_ Self-emp-inc  ...  \
19863               False                    False  ...   
24342               Fa

ModuleNotFoundError: No module named 'category_encoders'