In [1]:
import pandas as pd

In [2]:
# Load the census CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('data/census.csv')

In [3]:
# Trim any surrounding whitespace from the header names so columns can be
# addressed by their plain names.
df.columns = [column_name.strip() for column_name in df.columns]
# Show ten randomly chosen rows as a quick look at the data.
df.sample(10)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
5953,25,Private,152035,HS-grad,9,Never-married,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
30042,67,?,132626,Some-college,10,Married-civ-spouse,?,Husband,White,Male,0,0,6,United-States,<=50K
20095,51,Self-emp-not-inc,168539,Assoc-acdm,12,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K
6013,31,Self-emp-inc,133861,Assoc-voc,11,Divorced,Sales,Not-in-family,White,Male,0,0,40,United-States,<=50K
6954,47,Local-gov,121124,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,35,United-States,>50K
17704,24,State-gov,390867,Masters,14,Never-married,Prof-specialty,Not-in-family,Black,Female,0,0,40,United-States,<=50K
11804,42,Private,270721,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,32,United-States,<=50K
27478,61,Private,180632,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,<=50K
31665,41,Federal-gov,193882,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,40,United-States,<=50K
1543,59,Local-gov,165695,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K


In [4]:
# Convert data types: cast each numeric column to int.
integer_columns = [
    'age',
    'fnlgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
for column_name in integer_columns:
    df[column_name] = df[column_name].astype(int)


In [5]:
# Standardize text data: trim whitespace, lower-case, and replace the '?'
# placeholder value with the explicit category 'undefined'.
string_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country', 'salary',
]
for col in string_columns:
    normalised = df[col].str.strip().str.lower()
    df[col] = normalised.replace('?', 'undefined')

In [24]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv('data/census_clean.csv', index=False)
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder


In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is seeded so that a
# Restart & Run All yields the same train/test rows, which the positional
# lookups further down (train.iloc[...]) depend on.
train, test = train_test_split(df, test_size=0.20, random_state=42)

In [34]:
#df.iloc[0].to_dict()

In [35]:
# Names of the categorical columns; passed to process_data as
# `categorical_features`.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

In [40]:
def _make_dense_one_hot_encoder():
    """Build a OneHotEncoder that returns dense arrays and ignores unseen categories."""
    try:
        # scikit-learn >= 1.2 names the dense/sparse switch `sparse_output`.
        return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    except TypeError:
        # Older scikit-learn releases only accept the original `sparse` keyword.
        return OneHotEncoder(sparse=False, handle_unknown="ignore")


def process_data(
    X, categorical_features=[], label=None, training=True, encoder=None, lb=None
):
    """ Process the data used in the machine learning pipeline.

    Processes the data using one hot encoding for the categorical features and a
    label binarizer for the labels. This can be used in either training or
    inference/validation.

    Note: depending on the type of model used, you may want to add in functionality that
    scales the continuous data.

    Inputs
    ------
    X : pd.DataFrame
        Dataframe containing the features and label. Columns listed in
        `categorical_features` are one hot encoded; all remaining columns are
        passed through unchanged.
    categorical_features: list[str]
        List containing the names of the categorical features (default=[]).
        Any iterable of names is accepted; it is copied and never mutated.
    label : str
        Name of the label column in `X`. If None, then an empty array will be returned
        for y (default=None). A label is required when training=True.
    training : bool
        Indicator if training mode or inference/validation mode.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained sklearn OneHotEncoder, only used if training=False.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained sklearn LabelBinarizer, only used if training=False.

    Returns
    -------
    X : np.array
        Processed data: the continuous columns followed by the encoded
        categorical columns.
    y : np.array
        Processed labels if `label` is given, otherwise empty np.array. In
        inference mode with `lb=None` the label column is returned unprocessed.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained OneHotEncoder if training is True, otherwise returns the encoder passed
        in.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained LabelBinarizer if training is True, otherwise returns the binarizer
        passed in.
    """
    # Copy into a list: the shared default is never mutated, and a tuple of names
    # would otherwise be read by pandas as a single column key.
    categorical_features = list(categorical_features)

    if label is not None:
        y = X[label]
        X = X.drop(columns=[label])
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(columns=categorical_features)

    if training is True:
        encoder = _make_dense_one_hot_encoder()
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
        X_categorical = encoder.transform(X_categorical)
        # Binarize only when there is both a label column and a fitted binarizer.
        # An explicit check (rather than catching AttributeError) lets genuine
        # failures inside lb.transform surface instead of being swallowed.
        if label is not None and lb is not None:
            y = lb.transform(y.values).ravel()

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb

In [70]:
# Display the training split with its row labels moved into an `index` column.
# The result is not assigned, so `train` itself keeps its original index.
train.reset_index()

Unnamed: 0,index,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,17078,24,private,172496,bachelors,13,never-married,sales,not-in-family,white,male,0,0,30,united-states,<=50k
1,26630,26,private,137658,bachelors,13,never-married,exec-managerial,not-in-family,white,female,0,0,45,united-states,<=50k
2,6728,32,private,127384,assoc-voc,11,married-civ-spouse,craft-repair,husband,white,male,0,0,55,united-states,>50k
3,15962,21,private,409230,hs-grad,9,never-married,craft-repair,not-in-family,white,male,0,0,40,guatemala,<=50k
4,8016,20,private,154779,some-college,10,never-married,sales,other-relative,other,female,0,0,40,united-states,<=50k
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,6036,40,private,170214,hs-grad,9,married-civ-spouse,transport-moving,husband,white,male,0,0,40,united-states,>50k
26044,26598,23,undefined,87569,some-college,10,separated,undefined,not-in-family,white,female,0,0,40,united-states,<=50k
26045,15807,24,private,224716,hs-grad,9,never-married,craft-repair,own-child,white,male,0,0,40,united-states,<=50k
26046,10341,23,private,154210,some-college,10,never-married,adm-clerical,other-relative,asian-pac-islander,male,0,0,14,puerto-rico,<=50k


In [71]:
# Inspect the training row at position 26043 as a plain dict.
train.iloc[26043].to_dict()

{'age': 40,
 'workclass': 'private',
 'fnlgt': 170214,
 'education': 'hs-grad',
 'education-num': 9,
 'marital-status': 'married-civ-spouse',
 'occupation': 'transport-moving',
 'relationship': 'husband',
 'race': 'white',
 'sex': 'male',
 'capital-gain': 0,
 'capital-loss': 0,
 'hours-per-week': 40,
 'native-country': 'united-states',
 'salary': '>50k'}

In [61]:
# Inspect the training row at position 100 as a plain dict.
train.iloc[100].to_dict()

{'age': 43,
 'workclass': 'private',
 'fnlgt': 70055,
 'education': 'some-college',
 'education-num': 10,
 'marital-status': 'married-civ-spouse',
 'occupation': 'adm-clerical',
 'relationship': 'husband',
 'race': 'white',
 'sex': 'male',
 'capital-gain': 0,
 'capital-loss': 0,
 'hours-per-week': 40,
 'native-country': 'united-states',
 'salary': '<=50k'}

In [56]:
# Wrap the first training row in a one-row DataFrame: each value becomes a
# single-element column.
first_row = train.iloc[0].to_dict()
d = pd.DataFrame({column: [value] for column, value in first_row.items()})

In [None]:
# Build a one-row DataFrame from a single record dict.
# NOTE(review): `data_dict` is not defined in any cell shown here and this cell
# has no execution count — confirm where it is meant to come from.
data_df = pd.DataFrame([data_dict])

In [41]:
import numpy as np

In [None]:
import joblib
# Load a previously saved encoder from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
encoder= joblib.load('model/encoder.pkl')


In [57]:
# Run the one-row frame `d` through the inference path (training=False) using
# the loaded encoder.
# NOTE(review): no `lb` is passed, so process_data cannot binarize the label and
# y_test comes back as the unprocessed 'salary' column — confirm this is intended.
X_test, y_test, _, _ = process_data(
    d, categorical_features=cat_features,encoder=encoder, label="salary", training=False)


In [59]:
# Check the dimensions of the processed feature matrix.
X_test.shape

(1, 14)

In [None]:
from fastapi.testclient import TestClient
from main import app

# Test client bound to the FastAPI app; the test functions below send their
# requests through it.
client = TestClient(app)

def test_read_main():
    """The root endpoint should answer a GET request with HTTP 200."""
    assert client.get("/").status_code == 200

def test_predict_less_than_50k():
    """POST one complete census record to /predict/ and expect HTTP 200.

    Only the status code is checked; the predicted class is not asserted yet.
    """
    payload = {
        "age": 30,
        "workclass": "private",
        "fnlgt": 141297,
        "education": "bachelors",
        "education_num": 13,
        "marital_status": "married-civ-spouse",
        "occupation": "prof-specialty",
        "relationship": "husband",
        "race": "asian-pac-islander",
        "sex": "male",
        "capital_gain": 0,
        "capital_loss": 0,
        "hours_per_week": 40,
        "native_country": "india",
    }
    prediction_response = client.post("/predict/", json=payload)
    assert prediction_response.status_code == 200
    # TODO: once the expected class is confirmed, also assert the body,
    # e.g. prediction_response.json() == {"prediction": 0}.


In [62]:
# Column whose distinct values define the data slices.
slice_feature = 'education'
categories = df[slice_feature].unique()

# NOTE(review): the loop body only rebinds `mask`, so after the loop it holds the
# mask for the last category alone — any per-slice work presumably belongs
# inside the loop; confirm.
for category in categories:
    # Boolean mask selecting the rows of `df` whose slice_feature equals this category
    mask = df[slice_feature] == category

In [64]:
# NOTE(review): `X` and `y` are not defined in any cell shown in this notebook
# (the recorded output is a NameError), and `mask` is aligned to the rows of
# `df` — confirm which arrays are meant to be sliced here.
X_slice = X[mask]
y_slice = y[mask]

NameError: name 'X' is not defined