In [None]:
# app.py
import pandas as pd
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): Streamlit re-executes this script from the top on every widget
# interaction, so the CSV is reloaded and the model retrained each time.
# Consider moving this section into a cached function (e.g. st.cache_resource).

# Load dataset
data = pd.read_csv("data/adult.csv")

# Data preprocessing: treat the ' ?' placeholder as missing and drop any row
# that contains a missing value.
data = data.replace(' ?', pd.NA)
data = data.dropna()

# Encode categorical variables.
# Every object-typed column (including the 'income' target) is replaced by
# integer codes; the fitted encoder for each column is kept in le_dict so the
# UI can list the original class names and map a selection back to its code.
categorical_cols = data.select_dtypes(include='object').columns
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    le_dict[col] = le

# Split features and target.
# Train only on the columns that user_input_features() collects, in the same
# order it emits them. Fitting the scaler/model on additional CSV columns
# would make scaler.transform() reject the form row at prediction time,
# because the feature count and names would not match those seen during fit.
FEATURE_COLUMNS = [
    'age',
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'hours-per-week',
]
X = data[FEATURE_COLUMNS]
y = data['income']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train model (fixed random_state keeps the split and the forest reproducible)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Streamlit UI
st.title("Predict Income Level (>50K or <=50K)")

def user_input_features():
    """Render the input form and return the answers as a one-row DataFrame.

    Numeric answers (age, hours per week) are stored as entered. Each
    categorical answer is picked from the classes of the matching fitted
    encoder in ``le_dict`` and stored as that encoder's integer code.
    """
    # (widget label, column name) pairs, in the order they appear on the page.
    categorical_fields = (
        ('Workclass', 'workclass'),
        ('Education', 'education'),
        ('Marital Status', 'marital-status'),
        ('Occupation', 'occupation'),
        ('Relationship', 'relationship'),
        ('Race', 'race'),
        ('Sex', 'sex'),
    )

    # Draw every widget first so the page layout is: age, the select boxes,
    # then hours per week.
    age = st.number_input('Age', 18, 90, 30)
    picks = {
        column: st.selectbox(label, list(le_dict[column].classes_))
        for label, column in categorical_fields
    }
    hours_per_week = st.number_input('Hours per week', 1, 99, 40)

    # Assemble the row in column order, encoding each categorical choice.
    row = {'age': age}
    for column, choice in picks.items():
        row[column] = le_dict[column].transform([choice])[0]
    row['hours-per-week'] = hours_per_week

    return pd.DataFrame([row])

# Collect the current form answers as a one-row frame
input_df = user_input_features()

# Apply the fitted scaler to the form row, then classify it
input_scaled = scaler.transform(input_df)
prediction = model.predict(input_scaled)

# Predicted class 1 is shown as '>50K'; any other value as '<=50K'
if prediction[0] == 1:
    prediction_label = '>50K'
else:
    prediction_label = '<=50K'

# Show the result
st.subheader("Predicted Income Level:")
st.write(prediction_label)
