In [2]:
# -----------------------------------------------------
# Salary Prediction Using Machine Learning
# -----------------------------------------------------

# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# 2. Load Dataset
# Read the CSV, turn the literal "?" placeholder into NaN, and keep only the
# rows that have no missing value in any column. Written as a single chain so
# the cell yields the same `df` every time it is re-run.
df = (
    pd.read_csv("/content/adult 3.csv")
    .replace("?", np.nan)
    .dropna()
)

# 3. Encode Categorical Columns
# Every object-dtype column is replaced by integer codes. One LabelEncoder is
# kept per column (keyed by column name) so the mapping between original
# values and codes stays available after this step.
categorical_cols = df.select_dtypes(include='object').columns
label_encoders = {column: LabelEncoder() for column in categorical_cols}

for column, encoder in label_encoders.items():
    # fit_transform learns the column's distinct values and returns their codes.
    df[column] = encoder.fit_transform(df[column])

# 4. Define Features and Target
# "income" is the label; every other column is a model input.
y = df["income"]
X = df.drop(columns="income")

# 5. Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Feature Scaling
# The scaler's statistics are learned from the training rows only and are then
# applied unchanged to both partitions, so nothing from the test rows
# influences the fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7. Train Models
# Candidate classifiers, keyed by the display name used in the report below.
# Random Forest and Gradient Boosting are stochastic (bootstrap / feature and
# sample sub-selection), so they are seeded with the same value as the
# train/test split; without a seed the printed metrics change on every run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Fit each candidate on the scaled training rows and report its performance
# on the held-out test rows.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print(f"\n🔍 Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

# 8. Save the Best Model and Scaler
# Pick the model to persist from its measured test accuracy instead of a
# hard-coded name, so the saved artifact stays correct if the data or the
# candidate list changes. On a tie, the candidate listed first in `models` wins.
# NOTE(review): selection uses the same test split that is reported above, so
# the reported score of the winner is slightly optimistic.
test_accuracy = {
    candidate_name: accuracy_score(y_test, candidate.predict(X_test_scaled))
    for candidate_name, candidate in models.items()
}
best_name = max(test_accuracy, key=test_accuracy.get)
best_model = models[best_name]

joblib.dump(best_model, "model.pkl")
joblib.dump(scaler, "scaler.pkl")
# The model was trained on LabelEncoder codes, so whoever loads model.pkl
# needs the same fitted encoders to convert raw categorical values (and to
# map predicted codes back to the original labels).
joblib.dump(label_encoders, "label_encoders.pkl")

print("\n✅ Model and scaler saved as 'model.pkl' and 'scaler.pkl'")
print(f"Best model by test accuracy: {best_name} ({test_accuracy[best_name]:.4f})")
print("Label encoders saved as 'label_encoders.pkl'")



🔍 Model: Logistic Regression
Accuracy: 0.8211166390270868
Confusion Matrix:
 [[6443  399]
 [1219  984]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.94      0.89      6842
           1       0.71      0.45      0.55      2203

    accuracy                           0.82      9045
   macro avg       0.78      0.69      0.72      9045
weighted avg       0.81      0.82      0.81      9045


🔍 Model: Random Forest
Accuracy: 0.8569375345494749
Confusion Matrix:
 [[6367  475]
 [ 819 1384]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      6842
           1       0.74      0.63      0.68      2203

    accuracy                           0.86      9045
   macro avg       0.82      0.78      0.79      9045
weighted avg       0.85      0.86      0.85      9045


🔍 Model: Gradient Boosting
Accuracy: 0.8671088999447208
Confusion Matrix:
 [[6514  328]
 [ 87