In [None]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import pickle
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_decision_regions


In [None]:
# Read the raw dataset from a CSV in the working directory and preview its first ten rows.
# NOTE(review): the file name contains a space ("adult 3.csv") — confirm this is the intended file.
df = pd.read_csv("adult 3.csv")
df.head(10)

In [None]:
# (rows, columns) of the frame as loaded, before any of the filtering cells below run.
df.shape

In [None]:
# Box plot of 'age' to eyeball its spread and extreme values before filtering.
fig, ax = plt.subplots()
ax.boxplot(df['age'])
plt.show()

In [None]:
# Outlier removal: keep only the rows whose age lies in the inclusive range 17..65.
df = df[df['age'].between(17, 65)]

In [None]:
# Box plot of 'educational-num' to eyeball its spread and extreme values before filtering.
fig, ax = plt.subplots()
ax.boxplot(df['educational-num'])
plt.show()

In [None]:
# Outlier removal: keep only the rows whose educational-num lies in the inclusive range 5..16.
df = df[df['educational-num'].between(5, 16)]

In [None]:
# Feature engineering: derive an 'experience' column as
#     age - educational-num - 6
# i.e. age minus the education figure minus six childhood years.
# The new column is appended as the last column of the frame.
# NOTE(review): the result can be negative (e.g. 17 - 16 - 6) — confirm that is acceptable.
df = df.assign(experience=df['age'] - df['educational-num'] - 6)

In [None]:
# Inspect the current column order. 'experience' was appended at the end, while the label
# 'income' is expected to be the last column — the next cell restores that ordering.
list(df.columns)

In [None]:
# Rearranging the columns: move 'experience' to position 1 (right after the first column)
# and keep every other column in its existing order, so the label 'income' ends up last again.
#
# 'experience' is removed by name instead of slicing off the last column. The positional
# slice only works the first time the cell runs; on a re-run it would drop 'income' and
# insert a duplicate 'experience' column.
cols = [col for col in df.columns if col != 'experience']
cols.insert(1, 'experience')

# Later cells take the features positionally (every column except the last), so fail loudly
# here if the label is not in the final position.
assert cols[-1] == 'income', "label column 'income' must be the last column"

df = df[cols]
df.columns.tolist()

In [None]:
# Print the frequency table of every non-numeric column, each followed by blank lines,
# to review the categories present.
non_numeric_cols = df.select_dtypes(exclude=['number']).columns
for col in non_numeric_cols:
    print(df[col].value_counts())
    print('\n')

In [None]:
# Replace the '?' placeholder with an explicit 'Others' category in the three listed columns.
#
# The result is assigned back to the column. Calling replace(..., inplace=True) on df[col]
# modifies a temporary Series: pandas 2.x emits a chained-assignment warning for it, and with
# Copy-on-Write (the default from pandas 3.0) df itself is left unchanged.
for col in ['workclass','native-country','occupation']:
    df[col] = df[col].replace({'?':'Others'})

In [None]:
# Drop the rows belonging to the 'workclass' and 'education' categories that were judged
# non-impacting, using a single combined row mask, then report the remaining size.
df = df[
    (df['workclass'] != 'Without-pay')
    & (df['workclass'] != 'Never-worked')
    & (df['education'] != 'Preschool')
    & (df['education'] != '1st-4th')
    & (df['education'] != '5th-6th')
]
df.shape

In [None]:
# Drop two columns in one step:
#   - 'education': redundant, since 'educational-num' is kept.
#   - 'fnlwgt': a sampling weight, typically not very predictive.
df = df.drop(columns=['education', 'fnlwgt'])

In [None]:
# Persist the cleaned frame while its categorical columns still hold their original
# string values (label encoding happens in the next cell).
# NOTE(review): index=False is not passed, so the row index is written as an extra unnamed
# first column — confirm whether consumers of this file expect that column.
df.to_csv("Employee_details.csv")

In [None]:
# Applying Label Encodeing
categorical_cols = ['workclass','marital-status','occupation','relationship','race','gender','native-country','income']

label_encoders = {}

for col in categorical_cols:
    lben = LabelEncoder()
    df[col] = lben.fit_transform(df[col])
    label_encoders[col] = lben

df

In [None]:
# Separating label and features.
# Every column except the last is taken as a feature, so this relies on 'income' being
# the last column of df.
y = df['income']
x = df[df.columns[:-1]]

In [None]:
# Applying min-max scaling: fit the scaler on the feature matrix, then replace x with its
# scaled version (the transformer's output, displayed below).
# NOTE(review): the scaler is fitted on the full dataset before the train/test split, so
# test-set minima/maxima influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler().fit(x)
x = scaler.transform(x)
x


In [None]:
# Save the fitted preprocessing objects (label encoders and scaler), one pickle file each,
# so the same transformations can be re-applied outside this notebook.
for pkl_path, artifact in (
    ("label_encoders.pkl", label_encoders),
    ("scaler.pkl", scaler),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(artifact, f)

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on the label and uses a
# fixed random_state so it is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=23,
    stratify=y,
)
xtrain

In [None]:
# XGBoost was chosen as the best model for employee salary prediction after testing
# various models (that comparison is not shown here).
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'eval_metric': 'logloss',
    'random_state': 42,
}

# Fit on the training split, predict the held-out split and report plain accuracy.
model = XGBClassifier(**xgb_params)
model.fit(xtrain, ytrain)
predictxg = model.predict(xtest)
xgb_accuracy = accuracy_score(ytest, predictxg)
print('Accuracy of XGBoost: ', xgb_accuracy)

In [None]:
# NOTE(review): this import lives in the middle of the notebook rather than in the imports
# cell at the top.
import joblib 

# Persist the fitted classifier with joblib.
# NOTE(review): the feature names are attached to the booster only in the following cell,
# i.e. after this dump — confirm whether the saved model should carry them.
joblib.dump(model,"model.pkl")

In [None]:
# Attach the feature column names (every df column except the last one, the label) to the
# booster so the XGBoost feature-importance graph shows column names.
model.get_booster().feature_names = list(df.columns[:-1])


In [None]:
from xgboost import plot_importance

# Feature-importance chart: shows which features influence the predictions most.
#
# The axes are created explicitly and handed to plot_importance. When no `ax` is given,
# plot_importance opens its own figure, so a preceding plt.figure(figsize=...) would stay
# empty and the requested size would not apply to the chart.
fig, ax = plt.subplots(figsize=(10, 6))
plot_importance(
    model,
    ax=ax,
    max_num_features=13,
    title="XGBoost Feature Importance",
)
plt.show()

In [None]:
# Final (rows, columns) of the processed frame, after the filtering, column changes and encoding above.
df.shape
