In [26]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

In [27]:
df=pd.read_csv('income_evaluation.csv')
display(df.shape)
# Droping duplicates
df=df.drop_duplicates()
df[df.duplicated()]
df.head(3)
#Removing leading/trailing space and replacing in between space with under score.
df.columns = (df.columns.str.strip().str.lower())

# Replace '?' with mode in workclass and native-country
col='workclass'
mode_value = df[col].mode()[0]
df[col] = df[col].replace(' ?', mode_value)

col='native-country'
mode_value = df[col].mode()[0]
df[col] = df[col].replace(' ?', mode_value)

col='income'
#mode_value = df[col].mode()[0]
df[col] = df[col].replace(' >50K', 1)
df[col] = df[col].replace(' <=50K', 0)

(32561, 15)

In [28]:
# Categorical to categorical variables relationship.
from scipy.stats import chi2_contingency
# Chi-square for cat-to-cat column relationship
def find_categorical_columns_relation(df, col1, col2):
    confusion_matrix=pd.crosstab(df[col1], df[col2])
    chi2, p, dof, expected = chi2_contingency(confusion_matrix)
    n = confusion_matrix.sum().sum()
    min_dim = min(confusion_matrix.shape) - 1
    cramers_v = np.sqrt(chi2 / (n * min_dim))
    #print('col1:', col1, ',col2:', col2, ',Cramers V:', cramers_v)
    return cramers_v # 0 means no relation and 1 means strong relation. It does not show direction of relation (positive or negative)
    
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country','income']
#find_categorical_columns_relation(df,'workclass','income')
def already_computed(col1, col2):
    if (col1, col2) in computed or (col2, col1) in computed:
        return False
    else:
        computed.add((col1, col2))
        return True
lst=[]
computed=set()
for cat_col1 in categorical_features:
    for cat_col2 in categorical_features:
        if cat_col1 != cat_col2 and already_computed(cat_col1, cat_col2):
            cramers_v=find_categorical_columns_relation(df, cat_col1, cat_col2)
            lst.append((cat_col1, cat_col2, cramers_v))
sorted_lst=sorted(lst, key=lambda x: x[2], reverse=True)
for item in sorted_lst:
    print(item)

('relationship', 'sex', 0.6491087979307012)
('marital-status', 'relationship', 0.4879663976816214)
('marital-status', 'sex', 0.461807182455745)
('relationship', 'income', 0.45351579634155065)
('marital-status', 'income', 0.447314337676062)
('occupation', 'sex', 0.4241770389935278)
('race', 'native-country', 0.40543522435177093)
('education', 'income', 0.3689222912649586)
('occupation', 'income', 0.3519451962095364)
('sex', 'income', 0.2158927904925206)
('workclass', 'occupation', 0.20747803400515535)
('education', 'occupation', 0.1870702466250539)
('occupation', 'relationship', 0.1786183604154298)
('workclass', 'income', 0.16836781914218513)
('workclass', 'sex', 0.14308616538598945)
('marital-status', 'occupation', 0.1331805085605658)
('education', 'native-country', 0.13071014925696528)
('education', 'relationship', 0.12258989249983518)
('race', 'sex', 0.11837905460780355)
('workclass', 'education', 0.10179590268913362)
('race', 'income', 0.1008528343491578)
('native-country', 'income'

In [29]:
# Numerical columns correlation with categorical columns
def correlation_ratio(df, cat_col, num_col):
    categories = np.array(df[cat_col])
    values = np.array(df[num_col])

    cat_means = [
        values[categories == cat].mean()
        for cat in np.unique(categories)
    ]

    grand_mean = values.mean()
    numerator = sum(
        len(values[categories == cat]) * (mean - grand_mean) ** 2
        for cat, mean in zip(np.unique(categories), cat_means)
    )
    denominator = sum((values - grand_mean) ** 2)

    return numerator / denominator # 0 means no relation and 1 means strong relation. It does not show direction of relation (positive or negative

# eta_sq = correlation_ratio(df,'education', 'education-num')
# print(eta_sq)
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country','income']
numerical_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
lst=[]
for cat_col in categorical_features:
    for num_col in numerical_features:
        eta_sq=correlation_ratio(df, cat_col, num_col)
        lst.append((cat_col, num_col, eta_sq))

sorted_lst=sorted(lst, key=lambda x: x[2], reverse=True)
for item in sorted_lst:
    print(item)

('education', 'education-num', 1.000000000000129)
('marital-status', 'age', 0.3290184100935994)
('occupation', 'education-num', 0.30968754778974134)
('relationship', 'age', 0.22487032262909384)
('income', 'education-num', 0.11240698692569837)
('occupation', 'hours-per-week', 0.09705707162893941)
('relationship', 'hours-per-week', 0.09599269895157475)
('native-country', 'education-num', 0.0757959202457202)
('marital-status', 'hours-per-week', 0.06277137535617197)
('income', 'age', 0.054773361669666995)
('income', 'hours-per-week', 0.052742843648214396)
('education', 'age', 0.05267952052142049)
('sex', 'hours-per-week', 0.05252673379035869)
('income', 'capital-gain', 0.04987919175002811)
('workclass', 'age', 0.04279804280575228)
('education', 'capital-gain', 0.03914931537864205)
('education', 'hours-per-week', 0.03771232059562653)
('workclass', 'education-num', 0.03590072571671229)
('occupation', 'age', 0.034904011458088066)
('workclass', 'hours-per-week', 0.02822623957581907)
('relation

In [30]:
cols=['age','fnlwgt','capital-gain','capital-loss','hours-per-week']


def list_outliers_quantile(df, col, lower_quantile, upper_quantile):
    #display('col=',col)
    lower = df[col].quantile(lower_quantile)
    upper = df[col].quantile(upper_quantile)
    df_outliers=df[(df[col]<lower) | (df[col]>upper)]
    outlier_count=df_outliers.shape[0]
    total_count=df.shape[0]
    ratio=outlier_count/total_count
    #print('lower_quantile=',lower,' upper_quantile=',upper,'df_outliers.count_quantile=',outlier_count,'total records_quantile=',total_count,' ratio_quantile=',ratio)
    #display(df_outliers)
    df[col] = df[col].clip(lower, upper)

for col in ['age','fnlwgt','capital-gain','capital-loss','hours-per-week']:
    list_outliers_quantile(df,col,.01,.99)

In [31]:
# SPlitting into X and Y
# education is highly correlated with education-num
X=df.drop(columns=['income','education'], axis=1)

Y=df['income']
str_cat_cols=['workclass','marital-status','occupation','relationship', 'race', 'sex', 'native-country']
num_cat_cols=['education-num']
pure_num_cols=['fnlwgt','capital-gain', 'capital-loss','age','hours-per-week']


In [33]:
# Calculate vif. Features are not highly correlated.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
X_numeric=X[pure_num_cols]
X_intercept = add_constant(X_numeric)

vif_df=pd.DataFrame()
vif_df['Features']=X_intercept.columns

vif_df['VIF'] = [
    variance_inflation_factor(X_intercept.values, i)
    for i in range(X_intercept.shape[1])
]
display(vif_df.sort_values(by='VIF', ascending=False))

Unnamed: 0,Features,VIF
0,const,24.122326
2,capital-gain,1.030405
4,age,1.029572
5,hours-per-week,1.018965
3,capital-loss,1.010252
1,fnlwgt,1.006324


In [34]:
# Train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=7, stratify=Y)

In [35]:
# Feature encoding categorical columns
from category_encoders import TargetEncoder
te = TargetEncoder(cols=str_cat_cols,smoothing=10)

te.fit(X_train, pd.Series(Y_train))
X_train=te.transform(X_train)
X_test=te.transform(X_test)

In [36]:
# Find skewness:
skewness = df[pure_num_cols].skew()
skewness

fnlwgt            0.799468
capital-gain      4.569087
capital-loss      4.406497
age               0.481958
hours-per-week    0.011491
dtype: float64

In [37]:
# Treating skewness
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson', standardize=False)
cols=["capital-gain", "capital-loss"]

pt.fit(X_train[cols])
X_train[cols]=pt.transform(X_train[cols])
X_test[cols]=pt.transform(X_test[cols])
X_test.head(3)

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
8860,56,0.209911,95763.0,6,0.104326,0.227617,0.1052,0.255895,0.305445,-0.0,-0.0,45,0.246697
15982,44,0.209911,205706.0,9,0.047426,0.484518,0.1052,0.255895,0.109824,-0.0,-0.0,40,0.246697
15753,38,0.209911,103323.0,9,0.444772,0.484518,0.447065,0.255895,0.305445,-0.0,-0.0,40,0.246697


In [38]:
# Scale features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(X_train[pure_num_cols])

def scale(input_df,cols):
    tmp_df=pd.DataFrame(scaler.transform(input_df[cols]), columns=cols, index=input_df.index)
    input_df[cols]=tmp_df[cols]
    return tmp_df

X_train[pure_num_cols]=scale(X_train, pure_num_cols)
X_test[pure_num_cols]=scale(X_test, pure_num_cols)

In [39]:
# Treating class impbalance by increasing minority class number using smotetomek method
from imblearn.combine import SMOTETomek
smt = SMOTETomek(random_state=7)
X_train, Y_train = smt.fit_resample(X_train, Y_train)

In [40]:
Y_train.value_counts()

income
0    16851
1    16851
Name: count, dtype: int64

In [41]:
# Decision Tree
#Decision Tree
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
dt = DecisionTreeClassifier(random_state=42)
param_grid = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'ccp_alpha': [0.0, 0.05, 0.1]
}

grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,                        
    scoring='neg_mean_squared_error',
    n_jobs=3)

# X_train=X_train[['workclass','marital-status','occupation','relationship', 'race', 'sex', 'native-country',
#      'fnlwgt','capital-gain', 'capital-loss','age','education-num']] #Reorder columns so that it is easy to pass columns in correct order in streamlit.
# X_test=X_test[['workclass','marital-status','occupation','relationship', 'race', 'sex', 'native-country',
#      'fnlwgt','capital-gain', 'capital-loss','age','education-num']] #Reorder columns so that it is easy to pass columns in correct order in streamlit.

grid_search.fit(X_train, Y_train)

display("Best Parameters:", grid_search.best_params_)
display("Best CV Score:", grid_search.best_score_)

best_dt = grid_search.best_estimator_
Y_pred = best_dt.predict(X_test)
print("Accuracy: test", accuracy_score(Y_test, Y_pred))
print("\nConfusion Matrix:test\n", confusion_matrix(Y_test, Y_pred))
print("\nClassification Report:test\n", classification_report(Y_test, Y_pred))

Y_pred = best_dt.predict(X_train)
print("Accuracy:train", accuracy_score(Y_train, Y_pred))
print("\nConfusion Matrix:train\n", confusion_matrix(Y_train, Y_pred))
print("\nClassification Report:train\n", classification_report(Y_train, Y_pred))

'Best Parameters:'

{'ccp_alpha': 0.0,
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

'Best CV Score:'

-0.15171016900432582

Accuracy: test 0.8137676705593117

Confusion Matrix:test
 [[6305 1105]
 [ 713 1639]]

Classification Report:test
               precision    recall  f1-score   support

           0       0.90      0.85      0.87      7410
           1       0.60      0.70      0.64      2352

    accuracy                           0.81      9762
   macro avg       0.75      0.77      0.76      9762
weighted avg       0.83      0.81      0.82      9762

Accuracy:train 0.85137380570886

Confusion Matrix:train
 [[14391  2460]
 [ 2549 14302]]

Classification Report:train
               precision    recall  f1-score   support

           0       0.85      0.85      0.85     16851
           1       0.85      0.85      0.85     16851

    accuracy                           0.85     33702
   macro avg       0.85      0.85      0.85     33702
weighted avg       0.85      0.85      0.85     33702



In [None]:
import joblib
joblib.dump(dt, 'dt_classifier_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(te, 'target_encoder.pkl')
joblib.dump(pt, 'power_transformer_skewness_handler.pkl')


['power_transformer_skewness_handler.pkl']

In [155]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=2) 

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}  

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=3
) 
grid_search.fit(X_train, Y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)
best_rf = grid_search.best_estimator_

Y_pred=best_rf.predict(X_test)
print("Accuracy: test", accuracy_score(Y_test, Y_pred))
print("\nConfusion Matrix:test\n", confusion_matrix(Y_test, Y_pred))
print("\nClassification Report:test\n", classification_report(Y_test, Y_pred))

Y_pred=best_rf.predict(X_train)
print("Accuracy:train", accuracy_score(Y_train, Y_pred))
print("\nConfusion Matrix:train\n", confusion_matrix(Y_train, Y_pred))
print("\nClassification Report:train\n", classification_report(Y_train, Y_pred))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV Score: 0.6160608682716167
Accuracy: test 0.847060028682647

Confusion Matrix:test
 [[6720  690]
 [ 803 1549]]

Classification Report:test
               precision    recall  f1-score   support

           0       0.89      0.91      0.90      7410
           1       0.69      0.66      0.67      2352

    accuracy                           0.85      9762
   macro avg       0.79      0.78      0.79      9762
weighted avg       0.84      0.85      0.85      9762

Accuracy:train 1.0

Confusion Matrix:train
 [[16851     0]
 [    0 16851]]

Classification Report:train
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     16851
           1       1.00      1.00      1.00     16851

    accuracy                           1.00     33702
   macro avg       1.00      1.00      1.00     33702
weighted avg       1.00      1.00      1.00 

In [156]:
#XGBoost
from xgboost import XGBClassifier
xgbclassifier=XGBClassifier(n_estimators=100, max_depth=3,learning_rate=0.1)
xgbclassifier.fit(X_train, Y_train)

Y_pred=xgbclassifier.predict(X_test)
print("Accuracy: test", accuracy_score(Y_test, Y_pred))
print("\nConfusion Matrix:test\n", confusion_matrix(Y_test, Y_pred))
print("\nClassification Report:test\n", classification_report(Y_test, Y_pred))

Y_pred=xgbclassifier.predict(X_train)
print("Accuracy:train", accuracy_score(Y_train, Y_pred))
print("\nConfusion Matrix:train\n", confusion_matrix(Y_train, Y_pred))
print("\nClassification Report:train\n", classification_report(Y_train, Y_pred))

Accuracy: test 0.8329235812333539

Confusion Matrix:test
 [[6319 1091]
 [ 540 1812]]

Classification Report:test
               precision    recall  f1-score   support

           0       0.92      0.85      0.89      7410
           1       0.62      0.77      0.69      2352

    accuracy                           0.83      9762
   macro avg       0.77      0.81      0.79      9762
weighted avg       0.85      0.83      0.84      9762

Accuracy:train 0.8916681502581449

Confusion Matrix:train
 [[14621  2230]
 [ 1421 15430]]

Classification Report:train
               precision    recall  f1-score   support

           0       0.91      0.87      0.89     16851
           1       0.87      0.92      0.89     16851

    accuracy                           0.89     33702
   macro avg       0.89      0.89      0.89     33702
weighted avg       0.89      0.89      0.89     33702



In [157]:
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(
    iterations=300,
    depth=6,
    learning_rate=0.1,
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=50,
    random_seed=42
)
catboost_model.fit(X_train, Y_train, eval_set=(X_test, Y_test), use_best_model=True)

Y_pred=xgbclassifier.predict(X_test)
print("Accuracy: test", accuracy_score(Y_test, Y_pred))
print("\nConfusion Matrix:test\n", confusion_matrix(Y_test, Y_pred))
print("\nClassification Report:test\n", classification_report(Y_test, Y_pred))

Y_pred=xgbclassifier.predict(X_train)
print("Accuracy:train", accuracy_score(Y_train, Y_pred))
print("\nConfusion Matrix:train\n", confusion_matrix(Y_train, Y_pred))
print("\nClassification Report:train\n", classification_report(Y_train, Y_pred))


0:	test: 0.8849094	best: 0.8849094 (0)	total: 14.4ms	remaining: 4.31s
50:	test: 0.9110222	best: 0.9110222 (50)	total: 497ms	remaining: 2.43s
100:	test: 0.9151514	best: 0.9151514 (100)	total: 1.01s	remaining: 1.99s
150:	test: 0.9160338	best: 0.9160338 (150)	total: 1.49s	remaining: 1.47s
200:	test: 0.9172147	best: 0.9172235 (194)	total: 2.13s	remaining: 1.05s
250:	test: 0.9175506	best: 0.9176583 (235)	total: 2.72s	remaining: 530ms
299:	test: 0.9178206	best: 0.9179634 (288)	total: 3.26s	remaining: 0us

bestTest = 0.9179634354
bestIteration = 288

Shrink model to first 289 iterations.
Accuracy: test 0.8329235812333539

Confusion Matrix:test
 [[6319 1091]
 [ 540 1812]]

Classification Report:test
               precision    recall  f1-score   support

           0       0.92      0.85      0.89      7410
           1       0.62      0.77      0.69      2352

    accuracy                           0.83      9762
   macro avg       0.77      0.81      0.79      9762
weighted avg       0.85   

In [None]:
joblib.dump(xgbclassifier, 'xgbclassifier_classifier_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(te, 'target_encoder.pkl')
joblib.dump(pt, 'power_transformer_skewness_handler.pkl')