In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold, chi2, f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [5]:
df = pd.read_csv("employment_income_dataset.csv")
df = df.dropna(how='any',axis=0) 

display(df.describe())
df.info()
df.head()

Unnamed: 0,employee_id,age,years_experience,training_hours,certifications_count,performance_rating,manager_feedback_score,work_life_balance_score,salary_satisfaction,annual_income_usd,bonus_percent,commute_time_minutes,overtime_hours_monthly,job_satisfaction_score
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,42.694,19.5575,20.4175,1.067,3.489,5.3688,5.47055,5.5,70351.5175,9.8547,30.087,4.945,4.4311
std,577.494589,12.614195,11.438799,9.910919,1.055494,0.852592,2.580672,2.570849,2.632458,19690.991216,5.813682,10.105914,2.271245,2.1683
min,1.0,21.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,30000.0,0.0,0.0,0.0,1.0
25%,500.75,32.0,10.0,13.0,0.0,2.8,3.1,3.3,3.2,56213.75,4.7,23.0,3.0,2.8
50%,1000.5,43.0,20.0,21.0,1.0,3.5,5.4,5.5,5.5,70092.5,9.9,30.0,5.0,4.4
75%,1500.25,54.0,29.0,27.0,2.0,4.2,7.5,7.7,7.8,83592.75,14.9,37.0,6.0,5.9
max,2000.0,64.0,39.0,53.0,7.0,5.0,10.0,10.0,10.0,132801.0,20.0,57.0,17.0,10.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   employee_id              2000 non-null   int64  
 1   age                      2000 non-null   int64  
 2   years_experience         2000 non-null   int64  
 3   education_level          2000 non-null   object 
 4   department               2000 non-null   object 
 5   company_size             2000 non-null   object 
 6   region                   2000 non-null   object 
 7   gender                   2000 non-null   object 
 8   ethnicity                2000 non-null   object 
 9   marital_status           2000 non-null   object 
 10  remote_status            2000 non-null   object 
 11  training_hours           2000 non-null   float64
 12  certifications_count     2000 non-null   int64  
 13  job_level                2000 non-null   object 
 14  performance_rating      

Unnamed: 0,employee_id,age,years_experience,education_level,department,company_size,region,gender,ethnicity,marital_status,...,manager_feedback_score,work_life_balance_score,salary_satisfaction,annual_income_usd,bonus_percent,commute_time_minutes,overtime_hours_monthly,job_satisfaction_score,promoted_next_year,left_company
0,1,59,18,bachelor,HR,small,south,female,hispanic,married,...,1.9,8.4,9.7,58066.0,1.7,33.0,9,6.5,no,no
1,2,49,10,bachelor,IT,medium,north,female,other,married,...,8.8,6.2,9.4,91402.0,13.5,30.0,5,6.6,no,no
2,3,35,0,bachelor,Finance,small,north,female,black,married,...,3.7,5.2,5.3,89372.0,11.6,40.0,5,5.4,no,no
3,4,63,23,bachelor,HR,medium,west,male,other,married,...,7.3,6.8,7.8,54380.0,5.8,48.0,2,4.2,no,no
4,5,28,6,bachelor,Finance,small,west,female,white,single,...,8.4,5.2,7.0,70198.0,15.6,37.0,6,2.5,no,yes


In [13]:
# Variance Threshold

# Select only numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# Remove features with variance < 0.01
var_thresh = VarianceThreshold(threshold=0.01)
var_thresh.fit(numeric_df)

low_variance_features = numeric_df.columns[~var_thresh.get_support()]
print("Low variance features:", low_variance_features.tolist())

Low variance features: []


In [15]:
# Chi Squared Test

# Encode categorical variables numerically
cat_df = df.select_dtypes(include='object').apply(LabelEncoder().fit_transform)

# Encode target
target = LabelEncoder().fit_transform(df['left_company'])

# Apply Chi-square test
chi_scores, p_values = chi2(cat_df, target)

chi_results = pd.DataFrame({
    'feature': cat_df.columns,
    'chi2': chi_scores,
    'p_value': p_values
}).sort_values('p_value')

display(chi_results)


Unnamed: 0,feature,chi2,p_value
10,left_company,858.0,1.3254370000000001e-188
9,promoted_next_year,3.563878,0.05904965
5,ethnicity,3.000692,0.08322894
8,job_level,1.387402,0.2388444
1,department,0.831565,0.3618203
4,gender,0.690467,0.4060057
6,marital_status,0.362329,0.5472154
2,company_size,0.281476,0.5957357
7,remote_status,0.190709,0.6623273
0,education_level,0.047526,0.8274259


In [None]:
# One Way ANOVA F-Test

# Scale numeric data
X_num = MinMaxScaler().fit_transform(df.select_dtypes(include=np.number))
y = LabelEncoder().fit_transform(df['left_company'])

f_scores, p_vals = f_classif(X_num, y)

anova_results = pd.DataFrame({
    'feature': df.select_dtypes(include=np.number).columns,
    'F_score': f_scores,
    'p_value': p_vals
}).sort_values('p_value')

# Higher F_score and lower p_value = stronger relationship between numeric feature and target
display(anova_results)

Unnamed: 0,feature,F_score,p_value
13,job_satisfaction_score,1585.145754,1.030601e-255
7,work_life_balance_score,984.610025,4.62196e-176
6,manager_feedback_score,36.935288,1.4586e-09
8,salary_satisfaction,24.807738,6.878657e-07
11,commute_time_minutes,11.524011,0.0007004582
3,training_hours,2.944036,0.08635013
4,certifications_count,1.487078,0.2228144
2,years_experience,1.306123,0.2532347
9,annual_income_usd,0.942828,0.331669
1,age,0.924431,0.3364306


In [18]:
# Correlation and Mutual Information

corr = df.corr(numeric_only=True)['annual_income_usd'].sort_values(ascending=False)
display(corr)

# Encode categorical columns
X = pd.get_dummies(df.drop(columns=['left_company']))
y = LabelEncoder().fit_transform(df['left_company'])

mi = mutual_info_classif(X, y, discrete_features='auto')
mi_results = pd.Series(mi, index=X.columns).sort_values(ascending=False)

print(mi_results.head(10))

annual_income_usd          1.000000
performance_rating         0.024109
work_life_balance_score    0.021386
age                        0.017088
manager_feedback_score    -0.003874
years_experience          -0.004255
employee_id               -0.008201
bonus_percent             -0.011552
certifications_count      -0.014268
commute_time_minutes      -0.020349
training_hours            -0.029361
job_satisfaction_score    -0.036008
overtime_hours_monthly    -0.036659
salary_satisfaction       -0.057906
Name: annual_income_usd, dtype: float64

job_satisfaction_score     0.374155
work_life_balance_score    0.250089
manager_feedback_score     0.020286
department_Sales           0.019117
annual_income_usd          0.017415
certifications_count       0.017232
marital_status_divorced    0.016443
ethnicity_asian            0.016311
education_level_master     0.007860
marital_status_married     0.007035
dtype: float64
