In [7]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer

# Detect anomalous payroll records with an Isolation Forest, then summarise how
# the flagged rows differ from the remaining rows, one feature at a time.

# Settings a reader may want to tune, gathered in one place.
PAYROLL_CSV = '../sample_files/payroll.csv'
COUNTRY_CODES = {'USA': 0, 'India': 1}
CONTAMINATION = 0.05  # share of rows the forest is asked to flag as anomalous
RANDOM_STATE = 42     # fixed seed so the same rows are flagged on every run

# Load the payroll data
df = pd.read_csv(PAYROLL_CSV)

# Preprocess the data
# Convert categorical data to numerical data.
# Series.map turns every value that is not a key of COUNTRY_CODES into NaN, so
# report such values here instead of letting them vanish into the imputation
# step below without a trace.
country_raw = df['country_code']
country_mapped = country_raw.map(COUNTRY_CODES)
unmapped = country_raw[country_mapped.isna() & country_raw.notna()].unique()
if len(unmapped) > 0:
    print(f"Warning: unmapped country_code values treated as missing: {sorted(map(str, unmapped))}")
df['country_code'] = country_mapped

# Handle missing values.
# country_code is a 0/1 code after the mapping above, and paygrade looks like an
# integer grade (TODO confirm against the data dictionary). Filling either with
# the column mean would invent a fractional pseudo-category that matches no real
# country or grade, so these two columns are filled with their most frequent
# value instead. The cast keeps them float64, as a mean imputer would leave them.
discrete_features = ['paygrade', 'country_code']
discrete_imputer = SimpleImputer(strategy='most_frequent')
df[discrete_features] = discrete_imputer.fit_transform(df[discrete_features].astype(float))

# payamount is a continuous amount, so it is still filled with the column mean.
imputer = SimpleImputer(strategy='mean')
df[['payamount']] = imputer.fit_transform(df[['payamount']])

# Select features for anomaly detection
features = ['paygrade', 'country_code', 'payamount']

# Train the Isolation Forest model
model = IsolationForest(contamination=CONTAMINATION, random_state=RANDOM_STATE)
df['anomaly'] = model.fit_predict(df[features])

# Anomalies are indicated by -1, normal data by 1
anomalies = df[df['anomaly'] == -1]
normal_data = df[df['anomaly'] == 1]

print("Anomalous data points:")
print(anomalies)

# Analyze why the data points are considered anomalous by comparing the mean and
# spread of each feature between the two groups. The standard deviation is NaN
# for a group that holds fewer than two rows.
for feature in features:
    print(f"\nAnalysis for {feature}:")
    print(f"Mean of normal data: {normal_data[feature].mean()}")
    print(f"Mean of anomalous data: {anomalies[feature].mean()}")
    print(f"Standard deviation of normal data: {normal_data[feature].std()}")
    print(f"Standard deviation of anomalous data: {anomalies[feature].std()}")

Anomalous data points:
    emp_id  paygrade  country_code  payamount     paydate  anomaly
41    E005       5.0           0.0     8800.0  12/09/2024       -1
75    E008       4.0           1.0   150000.0  02/03/2025       -1
92    E010       5.0           1.0   160000.0  12/23/2024       -1
113   E012       3.0           1.0    88000.0  01/06/2025       -1
134   E014       1.0           1.0    44000.0  01/20/2025       -1
135   E014       1.0           1.0    49000.0  02/03/2025       -1
143   E015       5.0           0.0    12000.0  01/06/2025       -1
186   E019       1.0           0.0     1600.0  02/17/2025       -1

Analysis for paygrade:
Mean of normal data: 2.9947916666666665
Mean of anomalous data: 3.125
Standard deviation of normal data: 1.4011864674551373
Standard deviation of anomalous data: 1.8850918886280925

Analysis for country_code:
Mean of normal data: 0.4947916666666667
Mean of anomalous data: 0.625
Standard deviation of normal data: 0.5012799933796008
Standard deviatio