In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectFromModel

In [29]:
# Load the dataset
data = pd.read_csv("adult_with_headers.csv")

In [30]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [31]:
data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [33]:
missing_values = data.isnull().sum()
print(missing_values)

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [44]:
# Apply Standard Scaling
scaler = StandardScaler()
data[['age', 'fnlwgt','education_num','capital_gain', 'capital_loss', 'hours_per_week']] = scaler.fit_transform(data[['age', 'fnlwgt','education_num', 'capital_gain', 'capital_loss', 'hours_per_week']])

In [49]:
# Use Label Encoding for all categorical variables
le = LabelEncoder()
categorical_columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex','native_country','income']
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

In [50]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,age_squared,education_number_capital_gain_interaction
0,0.030671,7,-1.063611,9,1.134739,4,1,1,4,1,0.148453,-0.21666,-0.035429,39,0,0.000941,0.168455
1,0.837109,6,-1.008707,9,1.134739,2,4,0,4,1,-0.14592,-0.21666,-2.222153,39,0,0.700751,-0.165582
2,-0.042642,4,0.245079,11,-0.42006,0,6,1,4,1,-0.14592,-0.21666,-0.035429,39,0,0.001818,0.061295
3,1.057047,4,0.425801,1,-1.197459,2,6,0,2,1,-0.14592,-0.21666,-0.035429,39,0,1.117348,0.174734
4,-0.775768,4,1.408176,9,1.134739,2,10,5,2,0,-0.14592,-0.21666,-0.035429,5,0,0.601816,-0.165582


In [51]:
# Create new features
data['age_squared'] = data['age'] ** 2
data['education_number_capital_gain_interaction'] = data['education_num'] * data['capital_gain']

In [67]:
# Create a copy of the dataset to avoid modifying the original
data_copy = data.copy()

# Apply log transformation to 'capital-gain' on the copy
data_copy['capital_gain'] = data_copy['capital_gain'].apply(lambda x: np.log(x + 1))
print(data_copy['capital_gain'])


0        0.138416
1       -0.157731
2       -0.157731
3       -0.157731
4       -0.157731
           ...   
32556   -0.157731
32557   -0.157731
32558   -0.157731
32559   -0.157731
32560    1.060711
Name: capital_gain, Length: 31912, dtype: float64


In [53]:
# Identify outliers using Isolation Forest
clf = IsolationForest(contamination=0.01)  # Adjust contamination parameter as needed
y_pred = clf.fit_predict(data)
outliers = data[y_pred == -1]
data = data[y_pred == 1]



In [54]:
# Split data into features and target variable
X = data.drop('income', axis=1)
y = data['income']

In [55]:
# Create and train a model (e.g., Random Forest)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X, y)


In [57]:
# Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model.predict(X)

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

In [58]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:",f1)

Accuracy: 0.9999059914765606
Precision: 1.0
Recall: 0.9995986622073578
F1-score: 0.9997992908275908


In [60]:
pip install ppscore

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [64]:
print(dir(ppscore))

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'calculation', 'dist_name', 'matrix', 'predictors', 'score']


In [69]:
import pandas as pd
import ppscore as pps

# Calculate PPS matrix
pps_matrix = ppscore.calculate(X, y)

# Print the matrix
print(pps_matrix)

AttributeError: module 'ppscore' has no attribute 'calculate'