In [130]:
import pandas as pd
import numpy as np
import aif360
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Read the raw scores table from a CSV located in the working directory
csv_path = 'compas-scores-two-years.csv'
df = pd.read_csv(csv_path)


In [131]:
# Columns that are not carried forward into the modelling table.
cols_to_drop = [
    'id', 'name', 'first', 'last', 'compas_screening_date', 'dob', 'age_cat',
    'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count.1',
    'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
    'c_offense_date', 'r_case_number', 'r_charge_degree', 'c_arrest_date',
    'c_days_from_compas', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc',
    'r_jail_in', 'r_jail_out', 'violent_recid', 'vr_case_number',
    'vr_charge_degree', 'v_screening_date', 'in_custody', 'out_custody',
    'vr_offense_date', 'vr_charge_desc', 'screening_date', 'start', 'end',
    'c_charge_desc', 'v_type_of_assessment', 'type_of_assessment',
    'is_recid', 'is_violent_recid',
]

# `columns=` is the keyword form of `axis=1`; a missing name still raises KeyError.
df = df.drop(columns=cols_to_drop)





In [132]:
# Integer codes for the binary attributes. The 'race' entry also defines the
# row filter: only the two groups listed here are kept.
binary_codes = {
    'sex': {'Male': 1, 'Female': 0},
    'race': {'Caucasian': 1, 'African-American': 0},
    'c_charge_degree': {'F': 1, 'M': 0},
}

# Filter on race BEFORE recoding so the column never holds a mix of integer
# codes and leftover string labels. The already-coded values are accepted as
# well, which keeps this cell safe to re-run on an already-processed frame.
race_codes = binary_codes['race']
keep_race = df['race'].isin([*race_codes.keys(), *race_codes.values()])
df = df[keep_race].dropna().reset_index(drop=True)

for col, codes in binary_codes.items():
    # Explicit lookup with pass-through for values that are already coded,
    # followed by an explicit cast: this avoids relying on `replace`'s implicit
    # object->int downcasting and raises if an unexpected label is present
    # instead of silently carrying a string into the feature matrix.
    df[col] = df[col].map(lambda v, codes=codes: codes.get(v, v)).astype('int64')

In [133]:
# Display the current contents of `df` (pandas truncates the middle rows).
df

Unnamed: 0,sex,age,race,decile_score,priors_count,c_charge_degree,decile_score.1,score_text,v_decile_score,v_score_text,event,two_year_recid
0,1,34,0,3,0,1,3,Low,1,Low,1,1
1,1,24,0,4,4,1,4,Low,3,Low,0,1
2,1,23,0,8,1,1,8,High,6,Medium,0,0
3,1,41,1,6,14,1,6,Medium,2,Low,1,1
4,0,39,1,1,0,0,1,Low,1,Low,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6145,1,30,0,2,0,0,2,Low,2,Low,1,1
6146,1,20,0,9,0,1,9,High,9,High,0,0
6147,1,23,0,7,0,1,7,Medium,5,Medium,0,0
6148,1,23,0,3,0,1,3,Low,5,Medium,0,0


In [134]:
from sklearn.preprocessing import LabelEncoder

# Text columns whose labels are converted to integer codes
string_cols = ['score_text', 'v_score_text']

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard the mapping of every column but the last, making it impossible to
# decode the integer codes later (e.g. label_encoders[col].inverse_transform).
label_encoders = {}

for col in string_cols:
    # LabelEncoder numbers the classes in sorted order, so the codes do not
    # follow the Low < Medium < High ordering of the labels.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [135]:
# Preview the first five rows to check that the text columns now hold integer codes.
df.head(5)

Unnamed: 0,sex,age,race,decile_score,priors_count,c_charge_degree,decile_score.1,score_text,v_decile_score,v_score_text,event,two_year_recid
0,1,34,0,3,0,1,3,1,1,1,1,1
1,1,24,0,4,4,1,4,1,3,1,0,1
2,1,23,0,8,1,1,8,0,6,2,0,0
3,1,41,1,6,14,1,6,2,2,1,1,1
4,0,39,1,1,0,0,1,1,1,1,0,0


In [136]:
# Largest value present in the `event` column.
df['event'].max()

1

In [138]:
data = df

# Split data into features and target
X = data.drop('two_year_recid', axis=1)
y = data['two_year_recid']

# Columns that are one-hot encoded
cat_cols = ['sex', 'race', 'c_charge_degree', 'score_text',
            'v_score_text', 'event']

# Columns that are standardised.
# NOTE(review): in the rows displayed earlier, 'decile_score.1' carries the same
# values as 'decile_score' -- TODO confirm and consider dropping the duplicate.
num_cols = ['age', 'decile_score', 'priors_count', 'decile_score.1', 'v_decile_score']

# Split BEFORE fitting any transformer. Fitting the scaler/encoder on the full
# dataset lets test-set statistics leak into the training features. The split
# arguments are unchanged, so the row partition is the same as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit the transformers on the training rows only
scaler = StandardScaler()
scaler.fit(X_train_raw[num_cols])
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train_raw[cat_cols])


def encode_features(frame):
    """Return the model matrix for `frame`: scaled numeric columns followed by
    the one-hot encoded categorical columns, using the train-fitted transformers."""
    scaled = scaler.transform(frame[num_cols])
    one_hot = enc.transform(frame[cat_cols]).toarray()
    return np.concatenate((scaled, one_hot), axis=1)


X_train = encode_features(X_train_raw)
X_test = encode_features(X_test_raw)
# `X` stays bound to the fully encoded matrix, as it was before this change,
# but is now produced by the train-fitted transformers.
X = encode_features(X)

# NOTE: if class re-balancing (e.g. RandomOverSampler) is introduced, apply it
# to the training split only -- resampling before the split would duplicate
# rows across train and test.

# Numeric names come first, then the encoder's generated one-hot names,
# matching the column order produced by encode_features.
feature_names = num_cols + enc.get_feature_names_out().tolist()

# Guard against a category missing from the training split changing the width
assert X_train.shape[1] == X_test.shape[1] == len(feature_names)

# Save training and testing data
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train.to_numpy())
np.save('X_test.npy', X_test)
np.save('y_test.npy', y_test.to_numpy())
np.save("feature_names.npy", np.array(feature_names))

In [145]:
# Spot-check one entry: the five numeric names occupy indices 0-4, so index 6
# is the second one-hot column name.
feature_names[6]

'sex_1'

In [141]:
# Shape of a single training row, i.e. the number of model input features.
X_train[0].shape

(19,)