In [82]:
import pandas as pd
import numpy as np
import aif360
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Load data
df = pd.read_csv('law_data_clean.csv')


In [83]:
df

Unnamed: 0,decile1b,decile3,lsat,ugpa,zfygpa,zgpa,fulltime,fam_inc,male,pass_bar,tier,racetxt
0,10.0,10.0,44.0,3.5,1.33,1.88,1.0,5.0,0.0,1.0,4.0,White
1,5.0,4.0,29.0,3.5,-0.11,-0.57,1.0,4.0,0.0,1.0,2.0,White
2,8.0,7.0,37.0,3.4,0.63,0.37,1.0,3.0,1.0,1.0,4.0,White
3,8.0,7.0,43.0,3.3,0.67,0.34,1.0,4.0,0.0,1.0,4.0,White
4,3.0,2.0,41.0,3.3,-0.67,-1.30,1.0,4.0,0.0,1.0,5.0,White
...,...,...,...,...,...,...,...,...,...,...,...,...
20793,9.0,8.0,42.0,3.0,1.19,0.60,1.0,4.0,1.0,1.0,6.0,White
20794,3.0,9.0,29.5,3.5,-0.45,1.18,1.0,4.0,1.0,1.0,3.0,White
20795,1.0,1.0,33.0,3.1,-1.92,-1.50,1.0,3.0,1.0,0.0,3.0,Black
20796,4.0,5.0,32.0,3.0,-0.37,-0.16,2.0,3.0,1.0,1.0,3.0,White


In [84]:
df['racetxt'] = df['racetxt'].replace({'White': 1, 'Black': 0})
df = df[(df['racetxt'] == 0) | (df['racetxt'] == 1)]
df = df.dropna()
df = df.reset_index(drop=True)

In [85]:
df

Unnamed: 0,decile1b,decile3,lsat,ugpa,zfygpa,zgpa,fulltime,fam_inc,male,pass_bar,tier,racetxt
0,10.0,10.0,44.0,3.5,1.33,1.88,1.0,5.0,0.0,1.0,4.0,1
1,5.0,4.0,29.0,3.5,-0.11,-0.57,1.0,4.0,0.0,1.0,2.0,1
2,8.0,7.0,37.0,3.4,0.63,0.37,1.0,3.0,1.0,1.0,4.0,1
3,8.0,7.0,43.0,3.3,0.67,0.34,1.0,4.0,0.0,1.0,4.0,1
4,3.0,2.0,41.0,3.3,-0.67,-1.30,1.0,4.0,0.0,1.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
18687,9.0,9.0,35.0,3.2,1.21,1.29,1.0,4.0,1.0,1.0,2.0,1
18688,9.0,8.0,42.0,3.0,1.19,0.60,1.0,4.0,1.0,1.0,6.0,1
18689,3.0,9.0,29.5,3.5,-0.45,1.18,1.0,4.0,1.0,1.0,3.0,1
18690,1.0,1.0,33.0,3.1,-1.92,-1.50,1.0,3.0,1.0,0.0,3.0,0


In [86]:
# Get the number of samples with male value 1
num_male_1 = len(df[df['male'] == 1])

# Get the indices of samples with male value 0
female_indices = df[df['male'] == 0].index

# Downsample the female samples
female_downsampled = df.loc[female_indices].sample(n=int(len(female_indices) * 0.5), replace=False, random_state=42)

# Concatenate the downsampled female samples with the male samples
df_downsampled = pd.concat([df[df['male'] == 1], female_downsampled])

# Shuffle the data
df = df_downsampled.sample(frac=1, random_state=42)

In [87]:
df

Unnamed: 0,decile1b,decile3,lsat,ugpa,zfygpa,zgpa,fulltime,fam_inc,male,pass_bar,tier,racetxt
6778,10.0,10.0,41.0,3.1,1.65,1.80,1.0,4.0,1.0,1.0,3.0,1
7244,9.0,9.0,40.0,3.3,1.05,1.13,1.0,3.0,1.0,1.0,4.0,1
669,4.0,4.0,48.0,2.4,-0.45,-0.44,1.0,3.0,1.0,1.0,5.0,1
13343,2.0,1.0,39.0,2.9,-1.03,-1.73,1.0,3.0,0.0,0.0,3.0,1
2804,1.0,1.0,23.3,2.7,-1.56,-1.95,1.0,3.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9247,2.0,2.0,36.0,3.2,-1.05,-1.06,1.0,4.0,1.0,1.0,4.0,1
1046,4.0,2.0,28.0,2.7,-0.43,-1.06,2.0,2.0,0.0,0.0,5.0,0
9563,2.0,3.0,38.0,2.5,-0.91,-0.81,1.0,4.0,1.0,1.0,3.0,1
1549,5.0,5.0,34.0,2.3,-0.05,-0.18,1.0,5.0,1.0,1.0,3.0,1


In [88]:
data = df
# Split data into features and target
X = data.drop('pass_bar', axis=1)
y = data['pass_bar']

# One-hot encode categorical variables
cat_cols = ['fulltime', 'male', 'racetxt']
enc = OneHotEncoder(handle_unknown='ignore')
X_cat = enc.fit_transform(X[cat_cols]).toarray()

# Normalize numerical variables
num_cols = ['decile1b', 'decile3', 'lsat', 'ugpa', 'zfygpa', 'zgpa', 'fam_inc', 'tier']
scaler = StandardScaler()
X_num = scaler.fit_transform(X[num_cols])

# Combine categorical and numerical features
X = np.concatenate((X_num, X_cat), axis=1)


# ros = RandomOverSampler(sampling_strategy='minority')
# X_resampled, y_resampled = ros.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


# Save training and testing data
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train)
np.save('X_test.npy', X_test)
np.save('y_test.npy', y_test)

feature_names = num_cols + enc.get_feature_names_out().tolist()
np.save("feature_names.npy", np.array(feature_names))

In [62]:
np.sum(y_resampled)

16856.0

In [63]:
X_train.shape

(26969, 14)

In [64]:
X_test.shape

(6743, 14)

In [69]:
df['male'].sum()

10550.0