In [None]:
import pandas as pd
import numpy   as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  RandomForestRegressor
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import multiprocessing as mp
import pickle
import math
from sklearn.neural_network import MLPRegressor
from Data_info import *
from math import pi, sqrt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)


In [None]:
# Metric is Demographic Parity (DP1). Code source https://github.com/steven7woo/fair_regression_reduction/tree/master based on: Fair Regression: Quantitative Definitions and Reduction-based Algorithms https://arxiv.org/abs/1905.12843

def get_histogram(pred, theta_indices):
    """Count how many predictions fall into the bin opened by each support point.

    :param pred: array-like of predicted values.
    :param theta_indices: pandas Series of ascending bin left-edges.
    :return: pandas Series of counts indexed by ``theta_indices``.
    """
    # One extra right edge just past the last support point closes the final bin.
    closing_edge = theta_indices.iloc[-1] + 1
    bin_edges = np.append(theta_indices, closing_edge)
    counts = np.histogram(pred, bins=bin_edges)[0]
    return pd.Series(counts, index=theta_indices)

def calc_p1(pred, sensitive_features):
    """Demographic-parity disparity (DP1) of a set of predictions.

    For every value of the sensitive attribute, the empirical CDF of that
    group's predictions is compared with the CDF of all predictions; the
    largest absolute gap over all groups and support points is returned.

    :param pred: array of predictions, indexable by a boolean mask.
    :param sensitive_features: array of group labels aligned with ``pred``.
    :return: the maximum CDF gap (0 when no group deviates).
    """
    # The support is the set of distinct predicted values.
    support = pd.Series(np.unique(pred))

    overall_hist = get_histogram(pred, support)
    overall_pmf = overall_hist / overall_hist.sum()

    worst_gap = 0
    for group in np.unique(sensitive_features):
        group_hist = get_histogram(pred[sensitive_features == group], support)
        group_pmf = group_hist / group_hist.sum()
        cdf_gap = np.abs(np.cumsum(overall_pmf) - np.cumsum(group_pmf))
        worst_gap = max(worst_gap, np.max(cdf_gap))

    return worst_gap



# Metrics are independence and separation. Code source https://dalex.drwhy.ai/python-dalex-fairness-regression.html based on: Fairness Measures for Regression via Probabilistic Classification https://arxiv.org/pdf/2001.06089.pdf

def calculate_p4_m1(y, y_hat, protected, privileged):
    """Independence / separation / sufficiency ratios via probabilistic classification.

    For each unprivileged subgroup, three logistic regressions predict
    membership of the privileged group from the standardised prediction, the
    standardised target, and both together. The fitted probabilities are
    combined into density-ratio estimates.

    :param y: true targets (array-like).
    :param y_hat: model outputs aligned with ``y`` (array-like).
    :param protected: group label of every sample (array-like).
    :param privileged: label of the reference (privileged) group.
    :return: DataFrame indexed by ``subgroup`` with columns ``independence``,
        ``separation`` and ``sufficiency``; one row per unprivileged subgroup.
    """
    # Accept lists / pandas objects as well as ndarrays.
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    protected = np.asarray(protected)

    unique_protected = np.unique(protected)
    unique_unprivileged = unique_protected[unique_protected != privileged]

    # Collect plain records and build the frame once, instead of growing a
    # DataFrame with pd.concat on every iteration.
    rows = []

    for unprivileged in unique_unprivileged:
        # Keep only the privileged group and the current unprivileged subgroup.
        pair_mask = np.isin(protected, [privileged, unprivileged])
        y_pair = y[pair_mask]
        s_pair = y_hat[pair_mask]

        # Standardise and shape as single-feature design matrices.
        y_u = ((y_pair - y_pair.mean()) / y_pair.std()).reshape(-1, 1)
        s_u = ((s_pair - s_pair.mean()) / s_pair.std()).reshape(-1, 1)
        ys_u = np.c_[y_u, s_u]

        # Binary target: 1 for the privileged group, 0 otherwise.
        a = np.where(protected[pair_mask] == privileged, 1, 0)

        pred_p_s = LogisticRegression().fit(s_u, a).predict_proba(s_u)[:, 1]
        pred_p_y = LogisticRegression().fit(y_u, a).predict_proba(y_u)[:, 1]
        pred_p_ys = LogisticRegression().fit(ys_u, a).predict_proba(ys_u)[:, 1]

        n = len(a)

        r_ind = ((n - a.sum()) / a.sum()) * (pred_p_s / (1 - pred_p_s)).mean()

        try:
            r_sep = ((pred_p_ys / (1 - pred_p_ys) * (1 - pred_p_y) / pred_p_y)).mean()
            r_suf = ((pred_p_ys / (1 - pred_p_ys)) * ((1 - pred_p_s) / pred_p_s)).mean()
        except ArithmeticError:
            # Only numeric failures are tolerated; anything else (including
            # KeyboardInterrupt) must propagate instead of being swallowed.
            print('Error occured')
            r_sep = -1
            r_suf = -1

        rows.append({'subgroup': unprivileged,
                     'independence': r_ind,
                     'separation': r_sep,
                     'sufficiency': r_suf})

    # TODO: a reference row for the privileged group (all ratios equal to 1)
    # should be appended here, but it adds blanks to the plots.

    data = pd.DataFrame(rows, columns=['subgroup', 'independence', 'separation', 'sufficiency'])
    return data.set_index('subgroup')



# Metric is Demographic Parity with Wasserstein Barycenters  (DP2). Code source https://github.com/lucaoneto/NIPS2020_Fairness based on: Fair Regression with Wasserstein Barycenters https://arxiv.org/pdf/2006.07286

def f_err(YT, YF):
    """Return the mean squared difference between ``YT`` and ``YF``."""
    residual = YT - YF
    return np.mean(residual ** 2)


def calculate_p2(Y, S):
    """Largest gap between the per-group empirical CDFs of ``Y`` (DP2).

    The CDF of every group in ``S`` is evaluated on 1000 evenly spaced
    thresholds spanning the range of ``Y``. The result is the maximum, over
    thresholds and over all pairs of groups, of the absolute CDF difference.
    With exactly two groups this is the two-sample Kolmogorov-Smirnov distance
    on that grid; with a single group it is 0.

    :param Y: predictions, array-like; 1-D or a column vector.
    :param S: group labels aligned with ``Y``.
    :return: disparity in [0, 1].
    :raises ValueError: if ``Y`` is empty.
    """
    # Flatten so that (N, 1) model outputs are handled like 1-D arrays.
    Y = np.asarray(Y).ravel()
    S = np.asarray(S).ravel()

    thresholds = np.linspace(Y.min(), Y.max(), 1000)

    cdfs = []
    for label in np.unique(S):
        members = np.sort(Y[S == label])
        # searchsorted(..., side='right') counts members <= each threshold.
        cdfs.append(np.searchsorted(members, thresholds, side='right') / members.size)
    cdfs = np.stack(cdfs)

    # max over groups minus min over groups equals the largest pairwise gap,
    # so no group is ignored when there are more than two.
    return np.max(cdfs.max(axis=0) - cdfs.min(axis=0))




# Metric is Demographic Parity with Renyi correlation (DP3). Code source https://github.com/criteo-research/continuous-fairness based on: Fairness-Aware Learning for Continuous Attributes and Treatments https://proceedings.mlr.press/v97/mary19a/mary19a.pdf

class kde:
    """
    Gaussian kernel density estimator written with torch operations, so the
    density stays differentiable inside a torch optimisation.

    KDEs scale poorly with the number of dimensions and this implementation is
    not optimised.
    """
    def __init__(self, x_train):
        # x_train must be 2-D: one row per sample, one column per dimension.
        self.n, self.d = x_train.shape

        # Bandwidth formula depends only on sample count and dimensionality.
        self.bandwidth = (self.n * (self.d + 2) / 4.) ** (-1. / (self.d + 4))
        self.std = self.bandwidth

        self.train_x = x_train

    def pdf(self, x):
        """Evaluate the estimated density at every point of ``x``.

        :param x: tensor whose last axis has size ``self.d``; leading axes are
            treated as batch axes.
        :return: tensor with the shape of ``x`` minus its last axis.
        """
        *batch_dims, feature_dim = x.shape
        assert feature_dim == self.d

        # Insert an axis so each query point broadcasts against all samples.
        queries = x.unsqueeze(-2)
        references = _unsqueeze_multiple_times(self.train_x, 0, len(batch_dims))

        squared_distance = (queries - references).norm(dim=-1) ** 2
        kernel = torch.exp(-(squared_distance / (self.bandwidth ** 2) / 2))

        return kernel.mean(dim=-1) / sqrt(2 * pi) / self.bandwidth


def _unsqueeze_multiple_times(input, axis, times):
    """
    Utils function to unsqueeze tensor to avoid cumbersome code
    :param input: A pytorch Tensor of dimensions (D_1,..., D_k)
    :param axis: the axis to unsqueeze repeatedly
    :param times: the number of repetitions of the unsqueeze
    :return: the unsqueezed tensor. ex: dimensions (D_1,... D_i, 0,0,0, D_{i+1}, ... D_k) for unsqueezing 3x axis i.
    """
    output = input
    for i in range(times):
        output = output.unsqueeze(axis)
    return output

# Independence of 2 variables
def _joint_2(X, Y, density, damping=1e-10):
    X = (X - X.mean()) / X.std()
    Y = (Y - Y.mean()) / Y.std()
    data = torch.cat([X.unsqueeze(-1), Y.unsqueeze(-1)], -1)
    joint_density = density(data)

    nbins = int(min(50, 5. / joint_density.std))
    #nbins = np.sqrt( Y.size/5 )
    x_centers = torch.linspace(-2.5, 2.5, nbins)
    y_centers = torch.linspace(-2.5, 2.5, nbins)

    xx, yy = torch.meshgrid([x_centers, y_centers])
    grid = torch.cat([xx.unsqueeze(-1), yy.unsqueeze(-1)], -1)
    h2d = joint_density.pdf(grid) + damping
    h2d /= h2d.sum()
    return h2d


def calculate_p3(X, Y, damping = 1e-10):
    """
    Estimate the Hirschfeld-Gebelein-Renyi maximum correlation coefficient
    using Witsenhausen's characterisation: HGR(x, y) is the second largest
    singular value of the normalised joint density of (x, y). The density is
    estimated with ``kde`` and discretised on a grid.

    :param X: A torch 1-D Tensor
    :param Y: A torch 1-D Tensor
    :param damping: small constant added to the discretised density
    :return: numerical value between 0 and 1 (0: independent, 1: linked by a deterministic equation)
    """
    joint = _joint_2(X, Y, kde, damping=damping)
    row_marginal = joint.sum(dim=1, keepdim=True)
    col_marginal = joint.sum(dim=0, keepdim=True)
    Q = joint / (torch.sqrt(row_marginal) * torch.sqrt(col_marginal))
    # Singular values come back in descending order; index 1 is the second one.
    singular_values = torch.svd(Q)[1]
    return singular_values[1]



def _joint_3(X, Y, Z, density, damping=1e-10):
    X = (X - X.mean()) / X.std()
    Y = (Y - Y.mean()) / Y.std()
    Z = (Z - Z.mean()) / Z.std()
    data = torch.cat([X.unsqueeze(-1), Y.unsqueeze(-1), Z.unsqueeze(-1)], -1)
    joint_density = density(data)  # + damping

    nbins = int(min(50, 5. / joint_density.std))
    x_centers = torch.linspace(-2.5, 2.5, nbins)
    y_centers = torch.linspace(-2.5, 2.5, nbins)
    z_centers = torch.linspace(-2.5, 2.5, nbins)
    xx, yy, zz = torch.meshgrid([x_centers, y_centers, z_centers])
    grid = torch.cat([xx.unsqueeze(-1), yy.unsqueeze(-1), zz.unsqueeze(-1)], -1)

    h3d = joint_density.pdf(grid) + damping
    h3d /= h3d.sum()
    return h3d


def calculate_m2(X, Y, Z):
    """
    Estimate z -> HGR(x|z, y|z), where HGR is the Hirschfeld-Gebelein-Renyi
    maximum correlation coefficient computed with Witsenhausen's
    characterisation (second largest singular value of the normalised joint
    density). The three-way density is estimated with ``kde`` and discretised
    on a grid; one coefficient is computed per grid slice along Z.

    :param X: A torch 1-D Tensor
    :param Y: A torch 1-D Tensor
    :param Z: A torch 1-D Tensor
    :return: numpy array with one value per Z grid bin
        (0: independent, 1: linked by a deterministic equation)
    """
    damping = 1e-10
    joint = _joint_3(X, Y, Z, kde, damping=damping)
    marginal_xz = joint.sum(dim=1, keepdim=True)
    marginal_yz = joint.sum(dim=0, keepdim=True)
    Q = joint / (torch.sqrt(marginal_xz) * torch.sqrt(marginal_yz))

    second_singular_values = []
    for z_bin in range(Q.shape[2]):
        # torch.svd returns (U, S, V); S is in descending order.
        second_singular_values.append(torch.svd(Q[:, :, z_bin])[1][1])
    return np.array(second_singular_values)



def calc_all_metrics(pred,sens):
    """Print and return the accuracy and fairness metrics for one model and one protected attribute.

    Relies on the notebook-level ``data_class`` object for the reference
    protected-attribute arrays, the ``*_val`` targets and the scaling helpers.

    :param pred: model predictions; any array-like, 1-D or a column vector.
    :param sens: one of the ``data_class.protected_*`` arrays.
    :return: ``[group, mse, p1, p2, p3, p4, m1, m2, err_abs_p1, err_abs_p2,
        err_abs_p3, err_abs_p4, uniqe_val]``, or ``None`` (after printing a
        message) when ``sens`` matches none of the known attributes.
    """
    # Some models return (N, 1) column vectors; everything below (set(), boolean
    # masking, 1-D torch tensors) needs a flat vector.
    pred = np.asarray(pred).ravel()

    # Identify the attribute and binarise it (0 = the named group, 1 = rest).
    if (sens == data_class.protected_Hispanic_vs_other).all() :
        sens = np.where(sens == 'Hispanic', 0, 1)
        group = 'Hispanic_vs_other'
    elif (sens == data_class.protected_white_none_Hispanic_vs_other).all() :
        sens = np.where(sens == 'Other', 0, 1)
        group = 'white_none_Hispanic_vs_other'
    elif (sens == data_class.protected_white_vs_other).all() :
        sens = np.where(sens == 'None White', 0, 1)
        group = 'white_vs_other'
    else:
        print('Sensitive  attribute not found')
        return


    # Count distinct values before de-scaling.
    uniqe_val = len(set(pred))
    mse = data_class.calc_mse_percent_val(pred)
    pred = data_class.get_pred_descalled_from_perecent(pred)
    err_abs = abs(pred - data_class.y_percentage_val)

    print('group =', group)
    # NOTE(review): printed as RMSE but stored under the name mse - confirm
    # which quantity calc_mse_percent_val returns.
    print('RMSE =', mse)


    # Fairness metrics computed on the predictions.
    p1 = calc_p1(pred,sens)
    print('P1 =',p1)

    p2 = calculate_p2(pred,sens)
    print('P2 =',p2)

    p3 = float(calculate_p3((torch.Tensor(pred)),torch.Tensor(sens)))
    print('P3 =',p3)

    pm = calculate_p4_m1(data_class.y_percentage_val, pred, sens, 1)

    p4 = round(pm['independence'].values[0],2)
    m1 = round(pm['separation'].values[0],2)
    print('P4 =',p4)
    print('M1 =',m1)

    m2 =  np.max(calculate_m2(torch.Tensor(pred),torch.Tensor(sens),torch.Tensor(data_class.y_percentage_val)))
    print('M2 =',m2)



    # The same metrics computed on the absolute error instead of the prediction.
    err_abs_p1 = calc_p1(err_abs,sens)
    print('err_abs P1 =',err_abs_p1)

    err_abs_p2 = calculate_p2(err_abs,sens)
    print('err_abs P2 =',err_abs_p2)

    err_abs_p3 = float(calculate_p3((torch.Tensor(err_abs)),torch.Tensor(sens)))
    print('err_abs P3 =',err_abs_p3)

    pm = calculate_p4_m1(data_class.y_percentage_val, err_abs, sens, 1)
    err_abs_p4 = round(pm['independence'].values[0],2)
    print('err_abs P4 =',err_abs_p4)


    print()

    return [group, mse, p1, p2, p3, p4, m1, m2,err_abs_p1 ,err_abs_p2 , err_abs_p3, err_abs_p4,uniqe_val ]



### Loading and processing the data

In [None]:
# Load the raw modelling table from the parent directory.
data_df_orginal = pd.read_csv("../Ml_ready_all_percent.csv")


In [None]:
# Wrap the raw table in the project's Data_info helper; later cells read the
# scaled splits, protected-attribute arrays and scoring helpers from it.
data_class = Data_info(data_df_orginal)

In [None]:
# Accumulates one result row per (model, protected attribute) pair.
fairness_results = []
# The three protected-attribute encodings every model is evaluated against.
sens_groups = [data_class.protected_white_vs_other, data_class.protected_Hispanic_vs_other, data_class.protected_white_none_Hispanic_vs_other]

### XGBoost

In [None]:
# Grid-search XGBoost hyper-parameters: every combination is trained on the
# train split and scored on the test split; the best model so far is pickled.
lowest_mse = math.inf
max_depth = [4, 6, 12]
n_estimators = [25, 50]
learning_rate = [0.1, 0.01]


for max_d in max_depth:
    for n_estim in n_estimators:
        for lr in learning_rate:
            xgb_model_orginal = xgb.XGBRegressor(n_estimators=n_estim, max_depth=max_d, eta=lr)
            xgb_model_orginal.fit(data_class.X_train_scaled, data_class.y_percentage_train_scaled)
            y_pred = xgb_model_orginal.predict(data_class.X_test_scaled)
            mse = data_class.calc_mse_percent_test(y_pred)
            if mse < lowest_mse:
                # The context manager closes (and flushes) the file even if
                # pickling fails, instead of leaking the handle.
                with open('Orginal_experment_models_and_reults/xgb_model', 'wb') as model_file:
                    pickle.dump(xgb_model_orginal, model_file)
                lowest_mse = mse
                xgb_best_depth, xgb_best_n, xgb_best_lr = max_d, n_estim, lr
                print("Best mse=",lowest_mse, "The best hyperparameters are: ",'best_depth=',xgb_best_depth, 'best_n=',xgb_best_n, 'best_lr=',xgb_best_lr)
            else:
                print("Skipped")


### XGBoost evaluation

In [None]:
# Reload the best XGBoost model, predict on the validation split, save the
# predictions and compute the fairness metrics for every protected attribute.
# The pickle is produced by the grid-search cell above; never unpickle files
# from an untrusted source.
with open('Orginal_experment_models_and_reults/xgb_model', 'rb') as model_file:
    xgb_model_orginal = pickle.load(model_file)
y_pred = xgb_model_orginal.predict(data_class.X_val_scaled)
np.savetxt("Orginal_experment_models_and_reults/xgb_model.csv", y_pred, delimiter=',')

for group in sens_groups:
    results = calc_all_metrics(y_pred, group)
    results.append('xgb')
    fairness_results.append(results)


### Random Forest

In [None]:
# Grid-search random-forest hyper-parameters: train on the train split, score
# on the test split, pickle the best model so far.
lowest_mse = math.inf

max_depth = [4, 6, 12]
n_estimators = [50, 100, 200]

for max_d in max_depth:
    for n_estim in n_estimators:
        RF_model_orginal = RandomForestRegressor(max_depth=max_d, n_estimators=n_estim, n_jobs= mp.cpu_count())
        RF_model_orginal.fit(data_class.X_train_scaled, data_class.y_percentage_train_scaled)
        y_pred = RF_model_orginal.predict(data_class.X_test_scaled)
        mse = data_class.calc_mse_percent_test(y_pred)
        if mse < lowest_mse:
            # Close the file deterministically instead of leaking the handle.
            with open('Orginal_experment_models_and_reults/RF_model', 'wb') as model_file:
                pickle.dump(RF_model_orginal, model_file)
            lowest_mse = mse
            RF_best_depth,  RF_best_n = max_d, n_estim
            print("Best mse=",lowest_mse, "The best hyperparameters are: ",'best_depth=',RF_best_depth, 'best_n=',RF_best_n)
        else:
            print("Skipped")

In [None]:
# Reload the best random forest, predict on the validation split, save the
# predictions and compute the fairness metrics for every protected attribute.
# The pickle is produced by the grid-search cell above; never unpickle files
# from an untrusted source.
with open('Orginal_experment_models_and_reults/RF_model', 'rb') as model_file:
    RF_model_orginal = pickle.load(model_file)
y_pred = RF_model_orginal.predict(data_class.X_val_scaled)

np.savetxt("Orginal_experment_models_and_reults/RF_model.csv", y_pred, delimiter=',')

for group in sens_groups:
    results = calc_all_metrics(y_pred, group)
    results.append('RF')
    fairness_results.append(results)

### TabNet

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression so the notebook displays the selected device.
device

In [None]:
# Grid-search TabNet width / number of steps: train on the train split (the
# test split is the early-stopping eval set), score on the test split and
# pickle the best model so far.
lowest_mse = math.inf

max_width = [8, 16]
n_steps = [3, 6]

for max_w in max_width:
    for n_s in n_steps :
        tbn_model_orginal = TabNetRegressor( n_d=max_w, n_a=max_w, n_steps=n_s, device_name=device, verbose=1)
        tbn_model_orginal.fit(data_class.X_train_scaled,data_class.y_percentage_train_scaled.reshape(-1,1), patience=10,eval_set=[(data_class.X_test_scaled, data_class.y_percentage_test_scaled.reshape(-1,1))])
        y_pred = tbn_model_orginal.predict(data_class.X_test_scaled)
        # TabNet returns an (N, 1) array; flatten it so the scoring helper gets
        # the same 1-D input it receives from the other models.
        mse = data_class.calc_mse_percent_test(y_pred.flatten())
        if mse < lowest_mse:
            # Close the file deterministically instead of leaking the handle.
            with open('Orginal_experment_models_and_reults/tbn_model', 'wb') as model_file:
                pickle.dump(tbn_model_orginal, model_file)
            lowest_mse = mse
            TBN_best_width, TBN_best_n = max_w, n_s
            print("Best mse=",lowest_mse, "The best hyperparameters are: ",'best_width=',TBN_best_width, 'best_n=',TBN_best_n)
        else:
            print("Skipped")

# Best mse= 0.16364967155661278 The best hyperparameters are:  best_width= 16 best_n= 6

In [None]:
# Reload the best TabNet model, predict on the validation split, save the
# predictions and compute the fairness metrics for every protected attribute.
# The pickle is produced by the grid-search cell above; never unpickle files
# from an untrusted source.
with open('Orginal_experment_models_and_reults/tbn_model', 'rb') as model_file:
    tbn_model_orginal = pickle.load(model_file)
y_pred = tbn_model_orginal.predict(data_class.X_val_scaled)

np.savetxt("Orginal_experment_models_and_reults/TBN_model.csv", y_pred, delimiter=',')

for group in sens_groups:
    # TabNet predictions are (N, 1); the metrics expect a flat vector.
    results = calc_all_metrics(y_pred.flatten(), group)
    results.append('TBN')
    fairness_results.append(results)

### MLP

In [None]:
# Grid-search MLP architecture / L2 penalty: train on the train split, score
# on the test split, pickle the best model so far.
lowest_mse = math.inf

alpha= [0.001, 0.01, 0.1]
hidden_layer_sizes=[(256, ), (256, 128, ), (256, 128, 64, ), (256, 128, 64, 32 )]

for hidden in hidden_layer_sizes:
    for al in alpha :
        mlp_model_orginal = MLPRegressor( hidden_layer_sizes=hidden,verbose=False, alpha=al, early_stopping=True)
        mlp_model_orginal.fit(data_class.X_train_scaled, data_class.y_percentage_train_scaled)
        y_pred = mlp_model_orginal.predict(data_class.X_test_scaled)
        mse = data_class.calc_mse_percent_test(y_pred)
        if mse < lowest_mse:
            # Close the file deterministically instead of leaking the handle.
            with open('Orginal_experment_models_and_reults/mlp_model', 'wb') as model_file:
                pickle.dump(mlp_model_orginal, model_file)
            lowest_mse = mse
            mlp_best_hidden, mlp_best_al = hidden, al
            print("Best mse=",lowest_mse, "The best hyperparameters are: ",'best_hidden=',mlp_best_hidden, 'best_al=',mlp_best_al)
        else:
            print("Skipped")

# Best mse= 0.17042182865687036 The best hyperparameters are:  best_hidden= (256, 128, 64) best_al= 0.001


In [None]:
# Reload the best MLP, predict on the validation split, save the predictions
# and compute the fairness metrics for every protected attribute.
# The pickle is produced by the grid-search cell above; never unpickle files
# from an untrusted source.
with open('Orginal_experment_models_and_reults/mlp_model', 'rb') as model_file:
    mlp_model_orginal = pickle.load(model_file)
y_pred = mlp_model_orginal.predict(data_class.X_val_scaled)

np.savetxt("Orginal_experment_models_and_reults/MLP_model.csv", y_pred, delimiter=',')

for group in sens_groups:
    results = calc_all_metrics(y_pred, group)
    results.append('MLP')
    fairness_results.append(results)

### LSTM

In [None]:
def _reshape_for_lstm(X_scaled, columns, static_cols, years):
    """Turn a flat feature matrix into a (samples, years, features) sequence array.

    Each time step holds the static columns followed by every column whose
    name contains ``"<year>_"`` (case-insensitive), in their original order.

    :param X_scaled: 2-D array of scaled features, one row per sample.
    :param columns: column names matching the columns of ``X_scaled``.
    :param static_cols: names of the columns repeated at every time step.
    :param years: iterable of years, one time step per year.
    :return: ndarray of shape (n_samples, n_years, n_step_features).
    :raises ValueError: if the years do not all match the same number of columns.
    """
    frame = pd.DataFrame(X_scaled, columns=columns)
    static_block = frame[static_cols].values

    steps = []
    for year in years:
        year_mask = frame.columns.str.contains(f'{year}_', case=False)
        # Whole-column slicing replaces the per-row pd.concat of the original.
        steps.append(np.concatenate([static_block, frame.loc[:, year_mask].values], axis=1))

    return np.stack(steps, axis=1)


# Property attributes that do not change from year to year.
_house_cols = ['beds_redfin', 'baths_redfin', 'sqft_redfin']

# Each split uses its own appraisal year, house-age column and nine-year window.
X_train_reshaped_LSTM = _reshape_for_lstm(
    data_class.X_train_scaled, data_class.X_train.columns,
    _house_cols + ['appraised_val_2020', 'house_age_train'], range(2010, 2019))

X_test_reshaped_LSTM = _reshape_for_lstm(
    data_class.X_test_scaled, data_class.X_test.columns,
    _house_cols + ['appraised_val_2021', 'house_age_test'], range(2011, 2020))

X_val_reshaped_LSTM = _reshape_for_lstm(
    data_class.X_val_scaled, data_class.X_val.columns,
    _house_cols + ['appraised_val_2022', 'house_age_val'], range(2012, 2021))


In [None]:
# Write the reshaped arrays into Data_reshaped/, the directory the following
# cell loads them from (they were previously written to the working directory).
import os

os.makedirs('Data_reshaped', exist_ok=True)
np.save('Data_reshaped/X_train_reshaped_LSTM.npy', X_train_reshaped_LSTM)
np.save('Data_reshaped/X_test_reshaped_LSTM.npy', X_test_reshaped_LSTM)
np.save('Data_reshaped/X_val_reshaped_LSTM.npy', X_val_reshaped_LSTM)

In [None]:
# Load the cached sequence arrays from Data_reshaped/.
X_train_reshaped_LSTM = np.load('Data_reshaped/X_train_reshaped_LSTM.npy')
X_test_reshaped_LSTM = np.load('Data_reshaped/X_test_reshaped_LSTM.npy')
X_val_reshaped_LSTM = np.load('Data_reshaped/X_val_reshaped_LSTM.npy')

# Pair every sequence array with the scaled targets of the same split.
X_train = np.array(X_train_reshaped_LSTM)
y_train = data_class.y_percentage_train_scaled

X_test = np.array(X_test_reshaped_LSTM)
y_test = data_class.y_percentage_test_scaled

X_val = np.array(X_val_reshaped_LSTM)
y_val = data_class.y_percentage_val_scaled

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class LSTM(nn.Module):
    """LSTM regressor: the hidden state of the last time step feeds a linear head."""

    def __init__(self, input_size, hidden_size = 2, output_size=1, num_layers=1):
        """
        :param input_size: number of features per time step.
        :param hidden_size: size of the LSTM hidden state.
        :param output_size: number of regression targets.
        :param num_layers: number of stacked LSTM layers.
        """
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a (batch, seq_len, input_size) tensor to (batch, output_size)."""
        # Zero initial states, created on the same device as the input.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)
        lstm_out, _ = self.lstm(x, (h0, c0))
        # Only the last time step is used for the prediction.
        output = self.fc(lstm_out[:, -1, :])
        return output

    def fit(self, X_train, y_train, epochs=25, learning_rate=0.00001, batch_size=64,  device='cpu'):
        """Train with Adam on the MSE loss and print the last batch loss of every epoch.

        :param X_train: numpy array (n_samples, seq_len, input_size).
        :param y_train: numpy array (n_samples,) or (n_samples, output_size).
        :param epochs: number of passes over the data.
        :param learning_rate: Adam learning rate.
        :param batch_size: mini-batch size.
        :param device: torch device used for both the model and the data.
        """
        # The parameters must live on the same device as the data; move the
        # model before the optimizer captures its parameters.
        self.to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        features = torch.from_numpy(X_train).float().to(device)
        # Shape the targets like the model output, (n_samples, output_size).
        # A flat (n_samples,) target would broadcast against the (batch, 1)
        # output inside MSELoss and yield a (batch, batch) loss.
        targets = torch.from_numpy(y_train).float().reshape(len(X_train), -1).to(device)

        dataset = TensorDataset(features, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(epochs):
            self.train()
            for inputs, labels in dataloader:
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.6f}')

    def predict(self, X_test, device='cpu'):
        """Return predictions for ``X_test`` as a numpy array (n_samples, output_size).

        :param X_test: numpy array (n_samples, seq_len, input_size).
        :param device: torch device used for inference.
        """
        self.to(device)
        dataset = TensorDataset(torch.from_numpy(X_test).float().to(device))
        dataloader = DataLoader(dataset, batch_size=64)
        predictions = []

        self.eval()
        with torch.no_grad():
            # A single-tensor TensorDataset yields 1-tuples.
            for (inputs,) in dataloader:
                outputs = self(inputs)
                predictions.append(outputs.cpu().numpy())

        return np.concatenate(predictions, axis=0)



In [None]:
# Train the LSTM on whichever device was selected earlier instead of a
# hard-coded 'cuda', so the cell also runs on CPU-only machines.
input_size = X_train.shape[2]
# Put the model on the device its training data will be moved to.
LSTMPredictor = LSTM(input_size).to(device)
# Targets are passed as a column so they line up with the (N, 1) model output
# inside the MSE loss instead of broadcasting against it.
LSTMPredictor.fit(X_train, y_train.reshape(-1, 1), device=device)

y_pred = LSTMPredictor.predict(X_val, device=device)

In [None]:
np.savetxt("Orginal_experment_models_and_reults/lstm_model.csv", y_pred, delimiter=',')
# Keep the score as the cell's last expression so the notebook displays it;
# as a non-final statement its value was silently discarded.
data_class.calc_mse_percent_val(y_pred.flatten())

In [None]:
# The LSTM returns (N, 1) predictions; the metrics expect a flat vector.
calc_all_metrics(y_pred.flatten(), sens_groups[0])

In [None]:
# Fairness metrics of the LSTM for every protected attribute.
for group in sens_groups:
    # The LSTM returns (N, 1) predictions; the metrics expect a flat vector.
    results = calc_all_metrics(y_pred.flatten(), group)
    results.append('LSTM')
    fairness_results.append(results)

### GRU

In [None]:
class GRU(nn.Module):
    """GRU regressor: the hidden state of the last time step feeds a linear head."""

    def __init__(self, input_size, hidden_size=2, output_size=1, num_layers=1):
        """
        :param input_size: number of features per time step.
        :param hidden_size: size of the GRU hidden state.
        :param output_size: number of regression targets.
        :param num_layers: number of stacked GRU layers.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a (batch, seq_len, input_size) tensor to (batch, output_size)."""
        # The initial state must be on the input's device; a CPU-only h0 fails
        # as soon as the model runs on a GPU.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)
        gru_out, _ = self.gru(x, h0)
        # Only the last time step is used for the prediction.
        output = self.fc(gru_out[:, -1, :])
        return output

    def fit(self, X_train, y_train, epochs=25, learning_rate=0.00001, batch_size=64, device='cpu'):
        """Train with Adam on the MSE loss and print the last batch loss of every epoch.

        :param X_train: numpy array (n_samples, seq_len, input_size).
        :param y_train: numpy array (n_samples,) or (n_samples, output_size).
        :param epochs: number of passes over the data.
        :param learning_rate: Adam learning rate.
        :param batch_size: mini-batch size.
        :param device: torch device used for both the model and the data.
        """
        # Honour the device argument: move the model first, then the data.
        self.to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        features = torch.from_numpy(X_train).float().to(device)
        # Shape the targets like the model output, (n_samples, output_size).
        # A flat (n_samples,) target would broadcast against the (batch, 1)
        # output inside MSELoss and yield a (batch, batch) loss.
        targets = torch.from_numpy(y_train).float().reshape(len(X_train), -1).to(device)

        dataset = TensorDataset(features, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(epochs):
            # predict() leaves the module in eval mode; switch back each epoch.
            self.train()
            for inputs, labels in dataloader:
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.6f}')

    def predict(self, X_test, device='cpu'):
        """Return predictions for ``X_test`` as a numpy array (n_samples, output_size).

        :param X_test: numpy array (n_samples, seq_len, input_size).
        :param device: torch device used for inference.
        """
        self.to(device)
        dataset = TensorDataset(torch.from_numpy(X_test).float().to(device))
        dataloader = DataLoader(dataset, batch_size=64)
        predictions = []

        self.eval()
        with torch.no_grad():
            # A single-tensor TensorDataset yields 1-tuples.
            for (inputs,) in dataloader:
                outputs = self(inputs)
                predictions.append(outputs.cpu().numpy())

        return np.concatenate(predictions, axis=0)


In [None]:
# Train the GRU, predict on the validation sequences and save the predictions.
input_size = X_train.shape[2]
GRUPredictor = GRU(input_size)
# Targets are passed as a column so they line up with the (N, 1) model output
# inside the MSE loss instead of broadcasting against it.
GRUPredictor.fit(X_train, y_train.reshape(-1, 1))
y_pred = GRUPredictor.predict(X_val)


np.savetxt("Orginal_experment_models_and_reults/gru_model.csv", y_pred, delimiter=',')

In [None]:
# Fairness metrics of the GRU for every protected attribute.
for group in sens_groups:
    # The GRU returns (N, 1) predictions; the metrics expect a flat vector.
    results = calc_all_metrics(y_pred.flatten(), group)
    results.append('GRU')
    fairness_results.append(results)

In [None]:
# Recompute every metric from the prediction CSVs written by the cells above,
# so the summary table does not depend on which training cells were re-run.
# Each entry is the file-name prefix of "<prefix>_model.csv".
models = ['xgb','RF','TBN','MLP','lstm','gru']
# Rows appended to fairness_results by earlier cells are discarded here.
fairness_results = []
for model in models:
    y_pred = np.loadtxt(f"Orginal_experment_models_and_reults/{model}_model.csv", delimiter=',')
    for group in sens_groups:
        # flatten() guarantees a 1-D vector for the metric functions.
        results = calc_all_metrics(y_pred.flatten(), group)
        results.append(model)
        fairness_results.append(results)

In [None]:
# Column names follow the order of the list returned by calc_all_metrics,
# plus the model name appended by the calling loop.
cols = ['Group','RMSE','P1','P2','P3','P4','M1','M2','err_abs_p1' ,'err_abs_p2' , 'err_abs_p3', 'err_abs_p4','Unique values', 'Model']
metrics_orginal = pd.DataFrame(fairness_results,columns=cols)
# Display with the model name first.
metrics_orginal[[ 'Model', 'Group','RMSE','P1','P2','P3','P4','M1','M2','err_abs_p1' ,'err_abs_p2' , 'err_abs_p3', 'err_abs_p4','Unique values']]

In [None]:
# Persist the summary table (model name first, no index column).
metrics_orginal[[ 'Model', 'Group','RMSE','P1','P2','P3','P4','M1','M2','err_abs_p1' ,'err_abs_p2' , 'err_abs_p3', 'err_abs_p4','Unique values']].to_csv('metrics_orginal.csv',index=False)

In [None]:
# Each entry pairs a protected-attribute array with the suffix that the next
# cell interpolates into the prediction file names.
groups = [[data_class.protected_white_vs_other,'none_white'],[ data_class.protected_Hispanic_vs_other,'Hispanic'],[ data_class.protected_white_none_Hispanic_vs_other, 'Other']]

In [None]:
# Compute the same metrics for the predictions stored under
# Correlation_removed_models_and_results/, one file per (model, group) pair.
models = ['xgb','rf','tbn','mlp','lstm','gru']
fairness_results2 = []
for model in models:    
    for group in groups:
        # group[1] is the file-name suffix, group[0] the protected-attribute array.
        y_pred = np.loadtxt(f"Correlation_removed_models_and_results/{model}_model_cr_results_{group[1]}.csv", delimiter=',')
        results = calc_all_metrics(y_pred, group[0])
        results.append(model)
        fairness_results2.append(results)

In [None]:
# Same column layout as the table built from fairness_results.
cols = ['Group','RMSE','P1','P2','P3','P4','M1','M2','err_abs_p1' ,'err_abs_p2' , 'err_abs_p3', 'err_abs_p4','Unique values', 'Model']
metrics_cr = pd.DataFrame(fairness_results2,columns=cols)
# Display with the model name first.
metrics_cr[[ 'Model', 'Group','RMSE','P1','P2','P3','P4','M1','M2','err_abs_p1' ,'err_abs_p2' , 'err_abs_p3', 'err_abs_p4','Unique values']] 