In [None]:
import pandas as pd
import numpy   as np
import xgboost as xgb
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import  RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import  mean_squared_error
from pytorch_tabnet.tab_model import TabNetRegressor
from fairlearn.preprocessing import CorrelationRemover
from collections import defaultdict
import torch
import multiprocessing as mp
import pickle
import math
import warnings
from sklearn.neural_network import MLPRegressor
from Data_info import *
# Suppress every FutureWarning for the rest of the session so cell output stays readable.
warnings.simplefilter(action='ignore', category=FutureWarning)


### Loading and processing the data

In [None]:
# Load the ML-ready dataset; the path is relative to the notebook's working directory.
data_df_orginal = pd.read_csv("../Ml_ready_all_percent.csv")
# Preview only the first rows rather than rendering the whole frame.
data_df_orginal.head()

In [None]:
# Wrap the raw frame in the project's Data_info helper (presumably provided by the
# star import from Data_info). Later cells read its scaled splits (X_*_scaled), the
# matching column indexes (X_*.columns), the protected-attribute vectors
# (protected_*) and the scaled percentage targets (y_percentage_*_scaled).
data_class = Data_info(data_df_orginal)

## Correlation Remover

In [None]:
# Rebuild a labelled frame from the scaled training matrix, attach the
# white-vs-other protected attribute as 'Race' and one-hot encode it.
X_train_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_train_scaled, columns=data_class.X_train.columns
    ).assign(Race=data_class.protected_white_vs_other)
)
X_train_new

In [None]:
# Correlation matrix on the original (pre-removal) training data.
race_col = 'Race_None White'
correlation_matrix = X_train_new.corr()

# Rank every column by |correlation| with the sensitive column and keep
# positions 1..19 (position 0 is presumably the sensitive column itself).
top_correlations = (
    correlation_matrix[race_col]
    .abs()
    .sort_values(ascending=False)
    .iloc[1:20]
)
print("Correlation between 'race' and the top 20 correlating columns:")
print(top_correlations)

In [None]:
# Fit the remover with `race_col` as the sensitive feature; its output holds
# every column except `race_col`, so label it with the remaining column names.
cr = CorrelationRemover(sensitive_feature_ids=[race_col])
X_cr = pd.DataFrame(
    cr.fit_transform(X_train_new),
    columns=X_train_new.columns.drop(race_col),
)
# Append the untouched sensitive column so correlations can be re-measured.
X_cr[race_col] = X_train_new[race_col]
X_cr

In [None]:
# Re-measure |correlation| with the sensitive column on the transformed data.
# Positions 1..24 are kept (position 0 is presumably the sensitive column itself).
correlation_matrix = X_cr.corr()
top_correlations = correlation_matrix[race_col].abs().sort_values(ascending=False).iloc[1:25]
# Label fixed: it previously read "rave" and claimed 10 columns while 24 are shown.
print("Correlation between 'race' and the top 24 correlating columns after correlation removal:")
top_correlations

In [None]:
# Show the first 11 ranked correlations as a one-column table.
top_correlations.to_frame().head(11)

In [None]:
# Build the test and validation frames exactly like the training frame:
# scaled features + white-vs-other protected attribute, one-hot encoded.
X_test_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_test_scaled, columns=data_class.X_test.columns
    ).assign(Race=data_class.protected_white_vs_other)
)
X_val_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_val_scaled, columns=data_class.X_val.columns
    ).assign(Race=data_class.protected_white_vs_other)
)

# Train: X_cr ends with `race_col` (appended after fitting); the column before it
# is presumably the other Race dummy - TODO confirm. Both are dropped.
X_train_fitted_none_white = np.array(X_cr.to_numpy()[:, :-2], dtype=float)
# Test/val: apply the remover fitted on train, then drop the last output column
# (presumably the remaining Race dummy - TODO confirm).
X_test_fitted_none_white = np.array(cr.transform(X_test_new)[:, :-1], dtype=float)
X_val_fitted_none_white = np.array(cr.transform(X_val_new)[:, :-1], dtype=float)

(
    X_train_fitted_none_white.shape,
    X_test_fitted_none_white.shape,
    X_val_fitted_none_white.shape,
)

Remove correlation for the second sensitive group (Hispanic vs. other)

In [None]:
# Training frame for the Hispanic-vs-other protected attribute, one-hot encoded.
race_col = 'Race_Hispanic'
X_train_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_train_scaled, columns=data_class.X_train.columns
    ).assign(Race=data_class.protected_Hispanic_vs_other)
)

# Pre-removal |correlation| with the sensitive column, positions 1..19
# (position 0 is presumably the sensitive column itself).
correlation_matrix = X_train_new.corr()
top_correlations = (
    correlation_matrix[race_col]
    .abs()
    .sort_values(ascending=False)
    .iloc[1:20]
)
print("Correlation between 'race' and the top 20 correlating columns:")
top_correlations

In [None]:
# Show the first 11 ranked correlations as a one-column table.
top_correlations.to_frame().head(11)

In [None]:
# Fit the remover with `race_col` as the sensitive feature; its output holds
# every column except `race_col`, so label it with the remaining column names.
cr = CorrelationRemover(sensitive_feature_ids=[race_col])
X_cr = cr.fit_transform(X_train_new)
X_cr = pd.DataFrame(
    X_cr, columns=X_train_new.drop(columns=race_col).columns)
# Append the untouched sensitive column so correlations can be re-measured.
X_cr[race_col] = X_train_new[race_col]

# Positions 1..24 are kept (position 0 is presumably the sensitive column itself).
correlation_matrix = X_cr.corr()
top_correlations = correlation_matrix[race_col].abs().sort_values(ascending=False).iloc[1:25]

# Label fixed: it previously read "rave" and claimed 10 columns while 24 are shown.
print("Correlation between 'race' and the top 24 correlating columns after correlation removal:")
print(top_correlations)

In [None]:
# Show the first 11 ranked correlations as a one-column table.
top_correlations.to_frame().head(11)

In [None]:
# Build the test and validation frames exactly like the training frame:
# scaled features + Hispanic-vs-other protected attribute, one-hot encoded.
X_test_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_test_scaled, columns=data_class.X_test.columns
    ).assign(Race=data_class.protected_Hispanic_vs_other)
)
X_val_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_val_scaled, columns=data_class.X_val.columns
    ).assign(Race=data_class.protected_Hispanic_vs_other)
)

# Train: X_cr ends with `race_col` (appended after fitting); the column before it
# is presumably the other Race dummy - TODO confirm. Both are dropped.
X_train_fitted_Hispanic = np.array(X_cr.to_numpy()[:, :-2], dtype=float)
# Test/val: apply the remover fitted on train, then drop the last output column
# (presumably the remaining Race dummy - TODO confirm).
X_test_fitted_Hispanic = np.array(cr.transform(X_test_new)[:, :-1], dtype=float)
X_val_fitted_Hispanic = np.array(cr.transform(X_val_new)[:, :-1], dtype=float)

(
    X_train_fitted_Hispanic.shape,
    X_test_fitted_Hispanic.shape,
    X_val_fitted_Hispanic.shape,
)

Remove correlation for the third sensitive group (white non-Hispanic vs. other)

In [None]:
# Training frame for the white-non-Hispanic-vs-other protected attribute, one-hot encoded.
race_col = 'Race_Other'
X_train_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_train_scaled, columns=data_class.X_train.columns
    ).assign(Race=data_class.protected_white_none_Hispanic_vs_other)
)

# Correlation matrix on the original (pre-removal) data; keep positions 1..19
# of the ranking (position 0 is presumably the sensitive column itself).
correlation_matrix = X_train_new.corr()
top_correlations = (
    correlation_matrix[race_col]
    .abs()
    .sort_values(ascending=False)
    .iloc[1:20]
)
print("Correlation between 'race' and the top 20 correlating columns:")
print(top_correlations)

In [None]:
# Show the first 11 ranked correlations as a one-column table.
top_correlations.to_frame().head(11)

In [None]:
# Fit the remover with `race_col` as the sensitive feature; its output holds
# every column except `race_col`, so label it with the remaining column names.
cr = CorrelationRemover(sensitive_feature_ids=[race_col])
X_cr = cr.fit_transform(X_train_new)
X_cr = pd.DataFrame(X_cr, columns=X_train_new.drop(columns=race_col).columns)
# Append the untouched sensitive column so correlations can be re-measured.
X_cr[race_col] = X_train_new[race_col]

# Positions 1..24 are kept (position 0 is presumably the sensitive column itself).
correlation_matrix = X_cr.corr()
top_correlations = correlation_matrix[race_col].abs().sort_values(ascending=False).iloc[1:25]

# Label fixed: it previously read "rave" and claimed 10 columns while 24 are shown.
print("Correlation between 'race' and the top 24 correlating columns after correlation removal:")
print(top_correlations)

In [None]:
# Show the first 11 ranked correlations as a one-column table.
top_correlations.to_frame().head(11)

In [None]:
# Build the test and validation frames exactly like the training frame:
# scaled features + white-non-Hispanic-vs-other protected attribute, one-hot encoded.
X_test_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_test_scaled, columns=data_class.X_test.columns
    ).assign(Race=data_class.protected_white_none_Hispanic_vs_other)
)
X_val_new = pd.get_dummies(
    pd.DataFrame(
        data_class.X_val_scaled, columns=data_class.X_val.columns
    ).assign(Race=data_class.protected_white_none_Hispanic_vs_other)
)

# Train: X_cr ends with `race_col` (appended after fitting); the column before it
# is presumably the other Race dummy - TODO confirm. Both are dropped.
X_train_fitted_Other = np.array(X_cr.to_numpy()[:, :-2], dtype=float)
# Test/val: apply the remover fitted on train, then drop the last output column
# (presumably the remaining Race dummy - TODO confirm).
X_test_fitted_Other = np.array(cr.transform(X_test_new)[:, :-1], dtype=float)
X_val_fitted_Other = np.array(cr.transform(X_val_new)[:, :-1], dtype=float)

(
    X_train_fitted_Other.shape,
    X_test_fitted_Other.shape,
    X_val_fitted_Other.shape,
)

Reshape for LSTM

In [None]:
def _rows_to_yearly_sequences(values, columns, static_cols, years):
    """Turn a flat feature matrix into one sequence of yearly steps per row.

    Args:
        values: 2-D array-like, one row per sample, one column per entry of ``columns``.
        columns: column labels for ``values``.
        static_cols: labels of the columns repeated at every time step.
        years: iterable of years; step ``y`` additionally receives every column
            whose name contains ``f'{y}_'`` (case-insensitive), in column order.

    Returns:
        ndarray of shape (n_rows, len(years), len(static_cols) + n_year_cols).

    Raises:
        ValueError: from ``np.stack`` if the years do not all match the same
            number of columns.
    """
    frame = pd.DataFrame(values, columns=columns)
    static_block = frame[static_cols].to_numpy()
    steps = []
    for year in years:
        year_mask = frame.columns.str.contains(f'{year}_', case=False)
        # Static features first, then this year's columns - same layout for every row.
        steps.append(np.hstack([static_block, frame.loc[:, year_mask].to_numpy()]))
    # Stack the per-year (n_rows, n_features) blocks along a new time axis.
    return np.stack(steps, axis=1)


def reshape_data_for_LSTM(train,test,val,group):
    """Reshape the flat train/test/val matrices into sequences and save them.

    Each split becomes an array of shape (n_rows, 9, n_features): nine yearly
    steps per row, every step holding the static house features followed by
    that year's columns. Windows are 2010-2018 (train), 2011-2019 (test) and
    2012-2020 (val). Column labels come from the notebook-level ``data_class``.

    Args:
        train, test, val: 2-D arrays whose columns line up with
            ``data_class.X_train`` / ``X_test`` / ``X_val`` respectively.
        group: suffix used in the output file names.

    Writes ``Data_reshaped/X_{train,test,val}_reshaped_cr_{group}.npy``; the
    ``Data_reshaped`` directory must already exist. Returns None.
    """
    shared_static = ['beds_redfin', 'baths_redfin', 'sqft_redfin']
    # (split name, matrix, column labels, split-specific static columns, year window)
    split_specs = (
        ('train', train, data_class.X_train.columns,
         ['appraised_val_2020', 'house_age_train'], range(2010, 2019)),
        ('test', test, data_class.X_test.columns,
         ['appraised_val_2021', 'house_age_test'], range(2011, 2020)),
        ('val', val, data_class.X_val.columns,
         ['appraised_val_2022', 'house_age_val'], range(2012, 2021)),
    )

    # Reshape every split before writing anything, so a failure leaves no partial output.
    reshaped = {
        split_name: _rows_to_yearly_sequences(
            values, columns, shared_static + split_static, years
        )
        for split_name, values, columns, split_static, years in split_specs
    }

    np.save(f'Data_reshaped/X_train_reshaped_cr_{group}.npy', reshaped['train'])
    np.save(f'Data_reshaped/X_test_reshaped_cr_{group}.npy', reshaped['test'])
    np.save(f'Data_reshaped/X_val_reshaped_cr_{group}.npy', reshaped['val'])

In [None]:

# Export the LSTM-shaped arrays for each correlation-removed dataset.
for _group_name, _split_arrays in (
    ('Other', (X_train_fitted_Other, X_test_fitted_Other, X_val_fitted_Other)),
    ('Hispanic', (X_train_fitted_Hispanic, X_test_fitted_Hispanic, X_val_fitted_Hispanic)),
    ('none_white', (X_train_fitted_none_white, X_test_fitted_none_white, X_val_fitted_none_white)),
):
    reshape_data_for_LSTM(*_split_arrays, _group_name)

## Testing after removing correlation

XGBoost - train on each correlation-removed dataset (Other, Hispanic, and white vs. non-white)

In [None]:
# Group label -> [train, test, val] correlation-removed matrices.
# `groups` and `data` are index-aligned views of this mapping.
_cr_splits_by_group = {
    'Other': [X_train_fitted_Other, X_test_fitted_Other, X_val_fitted_Other],
    'Hispanic': [X_train_fitted_Hispanic, X_test_fitted_Hispanic, X_val_fitted_Hispanic],
    'none_white': [X_train_fitted_none_white, X_test_fitted_none_white, X_val_fitted_none_white],
}
groups = list(_cr_splits_by_group.keys())
data = list(_cr_splits_by_group.values())

In [None]:
# Hyper-parameters used for every XGBoost model below.
xgb_best_depth = 12
xgb_best_n     = 50
xgb_best_lr    = 0.01

# Fit one regressor per correlation-removed dataset, pickle it and store its
# predictions on that dataset's `val` matrix.
for group, (train, test, val) in zip(groups, data):
    xgb_model = xgb.XGBRegressor(n_estimators=xgb_best_n, max_depth=xgb_best_depth, eta=xgb_best_lr)
    xgb_model.fit(train, data_class.y_percentage_train_scaled)
    # Context manager closes the file handle; the bare open() used before never did.
    with open(f'Correlation_removed_models_and_results/xgb_model_cr_{group}', 'wb') as model_file:
        pickle.dump(xgb_model, model_file)
    y_pred = xgb_model.predict(val)
    np.savetxt(f"Correlation_removed_models_and_results/xgb_model_cr_results_{group}.csv", y_pred, delimiter=',')



Random Forest - train on each correlation-removed dataset (Other, Hispanic, and white vs. non-white)

In [None]:
# Hyper-parameters used for every random forest below.
RF_best_depth = 4
RF_best_n     = 200

# Fit one forest per correlation-removed dataset, pickle it and store its
# predictions on that dataset's `val` matrix.
for group, (train, test, val) in zip(groups, data):
    rf_model = RandomForestRegressor(max_depth=RF_best_depth, n_estimators=RF_best_n, n_jobs= mp.cpu_count())
    rf_model.fit(train, data_class.y_percentage_train_scaled)
    # Context manager closes the file handle; the bare open() used before never did.
    with open(f'Correlation_removed_models_and_results/rf_model_cr_{group}', 'wb') as model_file:
        pickle.dump(rf_model, model_file)
    y_pred = rf_model.predict(val)
    np.savetxt(f"Correlation_removed_models_and_results/rf_model_cr_results_{group}.csv", y_pred, delimiter=',')



TabNet (TBN) - train on each correlation-removed dataset (Other, Hispanic, and white vs. non-white)

In [None]:
# Hyper-parameters used for every TabNet model below (n_d and n_a share one width).
TBN_best_width = 8
TBN_best_n     = 3

# Fit one TabNet per correlation-removed dataset. The `test` matrix is the
# eval_set for early stopping; predictions are made on `val`.
for group, (train, test, val) in zip(groups, data):
    tbn_model = TabNetRegressor(n_d=TBN_best_width, n_a=TBN_best_width, n_steps=TBN_best_n, verbose=0)
    # Targets are passed as column vectors via reshape(-1, 1).
    tbn_model.fit(train, data_class.y_percentage_train_scaled.reshape(-1,1), patience=10,eval_set=[(test, data_class.y_percentage_test_scaled.reshape(-1,1))])
    # Context manager closes the file handle; the bare open() used before never did.
    with open(f'Correlation_removed_models_and_results/tbn_model_cr_{group}', 'wb') as model_file:
        pickle.dump(tbn_model, model_file)
    y_pred = tbn_model.predict(val)
    np.savetxt(f"Correlation_removed_models_and_results/tbn_model_cr_results_{group}.csv", y_pred, delimiter=',')


MLP - train on each correlation-removed dataset (Other, Hispanic, and white vs. non-white)

In [None]:
# Hyper-parameters used for every MLP below.
# `(256)` was just the int 256; the trailing comma makes the intended
# one-hidden-layer tuple explicit (same architecture either way).
mlp_best_hidden= (256,)
mlp_best_al= 0.001

# Fit one MLP per correlation-removed dataset, pickle it and store its
# predictions on that dataset's `val` matrix.
for group, (train, test, val) in zip(groups, data):
    mlp_model = MLPRegressor( hidden_layer_sizes=mlp_best_hidden,verbose=False, alpha=mlp_best_al, early_stopping=True )
    mlp_model.fit(train, data_class.y_percentage_train_scaled)
    # Context manager closes the file handle; the bare open() used before never did.
    with open(f'Correlation_removed_models_and_results/mlp_model_cr_{group}', 'wb') as model_file:
        pickle.dump(mlp_model, model_file)
    y_pred = mlp_model.predict(val)
    np.savetxt(f"Correlation_removed_models_and_results/mlp_model_cr_results_{group}.csv", y_pred, delimiter=',')

## LSTM

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class LSTM(nn.Module):
    """Many-to-one LSTM regressor.

    Runs a batch-first ``nn.LSTM`` over the input sequence and maps the output
    of the last time step through a linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to (batch, output_size)."""
        # Fresh zero hidden/cell state on every call, on the same device as the input.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        lstm_out, _ = self.lstm(x, (h0, c0))
        # Only the last time step feeds the regression head.
        output = self.fc(lstm_out[:, -1, :])
        return output

    def _as_target_tensor(self, y):
        """Convert a NumPy target array to a float tensor of shape (n, output_size).

        The model emits (batch, output_size). A 1-D target of shape (batch,)
        would broadcast against that inside ``MSELoss`` to (batch, batch) and
        compute the wrong loss, so targets are reshaped to match.
        """
        return torch.from_numpy(y).float().reshape(-1, self.fc.out_features)

    def fit(self, X_train, y_train,X_val, y_val, epochs=25, learning_rate=0.001, batch_size=32, patience=10):
        """Train with Adam on MSE loss, early-stopping on validation loss.

        Args:
            X_train, X_val: NumPy arrays shaped (n, seq_len, input_size).
            y_train, y_val: NumPy target arrays, shaped (n,) or (n, output_size).
            epochs: maximum number of passes over the training data.
            learning_rate: Adam learning rate.
            batch_size: mini-batch size for both loaders.
            patience: stop after this many consecutive epochs without a new
                best validation loss.

        Prints the last training-batch loss and the mean validation-batch loss
        after each epoch. Returns None.
        """
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        best_val_loss = float('inf')
        # Initialised up front: a first-epoch validation loss that is not below
        # inf (e.g. NaN) would otherwise hit an unbound counter.
        early_stopping_counter = 0

        dataset = TensorDataset(torch.from_numpy(X_train).float(), self._as_target_tensor(y_train))
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        val_dataset = TensorDataset(torch.from_numpy(X_val).float(), self._as_target_tensor(y_val))
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        for epoch in range(epochs):
            for inputs, labels in dataloader:
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            # Mean of the per-batch validation losses.
            val_loss = 0.0
            with torch.no_grad():
                for val_inputs, val_labels in val_dataloader:
                    val_outputs = self(val_inputs)
                    val_loss += criterion(val_outputs, val_labels).item()

            val_loss /= len(val_dataloader)

            print(f'Epoch [{epoch + 1}/{epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}')

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                early_stopping_counter = 0
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= patience:
                    print(f'Early stopping after {patience} epochs without improvement.')
                    break

    def predict(self, X_test):
        """Return predictions for a NumPy array as a NumPy array of shape (n, output_size)."""
        inputs = torch.from_numpy(X_test).float()
        with torch.no_grad():
            predictions = self(inputs)
        return predictions.cpu().numpy()


# Hyper-parameters chosen for the LSTM: hidden-state width, number of stacked
# LSTM layers, and learning rate.
lstm_best_hidden_siz =  2
lstm_best_num_layers = 1
lstm_best_lr = 0.00001

LSTM - train on each correlation-removed dataset (Other, Hispanic, and white vs. non-white)

In [None]:
# Train one LSTM per correlation-removed dataset from the reshaped .npy files.
# The `test` split drives early stopping; predictions are made on `val`.
for group in groups:
    train = np.load(f'Data_reshaped/X_train_reshaped_cr_{group}.npy')
    test = np.load(f'Data_reshaped/X_test_reshaped_cr_{group}.npy')
    val =  np.load(f'Data_reshaped/X_val_reshaped_cr_{group}.npy')
    # The last axis of the reshaped array is the per-step feature count.
    input_size = train.shape[2]

    LSTMPredictor = LSTM(input_size, hidden_size=lstm_best_hidden_siz, num_layers=lstm_best_num_layers)
    # Pass the selected learning rate explicitly; it was defined but never used,
    # so training silently ran with fit()'s default.
    LSTMPredictor.fit(
        train, data_class.y_percentage_train_scaled,
        test, data_class.y_percentage_test_scaled,
        learning_rate=lstm_best_lr,
    )
    y_pred = LSTMPredictor.predict(val)
    np.savetxt(f"Correlation_removed_models_and_results/lstm_model_cr_results_{group}.csv", y_pred, delimiter=',')


In [None]:
class GRU(nn.Module):
    """Many-to-one GRU regressor.

    Runs a batch-first ``nn.GRU`` over the input sequence and maps the output
    of the last time step through a linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to (batch, output_size)."""
        # Fresh zero hidden state on every call, on the same device as the input.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        gru_out, _ = self.gru(x, h0)
        # Only the last time step feeds the regression head.
        output = self.fc(gru_out[:, -1, :])
        return output

    def _as_target_tensor(self, y):
        """Convert a NumPy target array to a float tensor of shape (n, output_size).

        The model emits (batch, output_size). A 1-D target of shape (batch,)
        would broadcast against that inside ``MSELoss`` to (batch, batch) and
        compute the wrong loss, so targets are reshaped to match.
        """
        return torch.from_numpy(y).float().reshape(-1, self.fc.out_features)

    def fit(self, X_train, y_train,  epochs=25, learning_rate=0.01, batch_size=64):
        """Train with Adam on MSE loss for a fixed number of epochs.

        Args:
            X_train: NumPy array shaped (n, seq_len, input_size).
            y_train: NumPy target array, shaped (n,) or (n, output_size).
            epochs: number of passes over the training data.
            learning_rate: Adam learning rate.
            batch_size: mini-batch size.

        Prints the last-batch training loss every 10th epoch. Returns None.
        """
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        dataset = TensorDataset(torch.from_numpy(X_train).float(), self._as_target_tensor(y_train))
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(epochs):
            for inputs, labels in dataloader:
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.6f}')

    def predict(self, X_test):
        """Return predictions for a NumPy array as a NumPy array of shape (n, output_size)."""
        inputs = torch.from_numpy(X_test).float()
        with torch.no_grad():
            predictions = self(inputs)
        return predictions.cpu().numpy()
    
# Hyper-parameters chosen for the GRU: hidden-state width, number of stacked
# GRU layers, and learning rate.
gru_best_hidden_siz =  25
gru_best_num_layers = 2
gru_best_lr = 0.1


In [None]:
# Train one GRU per correlation-removed dataset from the reshaped .npy files
# and store its predictions on that dataset's `val` array.
# GRU.fit takes no validation data, so the reshaped test split is not loaded here.
for group in groups:
    train = np.load(f'Data_reshaped/X_train_reshaped_cr_{group}.npy')
    val =  np.load(f'Data_reshaped/X_val_reshaped_cr_{group}.npy')
    # The last axis of the reshaped array is the per-step feature count.
    input_size = train.shape[2]

    GRUPredictor = GRU(input_size, hidden_size=gru_best_hidden_siz, num_layers=gru_best_num_layers)

    GRUPredictor.fit(train,  data_class.y_percentage_train_scaled,   learning_rate=gru_best_lr)
    y_pred = GRUPredictor.predict(val)
    np.savetxt(f"Correlation_removed_models_and_results/gru_model_cr_results_{group}.csv", y_pred, delimiter=',')
