In [None]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import plotly
import plotly.express as px
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics, preprocessing
from datetime import datetime, timedelta

import wandb
import random

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Helper functions: data loading, splitting, metrics, training and evaluation

In [None]:
def label_encode(df):
  """Label-encode every non-integer column of ``df`` in place.

  Columns with an integer dtype are left untouched. Every other column
  (object/string, float, bool, ...) is replaced by the integer codes produced
  by a fresh ``sklearn.preprocessing.LabelEncoder``.

  Args:
    df: pandas DataFrame to encode. It is modified in place.

  Returns:
    The same DataFrame object that was passed in.
  """
  for column in df.columns:
    # Decide on the column dtype instead of ``type(df[column].iloc[0]) != int``:
    # for numeric columns pandas returns NumPy scalars (e.g. numpy.int64),
    # which are never exactly ``int``, so integer columns used to be
    # re-encoded as well. The dtype check also works on an empty frame, where
    # ``iloc[0]`` raises IndexError.
    if pd.api.types.is_integer_dtype(df[column]):
      continue

    encoder = preprocessing.LabelEncoder()
    df[column] = encoder.fit_transform(df[column].values)

  return df

def one_hot(df):
  """Return a copy of ``df`` with every non-integer column one-hot encoded.

  Integer-dtype columns are carried over unchanged. Every other column is
  replaced, at the same position, by one 0/1 indicator column per distinct
  value, named ``<column>_<value>``. Missing values get no indicator column,
  so such rows are all zeros for that feature.

  Args:
    df: pandas DataFrame to encode. It is not modified.

  Returns:
    A new DataFrame containing the pass-through and indicator columns.
  """
  pieces = []
  for column in df.columns:
    series = df[column]
    # dtype check rather than ``type(series.iloc[0]) != int``: numeric columns
    # yield NumPy scalars, which are never exactly ``int``.
    if pd.api.types.is_integer_dtype(series):
      pieces.append(series)
    else:
      # A one-hot encoding is a matrix with one column per category; it
      # cannot be stored back into the single original column, so expand it
      # into separate indicator columns instead.
      pieces.append(pd.get_dummies(series, prefix=column, dtype=int))

  if not pieces:
    # pd.concat rejects an empty list; a frame without columns has nothing
    # to encode.
    return df.copy()

  return pd.concat(pieces, axis=1)


def get_dataset(name):
  """Load one of the two project datasets and label-encode it.

  Args:
    name: ``'compas'`` reads the COMPAS two-year CSV from the ProPublica
      GitHub repository (malformed lines are skipped). Any other value reads
      the loan training CSV from the mounted Google Drive path.

  Returns:
    The DataFrame returned by ``label_encode`` for the file that was read.
  """
  if name != 'compas':
    # Default branch: loan data stored on the mounted Drive.
    loan_frame = pd.read_csv("/content/drive/MyDrive/Project/Data/loan-train.csv")
    return label_encode(loan_frame)

  compas_link ='https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv'
  compas_frame = pd.read_csv(compas_link, on_bad_lines='skip')
  return label_encode(compas_frame)

def split(x, y, sensitive_features, train_ratio, test_ratio):
  """Split features, targets and sensitive attributes into train/test/val.

  The ratios are fractions of the full data set (e.g. 0.7 and 0.1); whatever
  remains, ``1 - train_ratio - test_ratio``, becomes the validation share.

  Args:
    x: feature array.
    y: target array.
    sensitive_features: sensitive-attribute array, split alongside x and y.
    train_ratio: fraction of the data used for training.
    test_ratio: fraction of the data used for testing.

  Returns:
    ``x_train, x_test, x_val, y_train, y_test, y_val, a_train, a_test, a_val``

  Raises:
    ValueError: if ``train_ratio + test_ratio`` exceeds 1.
  """
  # Ratios are fractions, so the remainder is taken from 1 (not 100). With
  # 100 the "validation" share was ~99, which shrank the test split below to
  # roughly 0.1% of the held-out rows.
  validation_ratio = 1 - train_ratio - test_ratio

  if validation_ratio < 0:
    # Raise instead of returning -1: callers unpack nine values from the
    # result, so a sentinel int only produced an unrelated unpacking error.
    raise ValueError(
        "Incorrect Ratios: train_ratio + test_ratio must not exceed 1 "
        "(got train_ratio={}, test_ratio={})".format(train_ratio, test_ratio))

  # First cut: train_ratio of the rows for training, the rest held out.
  x_train, x_test, y_train, y_test, a_train, a_test = train_test_split(x, y, sensitive_features, test_size=1 - train_ratio, random_state=42)

  # Second cut: divide the held-out rows between validation (first outputs)
  # and test (second outputs) in proportion validation_ratio : test_ratio.
  x_val, x_test, y_val, y_test, a_val, a_test = train_test_split(x_test, y_test, a_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=42)

  return  x_train, x_test, x_val, y_train, y_test, y_val, a_train, a_test, a_val

def mae(prediction, true):
  """Mean absolute error between ``prediction`` and ``true``.

  Thin wrapper around ``sklearn.metrics.mean_absolute_error``; the arguments
  are forwarded in the order (prediction, true).
  """
  error = metrics.mean_absolute_error(prediction, true)
  return error

def accuracy(prediction, true):
  """Accuracy of thresholded predictions.

  Args:
    prediction: torch tensor of scores. Scores <= 0.5 are mapped to class 0,
      everything else to class 1.
    true: torch tensor of ground-truth labels.

  Returns:
    The value of ``sklearn.metrics.accuracy_score`` for the hard labels.
  """
  labels = true.detach().numpy()
  scores = prediction.detach().numpy()
  # Binarise: anything that is not <= 0.5 becomes 1.
  hard_labels = (~(scores <= 0.5)).astype(int)

  score = metrics.accuracy_score(hard_labels, labels)
  return score

def precision(prediction, true):
  """Precision of thresholded predictions against the true labels.

  Args:
    prediction: torch tensor of scores. Scores <= 0.5 are mapped to class 0,
      everything else to class 1.
    true: torch tensor of ground-truth labels.

  Returns:
    The value of ``sklearn.metrics.precision_score`` for the hard labels.
  """
  true = true.detach().numpy()
  prediction = prediction.detach().numpy()
  prediction = np.where(prediction <= 0.5, 0, 1)
  # scikit-learn's signature is (y_true, y_pred). Precision is not symmetric
  # in its arguments: passing the predictions first made this function
  # return the recall instead.
  return metrics.precision_score(true, prediction)

def recall(prediction, true):
  """Recall of thresholded predictions against the true labels.

  Args:
    prediction: torch tensor of scores. Scores <= 0.5 are mapped to class 0,
      everything else to class 1.
    true: torch tensor of ground-truth labels.

  Returns:
    The value of ``sklearn.metrics.recall_score`` for the hard labels.
  """
  true = true.detach().numpy()
  prediction = prediction.detach().numpy()
  prediction = np.where(prediction <= 0.5, 0, 1)
  # scikit-learn's signature is (y_true, y_pred). Recall is not symmetric in
  # its arguments: passing the predictions first made this function return
  # the precision instead.
  return metrics.recall_score(true, prediction)

def tpr(prediction, true):
  """True positive rate: TP / (TP + FN), i.e. TP over all actual positives.

  Args:
    prediction: array-like of hard 0/1 predictions.
    true: array-like of 0/1 ground-truth labels, same shape as ``prediction``.

  Returns:
    The rate as a float, or NaN when ``true`` contains no positives.
  """
  prediction, true = np.asarray(prediction), np.asarray(true)
  # Denominator is the number of actual positives; dividing by the total
  # sample count gives the share of true positives in the data, not a rate.
  actual_positives = (true == 1).sum()
  if actual_positives == 0:
    return float("nan")
  return np.logical_and(prediction == 1, true == 1).sum() / actual_positives

def fpr(prediction, true):
  """False positive rate: FP / (FP + TN), i.e. FP over all actual negatives.

  Args:
    prediction: array-like of hard 0/1 predictions.
    true: array-like of 0/1 ground-truth labels, same shape as ``prediction``.

  Returns:
    The rate as a float, or NaN when ``true`` contains no negatives.
  """
  prediction, true = np.asarray(prediction), np.asarray(true)
  # Denominator is the number of actual negatives; dividing by the total
  # sample count gives the share of false positives in the data, not a rate.
  actual_negatives = (true == 0).sum()
  if actual_negatives == 0:
    return float("nan")
  return np.logical_and(prediction == 1, true == 0).sum() / actual_negatives

def tnr(prediction, true):
  """True negative rate: TN / (TN + FP), i.e. TN over all actual negatives.

  Args:
    prediction: array-like of hard 0/1 predictions.
    true: array-like of 0/1 ground-truth labels, same shape as ``prediction``.

  Returns:
    The rate as a float, or NaN when ``true`` contains no negatives.
  """
  prediction, true = np.asarray(prediction), np.asarray(true)
  # Denominator is the number of actual negatives; dividing by the total
  # sample count gives the share of true negatives in the data, not a rate.
  actual_negatives = (true == 0).sum()
  if actual_negatives == 0:
    return float("nan")
  return np.logical_and(prediction == 0, true == 0).sum() / actual_negatives

def fnr(prediction, true):
  """False negative rate: FN / (FN + TP), i.e. FN over all actual positives.

  Args:
    prediction: array-like of hard 0/1 predictions.
    true: array-like of 0/1 ground-truth labels, same shape as ``prediction``.

  Returns:
    The rate as a float, or NaN when ``true`` contains no positives.
  """
  prediction, true = np.asarray(prediction), np.asarray(true)
  # Denominator is the number of actual positives; dividing by the total
  # sample count gives the share of false negatives in the data, not a rate.
  actual_positives = (true == 1).sum()
  if actual_positives == 0:
    return float("nan")
  return np.logical_and(prediction == 0, true == 1).sum() / actual_positives

def test(model, x_test, y_test, loss_fn=None):
  """Evaluate ``model`` on the test split, then log and print the metrics.

  Args:
    model: callable torch module mapping a float feature tensor to scores.
    x_test: test features (anything ``torch.as_tensor`` accepts).
    y_test: test labels (anything ``torch.as_tensor`` accepts).
    loss_fn: loss used for the reported test loss. When omitted, the
      module-level ``criterion`` is used, which is what this function has
      always read implicitly.

  Side effects:
    Logs test_loss / test_accuracy / test_precision / test_recall to the
    active wandb run and prints the same values.
  """
  if loss_fn is None:
    # NOTE: hidden dependency on a global; pass ``loss_fn`` to avoid it.
    loss_fn = criterion

  x_test, y_test = torch.as_tensor(x_test), torch.as_tensor(y_test)

  # Evaluation only: no gradients are needed for the forward pass or loss.
  with torch.no_grad():
    outputs = model(x_test.float())
    loss = loss_fn(outputs, y_test.float())

  # Compute each metric once and reuse it for both logging and printing.
  test_loss = loss.item()
  test_accuracy = accuracy(outputs, y_test)
  test_precision = precision(outputs, y_test)
  test_recall = recall(outputs, y_test)

  # log model results
  # (wandb.watch is intentionally not called here: it instruments a model for
  # gradient logging, and no backward pass happens during evaluation.)
  wandb.log({"test_loss": test_loss,
             "test_accuracy": test_accuracy,
             "test_precision": test_precision,
             "test_recall": test_recall})

  print ('Test Results, Loss: {:.4f},  Accuracy: {:.4f},  Precision: {:.4f},  Recall: {:.4f}'
                  .format(test_loss, 100 * test_accuracy, test_precision, test_recall))



def train(model, criterion, optimizer, name, lr, x_train, x_val, y_train, y_val, a_train, a_val, alpha=None, regularizers=None, num_epochs=None):
  """Full-batch training loop with per-epoch validation and wandb logging.

  Args:
    model: torch module mapping a float feature tensor to scores.
    criterion: loss callable applied as ``criterion(outputs, targets)``.
    optimizer: torch optimizer already bound to ``model``'s parameters.
    name: dataset name; used as the wandb project and stored in the config.
    lr: learning rate, recorded in the wandb config only.
    x_train, x_val: training / validation features.
    y_train, y_val: training / validation labels.
    a_train, a_val: sensitive attributes; accepted for interface
      compatibility but not used by this function.
    alpha: sequence of weights, one per entry of ``regularizers``.
    regularizers: optional sequence of penalty terms. Each entry is
      multiplied by its weight and added to the loss as-is (entries are not
      called), so they are presumably precomputed values -- TODO confirm.
    num_epochs: number of passes over the training data; required.

  Returns:
    Dict of per-epoch lists with keys ``train_loss``, ``train_accuracy``,
    ``train_precision``, ``train_recall`` and the ``val_*`` equivalents.

  Raises:
    ValueError: if ``num_epochs`` is None, or if ``regularizers`` is given
      without a matching-length ``alpha``.
  """
  if num_epochs is None:
    raise ValueError("num_epochs must be provided")
  if regularizers is not None and (alpha is None or len(alpha) != len(regularizers)):
    raise ValueError("alpha must provide exactly one weight per regularizer")

  x_train, x_val, y_train, y_val = (
      torch.as_tensor(x_train), torch.as_tensor(x_val),
      torch.as_tensor(y_train), torch.as_tensor(y_val))

  # NOTE: train_ratio and test_ratio are not parameters; they are read from
  # module-level globals and must be defined before train() is called.
  config = {
      "learning_rate": lr,
      "epochs": num_epochs,
      "model": model.__class__.__name__,
      "criterion": criterion.__class__.__name__,
      "optimizer": optimizer.__class__.__name__,
      "train_ratio": train_ratio,
      "test_ratio": test_ratio,
      "data": name,
    }

  if regularizers is not None:
    for i in range(len(regularizers)):
      config['regularizer_'+str(i)] = regularizers[i]
      config['alpha_'+str(i)] = alpha[i]

  # store the hyperparameters in weights and bias
  wandb.init(project=name, entity="mie424", config=config)
  # Instrument the model once, before training, rather than on every epoch.
  wandb.watch(model)

  history = {
      "train_loss": [], "train_accuracy": [], "train_precision": [], "train_recall": [],
      "val_loss": [], "val_accuracy": [], "val_precision": [], "val_recall": [],
  }

  # train loop
  for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass over the whole training set (full-batch update).
    outputs = model(x_train.float())

    loss = criterion(outputs, y_train.float())
    if regularizers is not None:
      for weight, penalty in zip(alpha, regularizers):
        loss = loss + weight * penalty

    # Backward and optimize
    loss.backward()
    optimizer.step()

    # Training metrics use the outputs computed before this epoch's update.
    history["train_loss"].append(loss.item())
    history["train_accuracy"].append(accuracy(outputs, y_train))
    history["train_precision"].append(precision(outputs, y_train))
    history["train_recall"].append(recall(outputs, y_train))

    # log model results
    wandb.log({"train_loss": history["train_loss"][-1],
               "train_accuracy": history["train_accuracy"][-1],
               "train_precision": history["train_precision"][-1],
               "train_recall": history["train_recall"][-1], })

    # print results
    print ('Epoch [{}/{}], Loss: {:.4f},  Accuracy: {:.4f},  Precision: {:.4f},  Recall: {:.4f}'
                  .format(epoch+1, num_epochs, history["train_loss"][-1], 100 * history["train_accuracy"][-1], history["train_precision"][-1], history["train_recall"][-1]))

    # Validation pass: no gradients needed.
    with torch.no_grad():
      outputs_v = model(x_val.float())

      loss_v = criterion(outputs_v, y_val.float())
      if regularizers is not None:
        for weight, penalty in zip(alpha, regularizers):
          loss_v = loss_v + weight * penalty

      # store results for the pass through
      history["val_loss"].append(loss_v.item())
      history["val_accuracy"].append(accuracy(outputs_v, y_val))
      history["val_precision"].append(precision(outputs_v, y_val))
      history["val_recall"].append(recall(outputs_v, y_val))

      # log and print model results
      wandb.log({"val_loss": history["val_loss"][-1],
                 "val_accuracy": history["val_accuracy"][-1],
                 "val_precision": history["val_precision"][-1],
                 "val_recall": history["val_recall"][-1], })

      print('Accuracy of validation : {} % ,  Loss: {:.4f},  Precision: {:.4f},  Recall: {:.4f}'.format(100 * history["val_accuracy"][-1], history["val_loss"][-1], history["val_precision"][-1], history["val_recall"][-1]))

  return history



wandb.login()

# Dataset selector passed to get_dataset(): 'compas' loads the COMPAS CSV,
# any other value (including the empty string used here) loads the loan CSV.
# The same string is passed to train(), which uses it as the wandb project.
name = ""

df = get_dataset(name)

# Pick the feature / target / sensitive columns for the dataset that was
# actually loaded. Previously the COMPAS lists were assigned and then always
# overwritten by the loan lists, so selecting 'compas' would still index the
# frame with the loan column names.
if name == 'compas':
  x_var = ['race', 'juv_fel_count', 'decile_score', 'juv_misd_count',
           'juv_other_count', 'priors_count', 'v_decile_score', 'v_score_text']
  y_var = ['two_year_recid']
  sensitive = ['race']
  # Other COMPAS columns noted as candidates but not used:
  #   'sex', 'dob', 'age', 'days_b_screening_arrest', 'c_days_from_compas',
  #   'c_charge_degree', 'c_charge_desc', 'r_case_number', 'r_charge_degree',
  #   'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in',
  #   'r_jail_out', 'violent_recid', 'is_violent_recid', 'vr_case_number',
  #   'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
  #   'type_of_assessment', 'v_type_of_assessment', 'in_custody', 'out_custody'
else:
  x_var = ['Gender', 'Married', 'Dependents', 'Education',
           'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
           'Credit_History', 'Property_Area']
  y_var = ['Loan_Status']
  sensitive = ['Gender']
  # Loan columns currently left out: 'Loan_Amount_Term', 'LoanAmount'

# Fractions handed to split(); train() also reads both names as globals when
# it builds the wandb config.
train_ratio, test_ratio = 0.7, 0.1

# select x, y, and sensitive features
x = df[x_var].values
y = df[y_var].values
sensitive_features = df[sensitive].values

# split
x_train, x_test, x_val, y_train, y_test, y_val, a_train, a_test, a_val = split(x, y, sensitive_features, train_ratio, test_ratio)

num_epochs = 25
alpha = None
regularizers = None
input_dim = x_train.shape[1]  # one model input per selected feature column
output_dim = 1  # Single binary output
lr = 0.001

# Seed torch so the weight initialisation is repeatable between runs (the
# data split already uses a fixed random_state).
torch.manual_seed(42)

# NOTE: LogisticRegression is defined in a cell further down this notebook;
# that cell has to be executed before this one on a fresh kernel.
model = LogisticRegression(input_dim, output_dim)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

train(model, criterion, optimizer, name, lr, x_train, x_val, y_train, y_val, a_train, a_val, alpha, regularizers, num_epochs)
test(model, x_test, y_test)
wandb.finish()


Epoch [1/25], Loss: 31.2211,  Accuracy: 29.8368,  Precision: 0.0099,  Recall: 0.6000
Accuracy of validation : 34.23913043478261 % ,  Loss: 23.0550,  Precision: 0.0420,  Recall: 0.4167
Epoch [2/25], Loss: 26.4370,  Accuracy: 30.5361,  Precision: 0.0331,  Recall: 0.6250
Accuracy of validation : 36.95652173913043 % ,  Loss: 18.6219,  Precision: 0.0840,  Recall: 0.5882
Epoch [3/25], Loss: 20.7026,  Accuracy: 34.7319,  Precision: 0.1060,  Recall: 0.7619
Accuracy of validation : 40.21739130434783 % ,  Loss: 13.3159,  Precision: 0.1513,  Recall: 0.6667
Epoch [4/25], Loss: 14.3459,  Accuracy: 41.9580,  Precision: 0.2318,  Recall: 0.8046
Accuracy of validation : 51.63043478260869 % ,  Loss: 9.1872,  Precision: 0.5126,  Recall: 0.6630
Epoch [5/25], Loss: 7.7079,  Accuracy: 51.0490,  Precision: 0.4801,  Recall: 0.7323
Accuracy of validation : 64.67391304347827 % ,  Loss: 28.4979,  Precision: 1.0000,  Recall: 0.6467
Epoch [6/25], Loss: 22.2872,  Accuracy: 70.3963,  Precision: 1.0000,  Recall: 0.70

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test_accuracy,▁
test_loss,▁
test_precision,▁
test_recall,▁
train_accuracy,▁▁▂▃▅█████▄████▄███████▁█
train_loss,█▇▅▄▂▆▆▅▄▂▂▅▅▄▂▁▆▆▅▅▄▄▁▂▆
train_precision,▁▁▂▃▄█████▄████▃███████▁█
train_recall,▆▆██▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▇▁▇
val_accuracy,▁▂▂▅█████▄████▄███████▁██
val_loss,▆▅▄▂██▇▅▃▂▇▇▅▃▁█▇▇▇▆▅▁▁██

0,1
test_accuracy,1.0
test_loss,0.0
test_precision,1.0
test_recall,1.0
train_accuracy,0.70396
train_loss,24.80214
train_precision,1.0
train_recall,0.70396
val_accuracy,0.64674
val_loss,28.50871


In [None]:
x_train.shape[1]

12

In [None]:
# https://towardsdatascience.com/logistic-regression-with-pytorch-3c8bbea594be
class LogisticRegression(torch.nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_dim: number of input features.
        output_dim: number of outputs produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return ``sigmoid(linear(x))`` for the input tensor ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [None]:
h =  torch.nn.Linear(input_dim, output_dim)

In [None]:
# A Linear layer must be called with an input tensor; calling h() with no
# argument raises TypeError. Probe it with a single all-zero row instead.
torch.sigmoid(h(torch.zeros(1, input_dim)))

TypeError: ignored

In [None]:
# Single all-zero row with 12 features, in float32 to match the `.float()`
# inputs that the training code feeds to the model.
d = torch.zeros((1, 12), dtype=torch.float32)


In [None]:
d.dtype

torch.float64

In [None]:
model = LogisticRegression(input_dim,output_dim)


In [None]:
outputs = model(d)

torch.Size([1, 12])
Linear(in_features=12, out_features=1, bias=True)
