In [1]:
from dask_jobqueue import SLURMCluster

# Describe the SLURM batch job that will host the Dask workers.
# cores=10 split across processes=10 yields ten single-threaded workers that
# share the job's 40GB memory request.
# NOTE(review): newer dask-jobqueue releases rename `job_extra` to
# `job_extra_directives` -- confirm the installed version before changing it.
cluster = SLURMCluster(
    queue='caslake',
    cores=10,
    memory='40GB',
    processes=10,
    walltime='01:00:00',
    interface='ib0',
    job_extra=['--account=macs30123'],
)

# Submit exactly one such job to the scheduler
cluster.scale(jobs=1)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41265 instead


In [2]:
from dask.distributed import Client

# Attach a distributed Client to the `cluster` object defined in the first cell.
client = Client(cluster)
# Bare expression: the notebook renders the client summary (dashboard link,
# worker / thread / memory totals) as this cell's output.
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.25.0.65:41265/status,

0,1
Dashboard: http://172.25.0.65:41265/status,Workers: 10
Total threads: 10,Total memory: 37.30 GiB

0,1
Comm: tcp://172.25.0.65:33829,Workers: 10
Dashboard: http://172.25.0.65:41265/status,Total threads: 10
Started: Just now,Total memory: 37.30 GiB

0,1
Comm: tcp://172.25.2.82:38407,Total threads: 1
Dashboard: http://172.25.2.82:40715/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:36009,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-nttbsx1a,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-nttbsx1a

0,1
Comm: tcp://172.25.2.82:32867,Total threads: 1
Dashboard: http://172.25.2.82:33561/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:46447,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-x9xzgno5,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-x9xzgno5

0,1
Comm: tcp://172.25.2.82:42389,Total threads: 1
Dashboard: http://172.25.2.82:44633/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:44183,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-0n8ackem,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-0n8ackem

0,1
Comm: tcp://172.25.2.82:39021,Total threads: 1
Dashboard: http://172.25.2.82:46129/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:36959,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-f9yjwxcf,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-f9yjwxcf

0,1
Comm: tcp://172.25.2.82:33079,Total threads: 1
Dashboard: http://172.25.2.82:46339/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:46171,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-2hquqz3z,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-2hquqz3z

0,1
Comm: tcp://172.25.2.82:44223,Total threads: 1
Dashboard: http://172.25.2.82:35027/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:42545,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-5uw_vgq3,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-5uw_vgq3

0,1
Comm: tcp://172.25.2.82:35995,Total threads: 1
Dashboard: http://172.25.2.82:36751/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:40867,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-m4xc70in,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-m4xc70in

0,1
Comm: tcp://172.25.2.82:44915,Total threads: 1
Dashboard: http://172.25.2.82:44715/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:36485,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-41g1hj4a,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-41g1hj4a

0,1
Comm: tcp://172.25.2.82:45377,Total threads: 1
Dashboard: http://172.25.2.82:43007/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:34377,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-7riux5nz,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-7riux5nz

0,1
Comm: tcp://172.25.2.82:43017,Total threads: 1
Dashboard: http://172.25.2.82:37587/status,Memory: 3.73 GiB
Nanny: tcp://172.25.2.82:39095,
Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-u0uk8nv2,Local directory: /scratch/local/jobs/31050379/dask-worker-space/worker-u0uk8nv2


In [3]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split, IncrementalSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

# ---------- Step 1: Load Data ----------
# Open the parquet dataset as a Dask DataFrame (the file name suggests HMDA
# records filtered to 2018-2023 -- TODO confirm provenance).
df = dd.read_parquet("data/hmda_filtered_2018_2023.parquet")

# ---------- Step 2: Prepare features ----------
# Columns selected as features (presumably the model inputs; their use is not
# shown in this cell).
features = [
    'loan_amount', 'income', 'debt_to_income_ratio',
    'applicant_age',
    'derived_race', 'derived_ethnicity',
    'loan_type',
]



In [4]:
# Preview the first rows to sanity-check the loaded columns and value formats.
# Note from the preview: `debt_to_income_ratio` mixes plain numbers ("46") with
# range labels ("50%-60%", ">60%"), and `applicant_age` holds age bands
# ("45-54"), so both need encoding before being used as numeric features.
df.head()

Unnamed: 0,activity_year,action_taken,rate_spread,total_loan_costs,loan_amount,income,debt_to_income_ratio,applicant_credit_score_type,applicant_age,derived_race,...,applicant_race_1,applicant_ethnicity_1,co_applicant_race_1,co_applicant_ethnicity_1,loan_purpose,loan_term,interest_rate,property_value,occupancy_type,lien_status
0,2018,1,1.305,7822.87,135000,67,46,1,45-54,White,...,5,2,8,5,32,240,4.875,195000,1,1
1,2018,1,0.057,6859.57,235000,84,50%-60%,9,45-54,White,...,5,1,5,2,32,360,4.375,275000,1,1
2,2018,3,,,185000,45,>60%,3,35-44,White,...,5,2,5,2,32,360,,355000,1,1
3,2018,3,,,285000,78,42,1,35-44,White,...,5,1,8,5,31,360,,425000,1,1
4,2018,3,,,205000,36,>60%,1,25-34,White,...,5,1,8,5,32,360,,265000,1,1


In [5]:
# List every column name in the loaded frame (useful for checking that each
# entry in `features` actually exists).
df.columns

Index(['activity_year', 'action_taken', 'rate_spread', 'total_loan_costs',
       'loan_amount', 'income', 'debt_to_income_ratio',
       'applicant_credit_score_type', 'applicant_age', 'derived_race',
       'derived_ethnicity', 'tract_minority_population_percent',
       'tract_to_msa_income_percentage', 'derived_msa_md', 'state_code',
       'county_code', 'census_tract', 'lei', 'loan_type',
       'derived_loan_product_type', 'applicant_sex', 'applicant_race_1',
       'applicant_ethnicity_1', 'co_applicant_race_1',
       'co_applicant_ethnicity_1', 'loan_purpose', 'loan_term',
       'interest_rate', 'property_value', 'occupancy_type', 'lien_status'],
      dtype='object')