In [1]:
from dask_jobqueue import SLURMCluster

# Compose SLURM script
cluster = SLURMCluster(queue='caslake', cores=10, memory='40GB', 
                       processes=10, walltime='01:00:00', interface='ib0',
                       job_extra=['--account=macs30123']
                      )

# Request resources
cluster.scale(jobs=1)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 42857 instead


In [2]:
from dask.distributed import Client

client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.25.0.66:42857/status,

0,1
Dashboard: http://172.25.0.66:42857/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.25.0.66:34861,Workers: 0
Dashboard: http://172.25.0.66:42857/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [12]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split, IncrementalSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

# ---------- Step 1: Load Data ----------
df = dd.read_parquet("data/hmda_filtered_2018_2023.parquet")

In [7]:
# 计算每个取值的频率和比例
action_counts = df["action_taken"].value_counts().compute()
action_ratios = action_counts / action_counts.sum()

# 合并结果为一个 DataFrame
action_stats = action_counts.to_frame(name="count")
action_stats["proportion"] = action_ratios

# 排序并显示
print(action_stats.sort_index())

       count  proportion
-1      1982    0.000018
1   60929528    0.542287
2    2766900    0.024626
3   15486366    0.137832
4   14321841    0.127468
5    5212688    0.046394
6   12585802    0.112017
7     425657    0.003788
8     625819    0.005570


In [13]:
# 计算缺失比例
na_ratio = (df.isna().sum() / len(df)).compute().sort_values(ascending=False)

# 打印结果
print(na_ratio)

rate_spread                          0.465884
total_loan_costs                     0.450207
debt_to_income_ratio                 0.337127
interest_rate                        0.314909
property_value                       0.205317
income                               0.133752
census_tract                         0.021304
county_code                          0.018590
loan_term                            0.012748
state_code                           0.010645
applicant_ethnicity_1                0.000384
applicant_race_1                     0.000162
co_applicant_ethnicity_1             0.000122
co_applicant_race_1                  0.000053
derived_loan_product_type            0.000018
lei                                  0.000018
loan_purpose                         0.000000
occupancy_type                       0.000000
applicant_sex                        0.000000
activity_year                        0.000000
loan_type                            0.000000
action_taken                      

In [15]:
# 转为 float（防止 object 类型出错）
df["property_value"] = dd.to_numeric(df["property_value"], errors="coerce")
df["income"] = dd.to_numeric(df["income"], errors="coerce")

# 计算分位数（0, 10%, 25%, 50%, 75%, 90%, 100%）
quantiles = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]

print("Property Value Quantiles:")
print(df["property_value"].quantile(quantiles).compute())

print("\nIncome Quantiles:")
print(df["income"].quantile(quantiles).compute()) # income needs to drop outliers!

Property Value Quantiles:
0.00    5.000000e+03
0.10    1.750000e+05
0.25    2.550000e+05
0.50    4.050000e+05
0.75    6.450000e+05
0.90    3.305000e+06
1.00    2.147484e+09
Name: property_value, dtype: float64

Income Quantiles:
0.00      -899459.0
0.10           46.0
0.25           70.0
0.50          112.0
0.75          180.0
0.90         4500.0
1.00    717564000.0
Name: income, dtype: float64


In [18]:
def describe_column_distribution(df, column_name):
    """
    打印指定列的 min, max 和 quantiles 分布（使用 Dask）。
    
    参数：
    df : Dask DataFrame
    column_name : str，要查看的列名
    """

    if column_name not in df.columns:
        print(f"[ERROR] '{column_name}' not in DataFrame.\n")
        return

    print(f"\n==== {column_name} ====")

    # 转换为数值型
    df[column_name] = dd.to_numeric(df[column_name], errors="coerce")

    # 打印分位数
    quantiles = df[column_name].quantile([0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]).compute()
    print("\nQuantiles:")
    print(quantiles.round(2))

columns_to_check = [
    "income",
    "property_value",
    "total_loan_costs",
    "rate_spread",
    "loan_term",
    "loan_amount",
    "applicant_age"
]


In [None]:
describe_column_distribution(df, columns_to_check[0])  # 查看 income 分布



==== income ====


In [14]:
# 打印每个变量的数据类型
print(df.dtypes)

activity_year                        object
action_taken                         object
rate_spread                          object
total_loan_costs                     object
loan_amount                          object
income                               object
debt_to_income_ratio                 object
applicant_credit_score_type          object
applicant_age                        object
derived_race                         object
derived_ethnicity                    object
tract_minority_population_percent    object
tract_to_msa_income_percentage       object
derived_msa_md                       object
state_code                           object
county_code                          object
census_tract                         object
lei                                  object
loan_type                            object
derived_loan_product_type            object
applicant_sex                        object
applicant_race_1                     object
applicant_ethnicity_1           

In [None]:
num_vars = [
    "loan_amount", "income", "rate_spread",
    "total_loan_costs", "property_value"
]

for col in num_vars:
    df[col] = dd.to_numeric(df[col], errors="coerce")

In [None]:
# 要转换为分类变量的字段列表
cat_vars = [
    "applicant_credit_score_type",
    "applicant_age",
    "derived_race",
    "derived_ethnicity",
    "applicant_sex",
    "loan_type",
    "loan_purpose",
    "occupancy_type",
    "derived_loan_product_type"
]

# 批量转换为 category 类型
for col in cat_vars:
    if col in df.columns:
        df[col] = df[col].astype("category")

In [9]:
# 只保留需要的列，并去除缺失值
df = df[["action_taken", "derived_race"]].dropna()

# 1. 分组计数（修复 name= 报错）
grouped = df.groupby(["action_taken", "derived_race"]).size().reset_index()
grouped = grouped.rename(columns={0: "count"})

# 2. 按 action_taken 分组求总数，用于归一化比例
total_by_action = grouped.groupby("action_taken")["count"].transform(lambda x: x.sum())
grouped["proportion"] = grouped["count"] / total_by_action

# 3. 计算并打印
result = grouped.compute().sort_values(["action_taken", "proportion"], ascending=[True, False])
print(result)

  Before: .transform(func)
  After:  .transform(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .transform(func, meta=('x', 'f8'))            for series result
  total_by_action = grouped.groupby("action_taken")["count"].transform(lambda x: x.sum())


   action_taken                               derived_race     count  \
70           -1                                      Joint      1982   
8             1                                      White  42180456   
7             1                         Race Not Available   9795446   
2             1                                      Asian   3642535   
3             1                  Black or African American   3548809   
..          ...                                        ...       ...   
57            8                                      Joint     14524   
54            8           American Indian or Alaska Native      3551   
68            8                   2 or more minority races      1191   
58            8  Native Hawaiian or Other Pacific Islander      1010   
69            8                        Free Form Text Only        71   

    proportion  
70    1.000000  
8     0.692283  
7     0.160767  
2     0.059783  
3     0.058244  
..         ...  
57    0.023208  

In [10]:
result.to_csv("action_by_race.csv")

In [11]:

# 1. 分组计数
grouped = df.groupby(["derived_race", "action_taken"]).size().reset_index()
grouped = grouped.rename(columns={0: "count"})

# 2. 计算组内总数用于归一化
total_by_race = grouped.groupby("derived_race")["count"].transform(lambda x: x.sum())
grouped["proportion"] = grouped["count"] / total_by_race

# 3. 计算并排序打印
result = grouped.compute().sort_values(["derived_race", "proportion"], ascending=[True, False])
result.to_csv("race_by_action.csv")

  Before: .transform(func)
  After:  .transform(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .transform(func, meta=('x', 'f8'))            for series result
  total_by_race = grouped.groupby("derived_race")["count"].transform(lambda x: x.sum())
