In [223]:
from dask_cloudprovider.aws import EC2Cluster 
from dask.distributed import Client
import configparser
import os
import contextlib
import re
import dask
from platform import python_version

def get_aws_credentials():
    """Read in your AWS credentials file and convert to environment variables."""
    parser = configparser.RawConfigParser()
    
    parser.read(os.path.expanduser('~/.aws/config'))
    config = parser.items('default')
    
    parser.read(os.path.expanduser('~/.aws/credentials'))
    credentials = parser.items('default')
    
    all_credentials = {key.upper(): value for key, value in [*config, *credentials]}
    with contextlib.suppress(KeyError):
        all_credentials["AWS_REGION"] = all_credentials.pop("REGION")
        
    return all_credentials

# Pass in AWS Credentials + any extra packages you would like to install on cluster via `pip`
env_vars = get_aws_credentials()
env_vars["EXTRA_PIP_PACKAGES"] = "s3fs"

# Select software installed on scheduler + worker instances based on client Python + Dask versions
# versions need to match across client, scheduler, worker -- slight mismatches are OK, though
py_v = '-py' + re.findall(r'\d{1}.\d+', python_version())[0]
dask_docker_tag = f"daskdev/dask:{dask.__version__ + py_v}"
print('Docker Image: ', dask_docker_tag)

# launch a cluster of 5 r5.large instances (10 vCPUs):
# 1 scheduler
# 4 workers (2 threads + 16 GB RAM each)
cluster = EC2Cluster(instance_type='r5.large',
                     n_workers=4,
                     security=False,
                     docker_image=dask_docker_tag,
                     env_vars=env_vars
)

Docker Image:  daskdev/dask:2025.5.1-py3.11
Creating scheduler instance
Created instance i-0c7dcafb216c8344c as dask-11b14be3-scheduler
Waiting for scheduler to run at 52.23.205.43:8786
Scheduler is running


  next(self.gen)


Creating worker instance
Creating worker instance
Creating worker instance
Creating worker instance
Created instance i-0a8de685fac4488f2 as dask-11b14be3-worker-024e464a
Created instance i-0e7ba932274b2b956 as dask-11b14be3-worker-dd88cf88
Created instance i-0e0735915c43a4c06 as dask-11b14be3-worker-866640ba
Created instance i-074a87942eaeaeef8 as dask-11b14be3-worker-c315b87e


Task exception was never retrieved
future: <Task finished name='Task-12953' coro=<Client._gather.<locals>.wait() done, defined at /Users/nancy/.pyenv/versions/3.11.8/lib/python3.11/site-packages/distributed/client.py:2377> exception=AllExit()>
Traceback (most recent call last):
  File "/Users/nancy/.pyenv/versions/3.11.8/lib/python3.11/site-packages/distributed/client.py", line 2386, in wait
    raise AllExit()
distributed.client.AllExit


In [224]:
client = Client(cluster)
client # note that slight mismatches between client, scheduler, and worker software are fine


+---------+----------------+-----------------+---------+
| Package | Client         | Scheduler       | Workers |
+---------+----------------+-----------------+---------+
| lz4     | None           | 4.4.4           | None    |
| python  | 3.11.8.final.0 | 3.11.12.final.0 | None    |
| toolz   | 1.0.0          | 0.12.0          | None    |
+---------+----------------+-----------------+---------+


0,1
Connection method: Cluster object,Cluster type: dask_cloudprovider.EC2Cluster
Dashboard: http://52.23.205.43:8787/status,

0,1
Dashboard: http://52.23.205.43:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.31.89.191:8786,Workers: 0
Dashboard: http://172.31.89.191:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [189]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split, IncrementalSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

df = dd.read_parquet('s3://nancyfinal/hmda_filtered_2018_2023.parquet')

In [107]:
# 计算每个取值的频率和比例
action_counts = df["action_taken"].value_counts().compute()
action_ratios = action_counts / action_counts.sum()

# 合并结果为一个 DataFrame
action_stats = action_counts.to_frame(name="count")
action_stats["proportion"] = action_ratios

# 排序并显示
print(action_stats.sort_index())

                 count  proportion
action_taken                      
-1                1982    0.000018
1             60929528    0.542287
2              2766900    0.024626
3             15486366    0.137832
4             14321841    0.127468
5              5212688    0.046394
6             12585802    0.112017
7               425657    0.003788
8               625819     0.00557


In [108]:
# 计算缺失比例
na_ratio = (df.isna().sum() / len(df)).compute().sort_values(ascending=False)

# 打印结果
print(na_ratio)

rate_spread                          0.465884
total_loan_costs                     0.450207
debt_to_income_ratio                 0.337127
interest_rate                        0.314909
property_value                       0.205317
income                               0.133752
census_tract                         0.021304
county_code                          0.018590
loan_term                            0.012748
state_code                           0.010645
applicant_ethnicity_1                0.000384
applicant_race_1                     0.000162
co_applicant_ethnicity_1             0.000122
co_applicant_race_1                  0.000053
derived_loan_product_type            0.000018
lei                                  0.000018
loan_purpose                         0.000000
occupancy_type                       0.000000
applicant_sex                        0.000000
activity_year                        0.000000
loan_type                            0.000000
action_taken                      

In [190]:
# 转为 float（防止 object 类型出错）
df["property_value"] = dd.to_numeric(df["property_value"], errors="coerce").astype("float64")
df["income"] = dd.to_numeric(df["income"], errors="coerce").astype("float64")

In [153]:


# 计算分位数（0, 10%, 25%, 50%, 75%, 90%, 100%）
quantiles = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]

print("Property Value Quantiles:")
print(df["property_value"].quantile(quantiles).compute())

print("\nIncome Quantiles:")
print(df["income"].quantile(quantiles).compute()) # income needs to drop outliers!

Property Value Quantiles:
0.00    5.000000e+03
0.10    1.650000e+05
0.25    2.450000e+05
0.50    3.850000e+05
0.75    5.950000e+05
0.90    9.350000e+05
1.00    2.147484e+09
Name: property_value, dtype: float64

Income Quantiles:
0.00      -899459.0
0.10           43.0
0.25           65.0
0.50          100.0
0.75          160.0
0.90          260.0
1.00    717564000.0
Name: income, dtype: float64


In [21]:
num_vars = [
    "income",
    "property_value",
    "total_loan_costs",
    "rate_spread",
    "loan_term",
    "loan_amount",
    'interest_rate',
    "combined_loan_to_value_ratio"
]

In [22]:
def describe_column_distribution(df, column_name):
    """
    打印指定列的 min, max 和 quantiles 分布（使用 Dask）。
    
    参数：
    df : Dask DataFrame
    column_name : str，要查看的列名
    """

    if column_name not in df.columns:
        print(f"[ERROR] '{column_name}' not in DataFrame.\n")
        return

    print(f"\n==== {column_name} ====")

    # 转换为数值型
    df[column_name] = dd.to_numeric(df[column_name], errors="coerce").astype("float64")

    # 打印分位数
    quantiles = df[column_name].quantile([0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]).compute()
    print("\nQuantiles:")
    print(quantiles.round(2))


In [191]:
describe_column_distribution(df, num_vars[2]) # total_loan_costs



==== total_loan_costs ====

Quantiles:
0.00    0.000000e+00
0.10    2.158510e+03
0.25    3.610000e+03
0.50    5.665000e+03
0.75    8.824810e+03
0.90    1.609362e+04
1.00    4.256004e+09
Name: total_loan_costs, dtype: float64


In [192]:
describe_column_distribution(df, num_vars[3]) # rate_spread


==== rate_spread ====

Quantiles:
0.00   -9999997.00
0.10         -0.20
0.25          0.14
0.50          0.52
0.75          1.07
0.90          2.80
1.00      44562.00
Name: rate_spread, dtype: float64


In [193]:
describe_column_distribution(df, num_vars[4]) # loan_term


==== loan_term ====

Quantiles:
0.00         1.0
0.10       240.0
0.25       360.0
0.50       360.0
0.75       360.0
0.90       360.0
1.00    715180.0
Name: loan_term, dtype: float64


In [194]:
describe_column_distribution(df, num_vars[5])  # loan_amount


==== loan_amount ====

Quantiles:
0.00   -1.396245e+09
0.10    8.500000e+04
0.25    1.550000e+05
0.50    2.450000e+05
0.75    3.850000e+05
0.90    5.950000e+05
1.00    5.000006e+11
Name: loan_amount, dtype: float64


In [195]:
describe_column_distribution(df, num_vars[6])  # interest_rate


==== interest_rate ====

Quantiles:
0.00         0.00
0.10         2.75
0.25         3.25
0.50         4.00
0.75         5.60
0.90         9.25
1.00    795000.00
Name: interest_rate, dtype: float64


In [196]:
# 计算每个取值的频率和比例
msa_counts = df["derived_msa_md"].value_counts().compute()
msa_ratios = msa_counts / msa_counts.sum()

# 合并结果为一个 DataFrame
msa_stats = msa_counts.to_frame(name="count")
msa_stats["proportion"] = msa_ratios

In [None]:
# 排序并显示
print(msa_stats.sort_index())

                   count  proportion
derived_msa_md                      
0                 701122     0.00624
10180              46446    0.000413
10380              11085    0.000099
10420             235805    0.002099
10500              31822    0.000283
...                  ...         ...
49620             160059    0.001425
49660             146160    0.001301
49700              68004    0.000605
49740              68994    0.000614
99999           12669924    0.112765

[419 rows x 2 columns]


In [197]:
# Step 1: Convert derived_msa_md to 5-digit string (with leading zeros)
# This ensures consistency across codes like "1010" → "01010"
df["derived_msa_md"] = df["derived_msa_md"].astype("Int64").astype(str).str.zfill(5)

# Step 2: Define invalid codes according to HMDA documentation
# "00000" = missing / invalid
# Keep "99999" = not in an MSA
invalid_codes = {"00000"}

# Step 3: Replace invalid codes with NaN, keep valid MSA/MD codes
# .isin returns a boolean mask; .where keeps only valid codes
df["msa_md_clean"] = df["derived_msa_md"].where(~df["derived_msa_md"].isin(invalid_codes), np.nan)

In [134]:
df.head()

Unnamed: 0,activity_year,action_taken,rate_spread,total_loan_costs,loan_amount,income,debt_to_income_ratio,applicant_credit_score_type,applicant_age,derived_race,...,applicant_ethnicity_1,co_applicant_race_1,co_applicant_ethnicity_1,loan_purpose,loan_term,interest_rate,property_value,occupancy_type,lien_status,msa_md_clean
0,2018,1,1.305,7822.87,135000.0,67.0,46,1,45-54,White,...,2,8,5,32,240.0,4.875,195000.0,1,1,26420
1,2018,1,0.057,6859.57,235000.0,84.0,50%-60%,9,45-54,White,...,1,5,2,32,360.0,4.375,275000.0,1,1,41180
2,2018,3,,,185000.0,45.0,>60%,3,35-44,White,...,2,5,2,32,360.0,,355000.0,1,1,14260
3,2018,3,,,285000.0,78.0,42,1,35-44,White,...,1,8,5,31,360.0,,425000.0,1,1,33124
4,2018,3,,,205000.0,36.0,>60%,1,25-34,White,...,1,8,5,32,360.0,,265000.0,1,1,36740


In [198]:
# Remove uninformative or unavailable values in app_ethnicity
df = df[~df["applicant_ethnicity_1"].isin([
    "3", 
    "4"
])]

# Remove uninformative or unavailable values in app_race
df = df[~df["applicant_race_1"].isin([
    "6", 
    "7"
])]


In [96]:
df.head()

Unnamed: 0,activity_year,action_taken,rate_spread,total_loan_costs,loan_amount,income,debt_to_income_ratio,applicant_credit_score_type,applicant_age,derived_race,...,applicant_ethnicity_1,co_applicant_race_1,co_applicant_ethnicity_1,loan_purpose,loan_term,interest_rate,property_value,occupancy_type,lien_status,msa_md_clean
0,2018,1,1.305,7822.87,135000.0,67.0,46,1,45-54,White,...,2,8,5,32,240.0,4.875,195000.0,1,1,26420
1,2018,1,0.057,6859.57,235000.0,84.0,50%-60%,9,45-54,White,...,1,5,2,32,360.0,4.375,275000.0,1,1,41180
2,2018,3,,,185000.0,45.0,>60%,3,35-44,White,...,2,5,2,32,360.0,,355000.0,1,1,14260
3,2018,3,,,285000.0,78.0,42,1,35-44,White,...,1,8,5,31,360.0,,425000.0,1,1,33124
4,2018,3,,,205000.0,36.0,>60%,1,25-34,White,...,1,8,5,32,360.0,,265000.0,1,1,36740


In [199]:
# print type of loan_purpose
print(df["loan_purpose"].dtype)

string


In [200]:
# Make sure loan_purpose is numeric for comparison
df["loan_purpose"] = dd.to_numeric(df["loan_purpose"], errors="coerce")

# Create a binary flag: refinance (includes both 31 and 32)
df["is_refinance"] = df["loan_purpose"].isin(['31', '32']).astype(int)

# Create a binary flag: cash-out refinance only (code 32)
df["is_cash_out"] = (df["loan_purpose"] == '32').astype(int)


In [201]:
# outlier drops

# Drop rows with negative income
df = df[df["income"] >= 0]


In [202]:
df.head()

Unnamed: 0,activity_year,action_taken,rate_spread,total_loan_costs,loan_amount,income,debt_to_income_ratio,applicant_credit_score_type,applicant_age,derived_race,...,co_applicant_ethnicity_1,loan_purpose,loan_term,interest_rate,property_value,occupancy_type,lien_status,msa_md_clean,is_refinance,is_cash_out
0,2018,1,1.305,7822.87,135000.0,67.0,46,1,45-54,White,...,5,32,240.0,4.875,195000.0,1,1,26420,0,0
1,2018,1,0.057,6859.57,235000.0,84.0,50%-60%,9,45-54,White,...,2,32,360.0,4.375,275000.0,1,1,41180,0,0
2,2018,3,,,185000.0,45.0,>60%,3,35-44,White,...,2,32,360.0,,355000.0,1,1,14260,0,0
3,2018,3,,,285000.0,78.0,42,1,35-44,White,...,5,31,360.0,,425000.0,1,1,33124,0,0
4,2018,3,,,205000.0,36.0,>60%,1,25-34,White,...,5,32,360.0,,265000.0,1,1,36740,0,0


In [203]:
# print type of loan_purpose
print(df["action_taken"].dtype)

string


In [204]:
# Drop rows with invalid action_taken code (-1)
df = df[df["action_taken"] != "-1"]

In [205]:
df.head()

Unnamed: 0,activity_year,action_taken,rate_spread,total_loan_costs,loan_amount,income,debt_to_income_ratio,applicant_credit_score_type,applicant_age,derived_race,...,co_applicant_ethnicity_1,loan_purpose,loan_term,interest_rate,property_value,occupancy_type,lien_status,msa_md_clean,is_refinance,is_cash_out
0,2018,1,1.305,7822.87,135000.0,67.0,46,1,45-54,White,...,5,32,240.0,4.875,195000.0,1,1,26420,0,0
1,2018,1,0.057,6859.57,235000.0,84.0,50%-60%,9,45-54,White,...,2,32,360.0,4.375,275000.0,1,1,41180,0,0
2,2018,3,,,185000.0,45.0,>60%,3,35-44,White,...,2,32,360.0,,355000.0,1,1,14260,0,0
3,2018,3,,,285000.0,78.0,42,1,35-44,White,...,5,31,360.0,,425000.0,1,1,33124,0,0
4,2018,3,,,205000.0,36.0,>60%,1,25-34,White,...,5,32,360.0,,265000.0,1,1,36740,0,0


In [206]:
# Remove not normal outliers in rate-spread
df = df[~((df["rate_spread"] <= -30) | (df["rate_spread"] >= 30))]

In [207]:
df.head()

Unnamed: 0,activity_year,action_taken,rate_spread,total_loan_costs,loan_amount,income,debt_to_income_ratio,applicant_credit_score_type,applicant_age,derived_race,...,co_applicant_ethnicity_1,loan_purpose,loan_term,interest_rate,property_value,occupancy_type,lien_status,msa_md_clean,is_refinance,is_cash_out
0,2018,1,1.305,7822.87,135000.0,67.0,46,1,45-54,White,...,5,32,240.0,4.875,195000.0,1,1,26420,0,0
1,2018,1,0.057,6859.57,235000.0,84.0,50%-60%,9,45-54,White,...,2,32,360.0,4.375,275000.0,1,1,41180,0,0
2,2018,3,,,185000.0,45.0,>60%,3,35-44,White,...,2,32,360.0,,355000.0,1,1,14260,0,0
3,2018,3,,,285000.0,78.0,42,1,35-44,White,...,5,31,360.0,,425000.0,1,1,33124,0,0
4,2018,3,,,205000.0,36.0,>60%,1,25-34,White,...,5,32,360.0,,265000.0,1,1,36740,0,0


In [208]:
df.columns

Index(['activity_year', 'action_taken', 'rate_spread', 'total_loan_costs',
       'loan_amount', 'income', 'debt_to_income_ratio',
       'applicant_credit_score_type', 'applicant_age', 'derived_race',
       'derived_ethnicity', 'tract_minority_population_percent',
       'tract_to_msa_income_percentage', 'derived_msa_md', 'state_code',
       'county_code', 'census_tract', 'lei', 'loan_type',
       'derived_loan_product_type', 'applicant_sex', 'applicant_race_1',
       'applicant_ethnicity_1', 'co_applicant_race_1',
       'co_applicant_ethnicity_1', 'loan_purpose', 'loan_term',
       'interest_rate', 'property_value', 'occupancy_type', 'lien_status',
       'msa_md_clean', 'is_refinance', 'is_cash_out'],
      dtype='object')

In [209]:
# Drop rows with invalid (negative) loan amounts
df = df[df["loan_amount"] >= 0]


In [210]:
df.head()

Unnamed: 0,activity_year,action_taken,rate_spread,total_loan_costs,loan_amount,income,debt_to_income_ratio,applicant_credit_score_type,applicant_age,derived_race,...,co_applicant_ethnicity_1,loan_purpose,loan_term,interest_rate,property_value,occupancy_type,lien_status,msa_md_clean,is_refinance,is_cash_out
0,2018,1,1.305,7822.87,135000.0,67.0,46,1,45-54,White,...,5,32,240.0,4.875,195000.0,1,1,26420,0,0
1,2018,1,0.057,6859.57,235000.0,84.0,50%-60%,9,45-54,White,...,2,32,360.0,4.375,275000.0,1,1,41180,0,0
2,2018,3,,,185000.0,45.0,>60%,3,35-44,White,...,2,32,360.0,,355000.0,1,1,14260,0,0
3,2018,3,,,285000.0,78.0,42,1,35-44,White,...,5,31,360.0,,425000.0,1,1,33124,0,0
4,2018,3,,,205000.0,36.0,>60%,1,25-34,White,...,5,32,360.0,,265000.0,1,1,36740,0,0


In [211]:
# Step 1: Convert combined_loan_to_value_ratio to numeric
df["combined_loan_to_value_ratio"] = df["loan_amount"] / df["property_value"]

# Step 3: Compute fee_shares = total_loan_costs / loan_amount
df["fee_shares"] = df["total_loan_costs"] / df["loan_amount"]

In [212]:
df = df[df["action_taken"].isin(['1', '2', '3', '7', '8'])]
df["is_rejected"] = df["action_taken"].map_partitions(lambda s: s.isin(['3', '7']).astype(int))

In [213]:
# 要转换为分类变量的字段列表
cat_vars = [
    "applicant_age",
]

# 批量转换为 category 类型
for col in cat_vars:
    if col in df.columns:
        df[col] = df[col].astype("category")

In [214]:
# hoepa_status

In [215]:
# Keep only the selected variables
selected_columns = [
    "activity_year", "rate_spread", "applicant_age", "is_rejected", "total_loan_costs", "loan_amount",
    "combined_loan_to_value_ratio", "fee_shares", "income", "is_refinance", "is_cash_out", 'msa_md_clean',
    "loan_purpose", "state_code", "county_code", "census_tract", 
    "loan_term", "property_value", "interest_rate", 'tract_to_msa_income_percentage', 'applicant_sex'
]
df = df[selected_columns]

In [216]:
print(df.dtypes)


activity_year                     string[pyarrow]
rate_spread                               float64
applicant_age                            category
is_rejected                                 int64
total_loan_costs                          float64
loan_amount                               float64
combined_loan_to_value_ratio              float64
fee_shares                                float64
income                                    float64
is_refinance                                int64
is_cash_out                                 int64
msa_md_clean                               object
loan_purpose                                Int64
state_code                        string[pyarrow]
county_code                       string[pyarrow]
census_tract                      string[pyarrow]
loan_term                                 float64
property_value                            float64
interest_rate                             float64
tract_to_msa_income_percentage    string[pyarrow]


In [217]:
df.head()

Unnamed: 0,activity_year,rate_spread,applicant_age,is_rejected,total_loan_costs,loan_amount,combined_loan_to_value_ratio,fee_shares,income,is_refinance,...,msa_md_clean,loan_purpose,state_code,county_code,census_tract,loan_term,property_value,interest_rate,tract_to_msa_income_percentage,applicant_sex
0,2018,1.305,45-54,0,7822.87,135000.0,0.692308,0.057947,67.0,0,...,26420,32,TX,48201,48201342300,240.0,195000.0,4.875,79.0,2
1,2018,0.057,45-54,0,6859.57,235000.0,0.854545,0.02919,84.0,0,...,41180,32,MO,29183,29183311735,360.0,275000.0,4.375,129.0,2
2,2018,,35-44,1,,185000.0,0.521127,,45.0,0,...,14260,32,ID,16027,16027021100,360.0,355000.0,,91.0,2
3,2018,,35-44,1,,285000.0,0.670588,,78.0,0,...,33124,31,FL,12086,12086019800,360.0,425000.0,,149.0,2
4,2018,,25-34,1,,205000.0,0.773585,,36.0,0,...,36740,32,FL,12095,12095014400,360.0,265000.0,,114.0,2


In [220]:
print(len(df)) 

61596716


In [221]:
df.to_parquet('s3://nancyfinal/hmda_filtered_cleaned_2018_2023/', write_index=False)

In [2]:
cluster.close()

NameError: name 'cluster' is not defined

In [1]:
import boto3
import os

def download_s3_folder(bucket_name, s3_prefix, local_dir):
    """
    下载指定 S3 文件夹到本地目录
    """
    s3 = boto3.client('s3')

    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix)

    for page in pages:
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/'):
                continue  # 跳过空目录
            local_path = os.path.join(local_dir, os.path.relpath(key, s3_prefix))
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            print(f"Downloading {key} to {local_path}")
            s3.download_file(bucket_name, key, local_path)

# ====== 参数配置 ======
bucket = 'nancyfinal'
prefix = 'hmda_filtered_cleaned_2018_2023/'
destination = './hmda_local_copy'

download_s3_folder(bucket, prefix, destination)
print("✅ 所有文件下载完成！")


Downloading hmda_filtered_cleaned_2018_2023/part.0.parquet to ./hmda_local_copy/part.0.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.1.parquet to ./hmda_local_copy/part.1.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.10.parquet to ./hmda_local_copy/part.10.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.100.parquet to ./hmda_local_copy/part.100.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.101.parquet to ./hmda_local_copy/part.101.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.102.parquet to ./hmda_local_copy/part.102.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.103.parquet to ./hmda_local_copy/part.103.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.104.parquet to ./hmda_local_copy/part.104.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.105.parquet to ./hmda_local_copy/part.105.parquet
Downloading hmda_filtered_cleaned_2018_2023/part.106.parquet to ./hmda_local_copy/part.106.parquet
Downloading hmda_fil