In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


In [2]:
file_path = 'data/raw_data.csv' 
df = pd.read_csv(file_path)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head())

  df = pd.read_csv(file_path)


Shape: (2260701, 151)
Columns: ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_s

basic data clean

In [3]:
# data cleaning
print("Missing values before cleaning:")
print(df.isnull().sum().sort_values(ascending=False))


# 1 drop duplicates
df = df.drop_duplicates()

# 2 delete columns with more than 50% missing values
threshold = len(df) * 0.5
df_cleaned = df.dropna(thresh=threshold, axis=1)

print("Columns after dropping those with >50% missing values:", df_cleaned.shape)
print(" Remain Columns:", df_cleaned.columns.tolist())
lendingclub_cleaned.csv

Missing values before cleaning:
member_id                                     2260701
orig_projected_additional_accrued_interest    2252050
hardship_end_date                             2249784
hardship_start_date                           2249784
hardship_type                                 2249784
                                               ...   
policy_code                                        33
revol_bal                                          33
fico_range_high                                    33
fico_range_low                                     33
id                                                  0
Length: 151, dtype: int64
Columns after dropping those with >50% missing values: (2260701, 107)
 Remain Columns: ['id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'purpose', 'title', '

divided basic cleaned dataset to multipe dataset  
  1 application_features  
  2 postloan_features

In [4]:
# 1. application features (used for loan approval model)
application_features = [
    'loan_amnt', 'term', 'int_rate', 'installment',
    'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status','issue_d','purpose', 'zip_code','addr_state',
    'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low', 'fico_range_high',
    'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
    'initial_list_status', 'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
    'total_bal_il', 'il_util', 'max_bal_bc', 'all_util',
    'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths',
    'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt',
    'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
    'mort_acc', 'num_accts_ever_120_pd',
    'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl',
    'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
    'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',
    'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens',
    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit',
    'hardship_flag', 'disbursement_method', 'debt_settlement_flag'
]

# 2. post-loan features (used for post-loan monitoring)
postloan_features = [
    'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
    'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
    'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
    'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low'
]


# 3. useless features to drop
drop_features = [
    'id', 'member_id', 'pymnt_plan', 'url', 'policy_code',
    'title', 'desc', 'next_pymnt_d'
]
# 4 target variable 'default' based on 'loan_status'
# “Charged Off”、“Default”、“Late” = 1
def create_target(status):
    if status in ['Charged Off', 'Default', 'Late (31-120 days)', 'Does not meet the credit policy. Status:Charged Off']:
        return 1
    elif status in ['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid']:
        return 0
    else:
        return np.nan  # 其他状态视情况处理

df_cleaned['default'] = df_cleaned['loan_status'].apply(create_target)
df_cleaned = df_cleaned.dropna(subset=['default'])


X_application = df_cleaned[application_features].copy()
X_postloan = df_cleaned[postloan_features].copy()
y = df_cleaned['default'].copy()

print("application features", X_application.shape[1])
print("postloan features", X_postloan.shape[1])
print("target variable:", y.shape[0])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['default'] = df_cleaned['loan_status'].apply(create_target)


application features 74
postloan features 14
target variable: 1369566


split into train datasets, validation datasets, test datasets

In [5]:
# 20% of the data will be used for testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X_application, y, test_size=0.2, stratify=y, random_state=42)
# 80% of the training data will be used for training , 
# and 30% of the training data will be used for validation
X_train, X_val, y_train, y_val = train_test_split( X_train_val, y_train_val, test_size=0.3, stratify=y_train_val, random_state=42)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")


Train: (766956, 74), Validation: (328696, 74), Test: (273914, 74)


In [6]:

# save as CSV
X_train.to_csv('data/X_train.csv', index=False)
X_val.to_csv('data/X_val.csv', index=False)
X_test.to_csv('data/X_test.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_val.to_csv('data/y_val.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)