In [2]:
import re
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import datetime
import warnings

import torch
from torch.optim import Adagrad
from sklearn.metrics import log_loss, roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from utils import find_outliers_by_3segama


# sys.path.append("/Users/wzq/Desktop/game")
# from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
# from deepctr_torch.models import *
# from deepctr_torch.callbacks import EarlyStopping, ModelCheckpoint

# jupyter配置
from IPython.display import display
pd.options.display.max_rows=10000 #Notebook 的一个cell的显示行数
pd.options.display.max_columns=100000#Notebook 的一个cell的显示列数
pd.set_option('display.max_colwidth', None)

In [12]:
data_train = pd.read_csv("./data/train.csv")
data_test = pd.read_csv("./data/testA.csv")
# data = pd.concat([data_train,data_test], axis=0)
print("train_size:", data_train.shape,
      "test_size:", data_test.shape,)

number_cols = ["loanAmnt", "term", "interestRate", "installment", "annualIncome",
               "dti", "delinquency_2years", "ficoRangeLow", "ficoRangeHigh", "openAcc", "pubRec", 
               "pubRecBankruptcies", "revolBal", "revolUtil", "totalAcc", "employmentLength"] + ["n{}".format(i) for i in range(15)]
time_cols = ["issueDate", "earliesCreditLine"]
sparse_features = ['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', "postCode", 'regionCode','employmentTitle', "title"]

# 去掉 'years' 或 'year' 字样，处理 '10+ years'
def clean_experience(value):
    if pd.isna(value):  # 处理 NaN
        return np.nan
    value = value.replace('years', '').replace('year', '').strip()
    if '+' in value:  # 处理 10+ years
        return int(value.replace('+', ''))
    if '<' in value:
        return 0
    return int(value)

for data in [data_train, data_test]:
    data['employmentLength'] = data['employmentLength'].apply(clean_experience)
    # 处理空值
    # data[number_cols] = data[number_cols].fillna(0)
    # data[category_cols] = data[category_cols].fillna(-1)
    #按照中位数填充数值型特征
    data[number_cols] = data[number_cols].fillna(data[number_cols].median())
    #按照众数填充类别型特征
    data[sparse_features] = data[sparse_features].fillna(data[sparse_features].mode().iloc[0])

    # 时间处理
    data['issueDate'] = pd.to_datetime(data['issueDate'])
    # 拆分特征
    data['issueDate_year'] = data['issueDate'].dt.year
    data['issueDate_month'] = data['issueDate'].dt.month
    data['issueDate_weekday'] = data['issueDate'].dt.weekday  # 周一为0，周日为6
    data['issueDate_is_weekend'] = (data['issueDate_weekday'] >= 5).astype(int)

    data["earliesCreditLine"] = pd.to_datetime(data["earliesCreditLine"], format="%b-%Y")
    data["earliesCreditLine_year"] = data["earliesCreditLine"].dt.year
    data["earliesCreditLine_month"] = data["earliesCreditLine"].dt.month
    data["earliesCreditLine_diff_days"] = (pd.to_datetime('20250101', format='%Y%m%d') - data["earliesCreditLine"]).dt.days
    data["earliesCreditLine_diff_month"] = (pd.to_datetime('20250101', format='%Y%m%d') - data["earliesCreditLine"]).dt.days / 30
    data["earliesCreditLine_diff_year"] = (pd.to_datetime('20250101', format='%Y%m%d') - data["earliesCreditLine"]).dt.days / 365
    
    display(data.head(5))
    display("空值和时间类型数据处理done:", data.shape)


    # 编码
    # 对稀疏的类别特征进行简单的处理，处理为连续整数类别
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # # 类型数在2之上，又不是高维稀疏的,且纯分类特征
    # data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True, dtype=np.uint8)
    # # 高维类别特征需要进行转换
    # for f in ['employmentTitle', 'postCode', 'title']:
    #     data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    #     data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    #     del data[f]
    display("对类别数据进行编码 done", data.head(5))

    # 特征交叉
    # df['grade'] / df.groupby([item])['grade'].transform('mean'): 通过将 grade 列的每个值除以其所在组的均值，得到新的标准化特征。这相当于对每个借款人，计算其 grade 与同组其他借款人 grade 的相对差异。具体来说：
    for feat in ["n{}".format(i) for i in range(15)] + ["issueDate_year", "issueDate_month", "issueDate_weekday", "issueDate_is_weekend","earliesCreditLine_year", "earliesCreditLine_month"]:
        data['grade_to_mean_' + feat] = data['grade'] / data.groupby([feat])['grade'].transform('mean')
        data['grade_to_std_' + feat] = data['grade'] / data.groupby([feat])['grade'].transform('std')
    print("特征交叉 done")

# # 类别字段
# for col in sparse_features: 
#     temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
#     temp_dict.index = temp_dict[col].values
#     temp_dict = temp_dict[col + '_target_mean'].to_dict()

#     data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
#     data_test[col + '_target_mean'] = data_test[col].map(temp_dict)

data = pd.concat([data_train,data_test], ignore_index=True)
print("train_size:", data_train.shape,
      "test_size:", data_test.shape,
      "data_process_size:", data.shape)


# 删除无用数据
data = data.drop(["id", "earliesCreditLine", "policyCode", "issueDate"], axis=1)
display("删除无用特征 done: ", data.shape)

# 删除异常数据（基于3个标准差）【删除】
# features = [f for f in data.columns if f not in ["isDefault"]]
# train_data = data.iloc[:800000, :]
# test_data = data.iloc[800000:, :]
# for fea in features:
#     train_data = find_outliers_by_3segama(train_data,fea)
#     # train_data = train_data[train_data[fea+'_outliers']=='正常值']
#     # train_data = train_data.reset_index(drop=True)
#     # train_data = train_data.drop(columns=[fea+'_outliers'])
# # train_data = train_data.reset_index(drop=True)
# data_process = pd.concat([train_data,test_data], ignore_index=True)

# print("process done \n",
#       "train_size:", train_data.shape,
#       "test_size:", test_data.shape,
#       "data_process_size:", data_process.shape)



train_size: (800000, 47) test_size: (200000, 46)


Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDate_year,issueDate_month,issueDate_weekday,issueDate_is_weekend,earliesCreditLine_year,earliesCreditLine_month,earliesCreditLine_diff_days,earliesCreditLine_diff_month,earliesCreditLine_diff_year
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,2014,7,1,0,2001,8,8554,285.133333,23.435616
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,13.0,5.0,13.0,0.0,0.0,0.0,2.0,2012,8,2,0,2002,5,8281,276.033333,22.687671
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,2015,10,3,0,2006,5,6820,227.333333,18.684932
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,2015,8,5,1,1999,5,9377,312.566667,25.690411
4,4,3000.0,3,12.99,101.07,C,C2,54.0,6.0,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,2016,3,1,0,1977,8,17320,577.333333,47.452055


'空值和时间类型数据处理done:'

(800000, 56)

'对类别数据进行编码 done'

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDate_year,issueDate_month,issueDate_weekday,issueDate_is_weekend,earliesCreditLine_year,earliesCreditLine_month,earliesCreditLine_diff_days,earliesCreditLine_diff_month,earliesCreditLine_diff_year
0,0,35000.0,5,19.52,917.97,4,21,292,2.0,2,110000.0,2,2014-07-01,1,1,137,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,2014,7,1,0,2001,8,8554,285.133333,23.435616
1,1,18000.0,5,18.49,461.9,3,16,149667,5.0,0,46000.0,2,2012-08-01,0,0,156,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1394,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,13.0,5.0,13.0,0.0,0.0,0.0,2.0,2012,8,2,0,2002,5,8281,276.033333,22.687671
2,2,12000.0,5,16.99,298.17,3,17,25137,8.0,0,74000.0,2,2015-10-01,0,0,337,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,2015,10,3,0,2006,5,6820,227.333333,18.684932
3,3,11000.0,3,7.26,340.96,0,3,36207,10.0,1,118000.0,1,2015-08-01,0,4,148,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,2015,8,5,1,1999,5,9377,312.566667,25.690411
4,4,3000.0,3,12.99,101.07,2,11,48,6.0,1,29000.0,2,2016-03-01,0,10,301,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,2016,3,1,0,1977,8,17320,577.333333,47.452055


特征交叉 done


Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDate_year,issueDate_month,issueDate_weekday,issueDate_is_weekend,earliesCreditLine_year,earliesCreditLine_month,earliesCreditLine_diff_days,earliesCreditLine_diff_month,earliesCreditLine_diff_year
0,800000,14000.0,3,10.99,458.28,B,B3,7027.0,10.0,0,80000.0,0,2014-07-01,0,163.0,21,10.56,1.0,715.0,719.0,17.0,0.0,0.0,9846.0,30.7,29.0,0,0,1974-11-01,0.0,1.0,1.0,4.0,6.0,6.0,6.0,8.0,4.0,15.0,19.0,6.0,17.0,0.0,0.0,1.0,3.0,2014,7,1,0,1974,11,18324,610.8,50.20274
1,800001,20000.0,5,14.65,472.14,C,C5,60426.0,10.0,0,50000.0,0,2015-07-01,2,235.0,8,21.4,2.0,670.0,674.0,5.0,0.0,0.0,8946.0,56.6,14.0,0,0,2001-07-01,5.0,1.0,2.0,1.0,3.0,3.0,1.0,1.0,3.0,3.0,9.0,3.0,5.0,0.0,0.0,2.0,2.0,2015,7,2,0,2001,7,8585,286.166667,23.520548
2,800002,12000.0,3,19.99,445.91,D,D4,23547.0,2.0,1,60000.0,2,2016-10-01,0,526.0,20,33.5,0.0,710.0,714.0,12.0,0.0,0.0,970.0,17.6,43.0,1,0,2006-08-01,0.0,1.0,0.0,1.0,4.0,4.0,1.0,1.0,36.0,5.0,6.0,4.0,12.0,0.0,0.0,0.0,7.0,2016,10,5,1,2006,8,6728,224.266667,18.432877
3,800003,17500.0,5,14.31,410.02,C,C4,636.0,4.0,0,37000.0,1,2014-11-01,4,248.0,11,13.95,0.0,685.0,689.0,10.0,1.0,1.0,10249.0,52.3,18.0,0,0,2002-07-01,4.0,1.0,0.0,2.0,2.0,2.0,4.0,7.0,2.0,8.0,14.0,2.0,10.0,0.0,0.0,0.0,3.0,2014,11,5,1,2002,7,8220,274.0,22.520548
4,800004,35000.0,3,17.09,1249.42,D,D1,368446.0,0.0,1,80000.0,1,2017-10-01,0,115.0,8,24.97,0.0,685.0,689.0,19.0,0.0,0.0,33199.0,35.6,22.0,0,0,2000-12-01,0.0,1.0,0.0,8.0,11.0,11.0,9.0,11.0,3.0,16.0,18.0,11.0,19.0,0.0,0.0,0.0,1.0,2017,10,6,1,2000,12,8797,293.233333,24.10137


'空值和时间类型数据处理done:'

(200000, 55)

'对类别数据进行编码 done'

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDate_year,issueDate_month,issueDate_weekday,issueDate_is_weekend,earliesCreditLine_year,earliesCreditLine_month,earliesCreditLine_diff_days,earliesCreditLine_diff_month,earliesCreditLine_diff_year
0,800000,14000.0,3,10.99,458.28,1,7,3902,10.0,0,80000.0,0,2014-07-01,0,163,21,10.56,1.0,715.0,719.0,17.0,0.0,0.0,9846.0,30.7,29.0,0,0,1974-11-01,0,1.0,1.0,4.0,6.0,6.0,6.0,8.0,4.0,15.0,19.0,6.0,17.0,0.0,0.0,1.0,3.0,2014,7,1,0,1974,11,18324,610.8,50.20274
1,800001,20000.0,5,14.65,472.14,2,14,21289,10.0,0,50000.0,0,2015-07-01,2,235,8,21.4,2.0,670.0,674.0,5.0,0.0,0.0,8946.0,56.6,14.0,0,0,2001-07-01,5,1.0,2.0,1.0,3.0,3.0,1.0,1.0,3.0,3.0,9.0,3.0,5.0,0.0,0.0,2.0,2.0,2015,7,2,0,2001,7,8585,286.166667,23.520548
2,800002,12000.0,3,19.99,445.91,3,18,10659,2.0,1,60000.0,2,2016-10-01,0,526,20,33.5,0.0,710.0,714.0,12.0,0.0,0.0,970.0,17.6,43.0,1,0,2006-08-01,0,1.0,0.0,1.0,4.0,4.0,1.0,1.0,36.0,5.0,6.0,4.0,12.0,0.0,0.0,0.0,7.0,2016,10,5,1,2006,8,6728,224.266667,18.432877
3,800003,17500.0,5,14.31,410.02,2,13,434,4.0,0,37000.0,1,2014-11-01,4,248,11,13.95,0.0,685.0,689.0,10.0,1.0,1.0,10249.0,52.3,18.0,0,0,2002-07-01,4,1.0,0.0,2.0,2.0,2.0,4.0,7.0,2.0,8.0,14.0,2.0,10.0,0.0,0.0,0.0,3.0,2014,11,5,1,2002,7,8220,274.0,22.520548
4,800004,35000.0,3,17.09,1249.42,3,15,77752,0.0,1,80000.0,1,2017-10-01,0,115,8,24.97,0.0,685.0,689.0,19.0,0.0,0.0,33199.0,35.6,22.0,0,0,2000-12-01,0,1.0,0.0,8.0,11.0,11.0,9.0,11.0,3.0,16.0,18.0,11.0,19.0,0.0,0.0,0.0,1.0,2017,10,6,1,2000,12,8797,293.233333,24.10137


特征交叉 done
train_size: (800000, 98) test_size: (200000, 97) data_process_size: (1000000, 98)


'删除无用特征 done: '

(1000000, 94)

In [9]:
# data["earliesCreditLine_diff_days"] = (pd.to_datetime('20250101', format='%Y%m%d') - data["earliesCreditLine"]).dt.days
# data["earliesCreditLine_diff_month"] = (data["earliesCreditLine"] - pd.to_datetime('today')).dt.month
# data["earliesCreditLine_diff_year"] = (data["earliesCreditLine"] - pd.to_datetime('today')).dt.year
(pd.to_datetime('20250101', format='%Y%m%d') - data["earliesCreditLine"]).dt.days

0          8554
1          8281
2          6820
3          9377
4         17320
          ...  
799995     4902
799996    13029
799997     8220
799998    11323
799999     8370
Name: earliesCreditLine, Length: 800000, dtype: int64

Unnamed: 0,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,title,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDate_year,issueDate_month,issueDate_weekday,issueDate_is_weekend,earliesCreditLine_year,earliesCreditLine_month
999995,7000.0,3,11.14,229.64,1,6,72102,7.0,1,30000.0,2,,4,36,13,32.92,0.0,685.0,689.0,8.0,0.0,0.0,5021.0,54.0,17.0,1,0,10671,0.0,2.0,2.0,2.0,2.0,6.0,11.0,2.0,6.0,2.0,8.0,0.0,0.0,0.0,4.0,2012,10,0,0,2005,11
999996,6000.0,3,6.24,183.19,0,1,15541,1.0,1,56000.0,0,,0,261,8,20.38,0.0,720.0,724.0,25.0,0.0,0.0,10163.0,35.9,27.0,0,0,0,0.0,5.0,6.0,6.0,5.0,5.0,14.0,12.0,13.0,6.0,25.0,0.0,0.0,0.0,0.0,2015,10,3,0,2006,10
999997,14000.0,5,15.88,339.57,2,13,63958,8.0,2,80000.0,2,,4,46,27,15.12,0.0,675.0,679.0,21.0,0.0,0.0,31401.0,86.0,63.0,0,0,4,0.0,7.0,13.0,13.0,9.0,21.0,18.0,21.0,42.0,13.0,21.0,0.0,0.0,0.0,0.0,2013,7,0,0,2001,12
999998,8000.0,3,18.06,289.47,3,16,73,4.0,1,190000.0,0,,0,99,8,9.33,0.0,690.0,694.0,11.0,0.0,0.0,28493.0,64.0,24.0,0,0,0,0.0,3.0,6.0,6.0,4.0,12.0,5.0,8.0,19.0,6.0,11.0,0.0,0.0,0.0,2.0,2017,10,6,1,2005,8
999999,8000.0,3,6.68,245.85,0,2,228,7.0,1,46000.0,0,,4,31,2,6.47,0.0,715.0,719.0,4.0,0.0,0.0,7608.0,65.6,10.0,0,0,4,0.0,2.0,2.0,2.0,2.0,3.0,4.0,3.0,4.0,2.0,4.0,0.0,0.0,0.0,0.0,2015,2,6,1,2002,8


In [4]:
# data.iloc[:train_size,:].to_csv('./data/train_processed.csv')
# data.iloc[train_size:,:].to_csv('./data/test_processed.csv')
data.to_csv ('./data/data_processed_v6_1.csv', index=False)


In [98]:
# def find_outliers_by_3segama(data, fea):
#     data_std = np.std(data[fea])
#     data_mean = np.mean(data[fea])
#     outliers_cut_off = data_std * 3
#     lower_rule = data_mean - outliers_cut_off
#     upper_rule = data_mean + outliers_cut_off
#     data = data[(data[fea] >= lower_rule) & (data[fea] <= upper_rule)]
#     # data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
#     print(data.shape)
#     return data
# data_train = pd.read_csv("./data/train.csv")
# data_test = pd.read_csv("./data/testA.csv")
# data = pd.concat([data_train,data_test], axis=0)
# display(data.shape)
# train_data = data.iloc[:800000, :]
# test_da = find_outliers_by_3segama(train_data,"interestRate")
# test_da.shape

In [104]:
data_process.iloc[800000:,:].isna().sum()

loanAmnt                   0.0
term                       0.0
interestRate               0.0
installment                0.0
grade                      0.0
subGrade                   0.0
employmentTitle            0.0
employmentLength           0.0
homeOwnership              0.0
annualIncome               0.0
verificationStatus         0.0
isDefault                  0.0
purpose                    0.0
postCode                   0.0
regionCode                 0.0
dti                        0.0
delinquency_2years         0.0
ficoRangeLow               0.0
ficoRangeHigh              0.0
openAcc                    0.0
pubRec                     0.0
pubRecBankruptcies         0.0
revolBal                   0.0
revolUtil                  0.0
totalAcc                   0.0
initialListStatus          0.0
applicationType            0.0
title                      0.0
n0                         0.0
n1                         0.0
n2                         0.0
n3                         0.0
n4      