## Created by yunsuxiaozi 2024/3/12

### This is the notebook after a two-week break from the competition,you can check the running time of my program.After the competition restarted, I have experienced "threw exception" three times, and here I am trying to implement a baseline with as few files as possible.Linear regression is used as the baseline here.

### Import necessary libraries

In [1]:
import polars as pl#和pandas类似,但是处理大型数据集有更好的性能.
#necessary
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
#model
from lightgbm import LGBMClassifier
#metric
from sklearn.metrics import roc_auc_score#导入roc_auc曲线
#KFold是直接分成k折,StratifiedKFold还要考虑每种类别的占比
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD#截断奇异值分解,是一种数据降维的方法
import dill#对对象进行序列化和反序列化(例如保存和加载树模型)
import gc#垃圾回收模块
import time#标准库的时间模块
#为了方便后期调用训练的模型时不会调用错版本,提供模型训练的时间
#time.strftime()函数用于将时间对象格式化为字符串，time.localtime()函数返回表示当前本地时间的time.struct_time对象
current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print("this notebook training time is ", current_time)

this notebook training time is  2024-03-12 01:57:17


### Config and random seed.

In [2]:
#config
class Config():
    seed=2024
    num_folds=10
    TARGET_NAME ='target'
    batch_size=1000#由于不知道测试数据的大小,所以分批次放入模型.
import random#提供了一些用于生成随机数的函数
#设置随机种子,保证模型可以复现
def seed_everything(seed):
    np.random.seed(seed)#numpy的随机种子
    random.seed(seed)#python内置的随机种子
seed_everything(Config.seed)

### Feature engineer

In [3]:
def preprocessor(mode='train'):#mode='train'|'test'
    #base 文件
    print("base file")
    feats=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_base.csv")
        
    print("deposit file num 1")
    deposit=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_deposit_1.csv")
    #数值列的特征工程  从1开始是为了把'case_id'去掉    
    for idx in range(1,len(deposit.columns)):
        col=deposit.columns[idx]
        column_type = deposit[col].dtype
        is_numeric = (column_type == pl.datatypes.Int64) or (column_type == pl.datatypes.Float64) 
        if is_numeric:#数值列构造特征
            feat=deposit.group_by('case_id').agg( pl.max(col).alias(f"max_deposit_{col}"),
                                           pl.mean(col).alias(f"mean_deposit_{col}"),
                                           pl.median(col).alias(f"median_deposit_{col}"),
                                           pl.std(col).alias(f"std_deposit_{col}"),
                                           pl.min(col).alias(f"min_deposit_{col}"),
                                           pl.count(col).alias(f"count_deposit_{col}"),
                                           pl.sum(col).alias(f"sum_deposit_{col}"),
                                           pl.n_unique(col).alias(f"n_unique_deposit_{col}"),
                                           pl.first(col).alias(f"first_deposit_{col}"),
                                           pl.last(col).alias(f"last_deposit_{col}")
                                         )
            feats=feats.join(feat,on='case_id',how='left')

    #static_cb文件
    print("static_cb_file num 1")
    static_cb=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_cb_0.csv")
    feats=feats.join(static_cb,on='case_id',how='left')
    del static_cb
    gc.collect()#手动触发垃圾回收,强制回收由垃圾回收器标记为未使用的内存
     
    return feats

print("---------Feature Engineer-------------\n")
train_feats=preprocessor(mode='train')
#由于这是时间序列数据,加上数据量太大,构造太多的特征就会超内存了,所以这里考虑就用最后一年的数据来训练模型,以达到最好效果.
#这里的时间差不多是从2019年10月初开始.
train_feats=train_feats.filter(train_feats['WEEK_NUM']>=40)
print(f"len(train_feats):{len(train_feats)}")

test_feats=preprocessor(mode='test')
print(f"len(test_feats):{len(test_feats)}")

print("---------Feature Transformer-------------\n")
"""
下面是对特征进行处理,提取出来的特征最初是有字符串类型和数值类型.
1.先对字符串 类别型变量 进行 one hot encoder
2.剩下的字符串直接去掉,一列有唯一值的去掉,缺失值占比>0.95的列选择drop.
这样数据中就只有数值列了.
3.填充缺失值,这里考虑用-1填充.
4.数值列先将相关性高达0.99的列只保留一列,其余去掉.
5.对相关性高的那些列进行降维处理.
"""

#如果打开的两个文件的相同列一个是浮点数类型,一个是object或者str,就把两个都转成浮点数类型.
for col in test_feats.columns:
    if (train_feats[col].dtype==pl.datatypes.Float64) or (test_feats[col].dtype==pl.datatypes.Float64):
        train_feats.with_columns(train_feats[col].cast(pl.datatypes.Float64))
        test_feats.with_columns(test_feats[col].cast(pl.datatypes.Float64))
train_feats=train_feats.to_pandas()
test_feats=test_feats.to_pandas()

#对字符串特征列进行独热编码的转换
print("----------string one hot encoder ****")
for col in test_feats.columns:
    n_unique=train_feats[col].nunique()
    #如果是类别型变量的话,独热编码转换
    #如果类别是2类,像性别一样,如果是(0,1)了,或者说数值类型的话,没必要转换.如果是字符串类型的话,转换成数值
    if n_unique==2 and train_feats[col].dtype=='object':
        print(f"one_hot_2:{col}")
        unique=train_feats[col].unique()
        #随便选择一个类别进行转换,比如gender='Female'
        train_feats[col]=(train_feats[col]==unique[0]).astype(int)
        test_feats[col]=(test_feats[col]==unique[0]).astype(int)
    elif (n_unique<10) and train_feats[col].dtype=='object':#由于内存有限 类别型变量的n_unique设置为20
        print(f"one_hot_10:{col}")
        unique=train_feats[col].unique()
        for idx in range(len(unique)):
            if unique[idx]==unique[idx]:#这里是为了避免字符串中存在nan值的情况
                train_feats[col+"_"+str(idx)]=(train_feats[col]==unique[idx]).astype(int)
                test_feats[col+"_"+str(idx)]=(test_feats[col]==unique[idx]).astype(int)
        train_feats.drop([col],axis=1,inplace=True)
        test_feats.drop([col],axis=1,inplace=True)

#如果是字符串的列或者一列只有唯一值,去掉
print("----------drop other string or unique value full null value ****")
drop_cols=[]
for col in test_feats.columns:
    if (train_feats[col].dtype=='object') or (test_feats[col].dtype=='object') \
        or (train_feats[col].nunique()==1) or train_feats[col].isna().mean()>0.95:
        drop_cols+=[col]
#case_id目前看来和id一样没什么用,WEEK_NUM在测试数据中比训练数据大.
drop_cols+=['case_id','WEEK_NUM','MONTH']
print(f"len(drop_cols):{len(drop_cols)},drop_cols:{drop_cols}")
train_feats=train_feats.drop(drop_cols,axis=1)
test_feats=test_feats.drop(drop_cols,axis=1)

print("----------fillna value ****")
train_feats.fillna(-1,inplace=True)
test_feats.fillna(-1,inplace=True)

print(f"len(drop_cols):{len(drop_cols)},total_features_count:{len(test_feats.columns)}")
train_feats.head()

---------Feature Engineer-------------

base file
deposit file num 1
static_cb_file num 1
len(train_feats):771863
base file
deposit file num 1
static_cb_file num 1
len(test_feats):10
---------Feature Transformer-------------

----------string one hot encoder ****
one_hot_10:assignmentdate_238D
one_hot_10:birthdate_574D
one_hot_2:description_5085714M
one_hot_10:education_1103M
one_hot_10:education_88M
one_hot_10:maritalst_385M
one_hot_10:maritalst_893M
one_hot_10:requesttype_4525192L
one_hot_10:responsedate_1012D
----------drop other string or unique value full null value ****
len(drop_cols):48,drop_cols:['date_decision', 'std_deposit_amount_416A', 'std_deposit_num_group1', 'min_deposit_num_group1', 'assignmentdate_4527235D', 'assignmentdate_4955616D', 'contractssum_5085716L', 'dateofbirth_337D', 'dateofbirth_342D', 'for3years_128L', 'for3years_504L', 'for3years_584L', 'formonth_118L', 'formonth_206L', 'formonth_535L', 'forquarter_1017L', 'forquarter_462L', 'forquarter_634L', 'fortoday_

Unnamed: 0,target,max_deposit_amount_416A,mean_deposit_amount_416A,median_deposit_amount_416A,min_deposit_amount_416A,count_deposit_amount_416A,sum_deposit_amount_416A,n_unique_deposit_amount_416A,first_deposit_amount_416A,last_deposit_amount_416A,...,requesttype_4525192L_1,requesttype_4525192L_2,responsedate_1012D_1,responsedate_1012D_2,responsedate_1012D_3,responsedate_1012D_4,responsedate_1012D_5,responsedate_1012D_6,responsedate_1012D_7,responsedate_1012D_8
0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
1,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1,0,0,0,0,0,0,0,0,0
2,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
3,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
4,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,1,0,0,0,0,0,0,0


### Feature Selection

In [4]:
#我们这里就是找相关性特别高的特征对,可以考虑对它们进行降维操作.
#计算两组变量的皮尔逊相关系数
def pearson_corr(x1,x2):
    """
    x1,x2:np.array
    """
    mean_x1=np.mean(x1)
    mean_x2=np.mean(x2)
    std_x1=np.std(x1)
    std_x2=np.std(x2)
    pearson=np.mean((x1-mean_x1)*(x2-mean_x2))/(std_x1*std_x2)
    return pearson
#有没有和target相关性特别高的特征,拿来做逻辑回归
choose_cols=[]
for col in train_feats.columns:
    if col!='target':
        pearson=pearson_corr(train_feats[col].values,train_feats['target'].values) 
        if abs(pearson)>0.002:
            choose_cols.append(col)
len(choose_cols),choose_cols

(53,
 ['max_deposit_amount_416A',
  'mean_deposit_amount_416A',
  'median_deposit_amount_416A',
  'min_deposit_amount_416A',
  'count_deposit_amount_416A',
  'sum_deposit_amount_416A',
  'n_unique_deposit_amount_416A',
  'first_deposit_amount_416A',
  'last_deposit_amount_416A',
  'max_deposit_num_group1',
  'mean_deposit_num_group1',
  'median_deposit_num_group1',
  'count_deposit_num_group1',
  'sum_deposit_num_group1',
  'n_unique_deposit_num_group1',
  'first_deposit_num_group1',
  'last_deposit_num_group1',
  'days120_123L',
  'days180_256L',
  'days30_165L',
  'days360_512L',
  'days90_310L',
  'description_5085714M',
  'firstquarter_103L',
  'fourthquarter_440L',
  'numberofqueries_373L',
  'secondquarter_766L',
  'thirdquarter_1082L',
  'assignmentdate_238D_1',
  'education_1103M_0',
  'education_1103M_1',
  'education_1103M_2',
  'education_1103M_3',
  'education_1103M_4',
  'education_88M_0',
  'education_88M_1',
  'education_88M_2',
  'education_88M_3',
  'maritalst_385M_0',

### K-fold and Model training

In [5]:
from sklearn.linear_model import LinearRegression
# 创建逻辑回归模型
model = LinearRegression()

#保存训练好的树模型,obj是保存的模型,path是需要保存的路径
def pickle_dump(obj, path):
    #打开指定的路径path,binary write(二进制写入)
    with open(path, mode="wb") as f:
        #将obj对象保存到f,使用协议版本4进行序列化
        dill.dump(obj, f, protocol=4)
        
X=train_feats[choose_cols].copy()
y=train_feats[Config.TARGET_NAME].copy()
test_X=test_feats[choose_cols].copy()#.drop([Config.TARGET_NAME],axis=1)
oof_pred_pro=np.zeros((len(X)))
test_pred_pro=np.zeros((Config.num_folds,len(test_X)))
#10折交叉验证
skf = StratifiedKFold(n_splits=Config.num_folds,random_state=Config.seed, shuffle=True)

for fold, (train_index, valid_index) in (enumerate(skf.split(X, y.astype(str)))):
    print(f"fold:{fold}")

    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    model.fit(X_train,y_train)

    oof_pred_pro[valid_index]=model.predict(X_valid)
    #将数据分批次进行预测.
    for idx in range(0,len(test_X),Config.batch_size):
        test_pred_pro[fold][idx:idx+Config.batch_size]=model.predict(test_X[idx:idx+Config.batch_size]) 
    pickle_dump(model, f'/kaggle/working/linear_fold{fold}.model') #保存训练好的模型  

gini=2*roc_auc_score(y.values,oof_pred_pro)-1
print(f"mean_gini:{gini}")

fold:0
fold:1
fold:2
fold:3
fold:4
fold:5
fold:6
fold:7
fold:8
fold:9
mean_gini:0.2948356116504791


### Submission

In [6]:
test_preds=test_pred_pro.mean(axis=0)
submission=pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv")
submission['score']=np.clip(np.nan_to_num(test_preds,nan=0.3),0,1)
submission.to_csv("submission.csv",index=None)
submission.head()

Unnamed: 0,case_id,score
0,57543,0.028631
1,57549,0.09167
2,57551,0.011964
3,57552,0.029371
4,57569,0.036168
