In [1]:
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [2]:
def default_Y_generator(df):
    """Attach the binary default label ``存在逾期贷款`` to a copy of ``df``.

    For every row the label is 1 when the *following* row of the same
    ``Stkcd`` group has ``逾期金额`` > 0, otherwise 0. The last row of each
    group has no following row and is labelled 0, as is any row whose
    successor has a missing amount.

    The look-ahead relies on the existing row order inside each group; the
    frame is not sorted here.
    NOTE(review): presumably rows are already in chronological order per
    company -- confirm against the data source.

    Args:
        df: DataFrame containing at least the columns ``Stkcd`` and
            ``逾期金额``.

    Returns:
        A new DataFrame equal to ``df`` plus the int64 column
        ``存在逾期贷款``. The input frame is left untouched, matching the
        copy-first behaviour of the other helpers in this file.
    """
    labelled = df.copy()

    # Diagnostic only: how many rows currently carry an overdue amount.
    overdue_rows = labelled[labelled['逾期金额'] > 0]
    print(f'违约数据集大小：{overdue_rows.shape}')

    # Overdue amount of the next row within the same company.
    next_overdue = labelled.groupby('Stkcd')['逾期金额'].shift(-1)
    # Missing successors are treated as "no overdue amount" before comparing.
    labelled['存在逾期贷款'] = next_overdue.fillna(0).gt(0).astype('int64')
    print(f'生成违约Y：{labelled["存在逾期贷款"].value_counts()}')
    return labelled

def isnull_filter(df, threshold):
    """Drop every column whose share of missing values exceeds ``threshold``.

    Args:
        df: Input DataFrame.
        threshold: Largest tolerated fraction of nulls per column; a column
            whose fraction equals the threshold is kept.

    Returns:
        A DataFrame restricted to the retained columns, in their original
        order.
    """
    # Fraction of null cells per column.
    null_share = df.isnull().mean()

    # Report what is about to be removed.
    columns_to_drop = null_share.loc[null_share > threshold].index
    print(f'根据缺失比例，删去的列：{columns_to_drop}')

    # Keep the columns at or below the threshold.
    retained = null_share.loc[null_share <= threshold].index
    df = df[retained]
    print(f'缺失值筛选后数据集大小：{df.shape}')
    return df

def encode_generator(df):
    """Make every object-dtype column numeric, on a copy of ``df``.

    Each object column is first converted to float. If that succeeds the
    converted values are kept; previously the conversion result was thrown
    away and the column stayed as strings. If the conversion fails, the
    column is label-encoded with a fresh ``LabelEncoder``.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A tuple ``(encoded_df, label_encoders, encoded_columns)`` where
        ``label_encoders`` maps each label-encoded column name to its fitted
        encoder and ``encoded_columns`` lists those names in column order.
    """
    df = df.copy()
    label_encoders = {}
    encoded_columns = []
    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        try:
            # Numeric-looking text: keep the float version of the column.
            df[col] = df[col].astype(float)
        except (ValueError, TypeError):
            # ValueError: non-numeric strings. TypeError: values that float()
            # cannot accept at all. Both fall back to label encoding.
            print(f'编码 {col} 列...')
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
            encoded_columns.append(col)
    print(f'被编码的列名：{encoded_columns}')
    return df, label_encoders, encoded_columns

def correlation_filter(df, threshold=0.9):
    """Remove columns that are strongly correlated with an earlier column.

    The absolute correlation matrix from ``DataFrame.corr()`` is scanned
    over its upper triangle. For each pair whose value is strictly greater
    than ``threshold``, the later column of the pair is marked for removal.

    Args:
        df: Input DataFrame; it is not modified.
        threshold: Absolute-correlation cut-off (default 0.9).

    Returns:
        A tuple ``(df_filtered, high_corr_pairs, dropped)``: the frame
        without the marked columns, the list of ``(earlier, later)`` pairs
        found, and the marked column names as a list built from a set, so
        its order is not guaranteed.
    """
    working = df.copy()
    abs_corr = working.corr().abs()
    names = list(abs_corr.columns)
    strength = abs_corr.to_numpy()

    high_corr_pairs = []
    to_drop = set()
    for row, earlier in enumerate(names):
        # Only look right of the diagonal so each pair is visited once.
        for col in range(row + 1, len(names)):
            if strength[row, col] > threshold:
                later = names[col]
                high_corr_pairs.append((earlier, later))
                to_drop.add(later)

    dropped = list(to_drop)
    df_filtered = working.drop(columns=dropped)
    print(f'高相关性的列对：{high_corr_pairs}')
    print(f'被删除的列名：{dropped}')
    print(f'筛选后的 DataFrame 大小：{df_filtered.shape}')
    return df_filtered, high_corr_pairs, dropped

In [5]:
def train_and_evaluate_logistic_regression_with_tuning(df, target_column, test_size=0.2, random_state=42, C=[1.0], penalty=['l2']):
    """Grid-search a logistic-regression pipeline and score it on a hold-out set.

    The pipeline is mean imputation -> standard scaling -> liblinear logistic
    regression. ``GridSearchCV`` tries every combination of ``C`` and
    ``penalty`` with 3-fold cross-validation, selecting on accuracy, and the
    best estimator is then evaluated on the held-out test split.

    NOTE(review): the train/test split is not stratified and model selection
    uses plain accuracy -- confirm this is intended if the target is
    imbalanced.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the label column in ``df``.
        test_size: Fraction of rows held out for testing.
        random_state: Seed passed to ``train_test_split``.
        C: Candidate values for the ``C`` parameter of the classifier.
        penalty: Candidate values for the ``penalty`` parameter.

    Returns:
        ``(accuracy, classification_report_text, best_params)`` on success,
        or ``(None, None, None)`` if fitting or evaluation raises; in that
        case the error message is printed.
    """
    features = df.drop(columns=[target_column])
    labels = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('log_reg', LogisticRegression(solver='liblinear')),  # logistic-regression classifier
    ]
    search = GridSearchCV(
        estimator=Pipeline(steps),
        param_grid={'log_reg__C': C, 'log_reg__penalty': penalty},
        cv=3,
        n_jobs=-1,
        scoring='accuracy',
        error_score='raise',
    )

    try:
        search.fit(X_train, y_train)
        chosen_params = search.best_params_
        predictions = search.best_estimator_.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        report = classification_report(y_test, predictions)
        return accuracy, report, chosen_params
    except Exception as e:
        print(f"训练过程中发生错误: {e}")
        return None, None, None
def main():
    """Run the pipeline: load, label, clean, encode, filter, train, report.

    Steps:
        1. Read ``merged_data_final.csv`` and print its shape and head.
        2. Add the default label with ``default_Y_generator``.
        3. Drop columns with more than 30% missing values.
        4. Make object columns numeric with ``encode_generator``.
        5. Remove highly correlated *feature* columns.
        6. Train and evaluate the logistic-regression model, then print the
           accuracy, chosen hyper-parameters and classification report.

    The target column is held out of the correlation filter. It is the last
    column of the frame, so it would be the one removed whenever a feature
    correlates above the threshold with it, and training would then fail
    with a ``KeyError`` on the missing label.
    """
    target_column = '存在逾期贷款'

    df = pd.read_csv('merged_data_final.csv')
    print(f'原始数据集大小：{df.shape}')
    print(df.head(5))
    df = default_Y_generator(df)
    threshold = 0.3  # maximum tolerated share of missing values per column
    df = isnull_filter(df, threshold)
    encoded_df, _, _ = encode_generator(df)
    print('编码后的 DataFrame:')
    print(encoded_df.head(5))

    # Filter the features only, then put the label back as the last column.
    features_filtered, _, _ = correlation_filter(
        encoded_df.drop(columns=[target_column])
    )
    df_filtered = features_filtered.assign(
        **{target_column: encoded_df[target_column].to_numpy()}
    )
    print('筛选后的 DataFrame:')
    print(df_filtered.head(5))

    best_accuracy, best_report, best_params = train_and_evaluate_logistic_regression_with_tuning(
        df_filtered,
        target_column=target_column,
        test_size=0.2,
        random_state=42,
        C=[1.0],
        penalty=['l2']
    )

    if best_accuracy is not None:
        print(f"最佳准确率: {best_accuracy:.4f}")
        print(f"最佳超参数: {best_params}")
        print("最佳分类报告:")
        print(best_report)
    else:
        print("模型训练失败，请检查错误信息。")
# Entry point: run the whole pipeline when this code is executed directly.
if __name__ == '__main__':
    main()


原始数据集大小：(104851, 155)
   Stkcd ShortName_FS_Comins      Accper Typrep         营业总收入          营业收入  \
0      2                深万科A  2000-12-31      A  3.783669e+09  3.783669e+09   
1      2                深万科A  2001-06-30      A  2.432696e+09  2.432696e+09   
2      2                深万科A  2001-12-31      A  4.455065e+09  4.455065e+09   
3      2                 万科A  2002-06-30      A  1.501989e+09  1.501989e+09   
4      2                 万科A  2002-12-31      A  4.574360e+09  4.574360e+09   

   利息支出         营业总成本          营业成本  其中：利息费用(财务费用)  ...          其他借款  \
0   NaN  3.436558e+09  2.839928e+09            NaN  ...  6.000000e+08   
1   NaN  2.208490e+09  1.831923e+09            NaN  ...  1.131000e+09   
2   NaN  4.157431e+09  3.434440e+09            NaN  ...           NaN   
3   NaN  1.325194e+09  1.072018e+09            NaN  ...           NaN   
4   NaN  4.062517e+09  3.472886e+09            NaN  ...           NaN   

   资产负债率_y  信用借款比例  抵押借款比例  担保借款比例  逾期金额  展期金额  贷款偿还率        主营业

最佳准确率: 0.9489
最佳超参数: {'log_reg__C': 1.0, 'log_reg__penalty': 'l2'}
最佳分类报告:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     19868
           1       0.60      0.09      0.15      1103

    accuracy                           0.95     20971
   macro avg       0.78      0.54      0.56     20971
weighted avg       0.93      0.95      0.93     20971

