In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [7]:
# Load the processed training table, the processed test table and the
# submission file (its CUST_ID column is reused when building the result).
df = pd.read_csv('./data/processed.csv')
df_test = pd.read_csv('./data/processed_test.csv')
df_submit = pd.read_csv('./data/submission.csv')

# Field-description workbook -> {column name: label} lookup used in prompts.
field_sheet = pd.read_excel('./data/字段说明.xlsx')
feature_info = {
    name: label
    for name, label in zip(field_sheet['名称'], field_sheet['标签'])
}

In [3]:
# Rank every feature by the absolute value of its correlation with the
# target column 'bad_good' and keep the strongest ones for the prompt.
TOP_K_FEATURES = 64  # number of most-correlated columns to keep

target_corr = []
for col in df.columns:
    if col == 'bad_good':
        continue
    corr_value = abs(df['bad_good'].corr(df[col]))
    # Series.corr can return NaN (e.g. for a constant column). NaN compares
    # False against everything, so leaving it in would make the sort order,
    # and therefore the selected columns, unreliable.
    if pd.notna(corr_value):
        target_corr.append((col, corr_value))

# Ascending sort: the tail of the list holds the strongest correlations.
target_corr.sort(key=lambda item: item[1])
cols = [item[0] for item in target_corr[-TOP_K_FEATURES:]]

In [4]:
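# Prompt template. With the 64 selected columns plus the instruction text, each query is at most 1279 long (unit not stated here, presumably characters or tokens; TODO confirm).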
def create_prompt(example, columns=None, labels=None):
    """Render one record as the Chinese instruction prompt.

    Args:
        example: Row-like object indexable by column name (e.g. a pandas
            Series or a dict).
        columns: Feature columns to include, in output order. Defaults to
            the notebook-level ``cols`` list.
        labels: Mapping from column name to the label shown in the prompt.
            Defaults to the notebook-level ``feature_info`` dict.

    Returns:
        A three-line string: a fixed header, one line of ``label：value；``
        pairs, and the instruction asking for 0 or 1 only.

    Raises:
        KeyError: If a column is missing from ``labels`` or ``example``.
    """
    # Fall back to the notebook globals only when the caller gave nothing,
    # so existing ``create_prompt(example)`` calls keep working.
    if columns is None:
        columns = cols
    if labels is None:
        labels = feature_info
    data = ''.join(f'{labels[col]}：{example[col]}；' for col in columns)
    return f"以下为某银行真实信贷用户信息：\n{data}\n请根据这些信息判断该用户信贷是否逾期，仅输出0或1："
# Save the preprocessed data as a JSONL file (one query/response record per line)
def save_dataset(data, file_path):
    """Write ``data`` to ``file_path`` as UTF-8 JSON Lines.

    Each row becomes one JSON object with two keys: ``query`` (the prompt
    built by ``create_prompt``) and ``response`` (the row's ``bad_good``
    value cast to int and then to str). Non-ASCII text is written as-is
    rather than escaped. An existing file is overwritten.

    Args:
        data: DataFrame containing the prompt columns and ``bad_good``.
        file_path: Destination path of the ``.jsonl`` file.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        for _, row in data.iterrows():
            record = {
                'query': create_prompt(row),
                'response': str(int(row['bad_good'])),
            }
            line = json.dumps(record, ensure_ascii=False)
            out.write(f'{line}\n')
# Sample the data and split it into training and validation sets
def sample_and_split_data(data, sample_size=10000, valid_size=0.1, random_state=42):
    """Draw a random subsample of ``data`` and split it into train/validation.

    Args:
        data: DataFrame to sample from.
        sample_size: Number of rows to draw (without replacement).
        valid_size: Passed to ``train_test_split`` as ``test_size``; the
            share (or count) of sampled rows that goes to validation.
        random_state: Seed used for both the sampling and the split so that
            re-running the notebook yields the same datasets. Pass ``None``
            to get a different random draw on every call.

    Returns:
        ``(train_data, valid_data)`` as two DataFrames.
    """
    sampled_data = data.sample(sample_size, random_state=random_state)
    train_data, valid_data = train_test_split(
        sampled_data, test_size=valid_size, random_state=random_state
    )
    return train_data, valid_data


In [33]:
# Columns that go into every dataset: the selected features plus the label.
selected_columns = cols + ['bad_good']

# The test frame gets a placeholder label of -1 so it can be selected and
# saved with the same column list as the training frame.
df_test['bad_good'] = -1

data = df.loc[:, selected_columns]
test_data = df_test.loc[:, selected_columns]
train_data, valid_data = sample_and_split_data(
    data,
    sample_size=10000,
    valid_size=0.1,
)

In [38]:
# Output locations of the three JSONL datasets.
train_file_path = './data/train_dataset.jsonl'
valid_file_path = './data/valid_dataset.jsonl'
test_file_path = './data/test_dataset.jsonl'

# Write train, then valid, then test.
for split_frame, split_path in (
    (train_data, train_file_path),
    (valid_data, valid_file_path),
    (test_data, test_file_path),
):
    save_dataset(split_frame, split_path)

In [8]:
# Run the cells below only after the model has been fine-tuned and the
# inference results on the test set have been produced.
df_result = pd.read_json('output/qwen1half-1_8b-chat/v0-20240629-164430/checkpoint-562/infer_result/20240629-175456.jsonl', lines=True)

# Predictions are attached to customer IDs purely by row index, so a
# row-count mismatch would silently yield NaN or dropped rows in the result.
assert len(df_result) == len(df_submit), (
    f'inference result has {len(df_result)} rows but submission file has {len(df_submit)}'
)

df_pred = pd.DataFrame({'bad_good': df_result['response']})
df_pred.insert(0, 'CUST_ID', df_submit['CUST_ID'])

In [9]:
# Show how many rows were predicted for each label value.
df_pred.bad_good.value_counts()

bad_good
0    186690
1      3076
Name: count, dtype: int64

In [None]:
# Write the predictions to CSV without the index column, overwriting any
# existing file ('w' is already to_csv's default mode).
df_pred.to_csv('result_Qwen1.5.csv', index=False)