In [1]:
import pandas as pd

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 9999999)

In [2]:
def generate_feature(row):
    # 将两个数字拼接
    list = []    
    list.append(row['hypertension'])
    list.append(row['heart_disease'])
    return list

def generate_text(row):
    return f"""
gender: {row['gender']}
age: {row['age']}
ever_married: {row['ever_married']}
work_type: {row['work_type']}
Residence_type: {row['Residence_type']}
avg_glucose_level: {row['avg_glucose_level']}
bmi: {row['bmi']}
smoking_status: {row['smoking_status']}
""".strip()

def generate_label(row):
    return row['stroke']

In [45]:
# 打开data\healthcare-dataset-stroke-data.csv文件
train_df = pd.read_csv('./dataset.csv') # 数据地址: https://www.kaggle.com/datasets/shashwatwork/cerebral-stroke-predictionimbalaced-dataset
eval_df = pd.read_csv('./healthcare-dataset-stroke-data.csv') # 数据地址: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/
# 将数据集中的数据按照一定的规则进行处理
new_train_df = pd.DataFrame()
new_train_df['feature'] = train_df.apply(generate_feature, axis=1)
new_train_df['text'] = train_df.apply(generate_text, axis=1)
new_train_df['label'] = train_df.apply(generate_label, axis=1)
new_eval_df = pd.DataFrame()
new_eval_df['feature'] = eval_df.apply(generate_feature, axis=1)
new_eval_df['text'] = eval_df.apply(generate_text, axis=1)
new_eval_df['label'] = eval_df.apply(generate_label, axis=1)


# 查看处理后的数据
print(new_train_df.head())
print(new_eval_df.head())

# train 的0, 1标签各保留1024个 但是1标签显然不足1024个 正好可以模拟数据不均衡的情况
new_train_df = new_train_df.groupby('label').head(1024)

# eval 的0, 1标签各保留128个
new_eval_df = new_eval_df.groupby('label').head(128)


  feature                                               text  label
0  [0, 0]  gender: Male\nage: 3.0\never_married: No\nwork...      0
1  [1, 0]  gender: Male\nage: 58.0\never_married: Yes\nwo...      0
2  [0, 0]  gender: Female\nage: 8.0\never_married: No\nwo...      0
3  [0, 0]  gender: Female\nage: 70.0\never_married: Yes\n...      0
4  [0, 0]  gender: Male\nage: 14.0\never_married: No\nwor...      0
  feature                                               text  label
0  [0, 1]  gender: Male\nage: 67.0\never_married: Yes\nwo...      1
1  [0, 0]  gender: Female\nage: 61.0\never_married: Yes\n...      1
2  [0, 1]  gender: Male\nage: 80.0\never_married: Yes\nwo...      1
3  [0, 0]  gender: Female\nage: 49.0\never_married: Yes\n...      1
4  [1, 0]  gender: Female\nage: 79.0\never_married: Yes\n...      1


In [46]:
# 将新数据以datasets库中的数据集的形式保存
from datasets import Dataset, DatasetDict, load_from_disk
train_dataset = Dataset.from_pandas(new_train_df)
eval_dataset = Dataset.from_pandas(new_eval_df)

# 融合为一个数据集
dataset_dict = DatasetDict({
    'train': train_dataset, 
    'eval': eval_dataset
})

In [47]:
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['feature', 'text', 'label'],
        num_rows: 43400
    })
    eval: Dataset({
        features: ['feature', 'text', 'label', '__index_level_0__'],
        num_rows: 256
    })
})


In [48]:
# 保存数据集
dataset_dict.save_to_disk('./healthcare_stroke')

Saving the dataset (0/1 shards):   0%|          | 0/43400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/256 [00:00<?, ? examples/s]

In [49]:
# 加载查看
train_dataset = load_from_disk('healthcare_stroke')['train']
print(train_dataset)
eval_dataset = load_from_disk('healthcare_stroke')['eval']
print(eval_dataset)

Dataset({
    features: ['feature', 'text', 'label'],
    num_rows: 43400
})
Dataset({
    features: ['feature', 'text', 'label', '__index_level_0__'],
    num_rows: 256
})


In [50]:
# 查看两个集的类别分布
zero = 0
one = 0
for i in train_dataset['label']:
    if i == 0:
        zero += 1
    else:
        one += 1
print('train_dataset:', zero, one)

train_dataset: 42617 783


In [51]:
zero = 0
one = 0
for i in eval_dataset['label']:
    if i == 0:
        zero += 1
    else:
        one += 1
print('eval_dataset:', zero, one)

eval_dataset: 128 128
