In [1]:
import numpy as np
import pandas as pd

from datetime import datetime

import re

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

# Configure matplotlib so Chinese labels render correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # use the SimHei font
    'axes.unicode_minus': False,    # work around the minus-sign display problem
})

In [2]:
# Path of the source workbook.
file_path = "data20240512.xlsx"

# Number of leading rows kept for the analysis (the original note describes
# these as the rows that can be matched).
N_MATCHED_ROWS = 300

# Load the sheet, shorten the two long outcome column headers, and keep only
# the leading rows.  The explicit .copy() makes `data` an independent frame:
# later cells assign new columns to it, which on a bare row slice triggers
# SettingWithCopyWarning and is not guaranteed to behave as intended.
data = (
    pd.read_excel(file_path, sheet_name='Sheet1')
    .rename(columns={'疾病进展发生日期PFS（0代表统计时未进展）': 'PFS', '发生时间OS': 'OS'})
    .iloc[:N_MATCHED_ROWS]
    .copy()
)
print(data.shape)

(300, 90)


In [3]:
formats = ["%Y%m%d", "%Y/%m/%d", "%Y%m", "%Y.%m.%d", "%Y.%m", "%Y-%m-%d", "%Y-%m"]

# Formats that are only tried when the text has exactly this many characters.
# strptime accepts non-zero-padded month/day fields, so without this guard the
# separator-less "%Y%m%d" would read a year-month such as "202111" as
# 2021-01-01 instead of letting "%Y%m" parse it as November 2021.
_REQUIRED_WIDTH = {"%Y%m%d": 8}


def convertDate(text):
    """Parse a date string using the first matching entry of ``formats``.

    Args:
        text: Date text, e.g. "20210304", "2021/3/4", "202103", "2021.03.04",
            "2021.03", "2021-03-04" or "2021-03".  Year-month inputs resolve
            to the first day of that month.

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: If no format matches.  Separator-less year-month-day text
            must be fully zero-padded (8 digits); shorter digit runs are not
            guessed at with that format.
    """
    for f in formats:
        width = _REQUIRED_WIDTH.get(f)
        if width is not None and len(text) != width:
            continue
        try:
            return datetime.strptime(text, f)
        except ValueError:
            pass
    raise ValueError(f"Cannot convert date: {text}")

def _pfs_days(start, pfs_value):
    """Return the PFS duration in days for one row.

    Args:
        start: Value of the 'ICI starting time' column; must support
            ``strftime`` (a missing start date raises, as before).
        pfs_value: Raw 'PFS' cell.  Its first whitespace-separated token is
            interpreted, in order, as a date (result: days from ``start`` to
            that date), as an integer day count, or otherwise as 0.

    Returns:
        An integer number of days; 0 when the cell is blank or unparseable.
    """
    # Truncate the start to a calendar day so the difference is in whole days.
    began = convertDate(start.strftime('%Y-%m-%d'))
    tokens = str(pfs_value).split()
    if not tokens:
        # Blank / whitespace-only cell: treat like a missing value.
        return 0
    token = tokens[0].strip()
    try:
        return (convertDate(token) - began).days
    except ValueError:
        pass
    try:
        return int(token)
    except ValueError:
        return 0


# Missing PFS means no progression recorded; encode it as 0 like the sheet does.
# Assign the result back instead of a chained in-place fillna, which is not
# guaranteed to modify `data`.
data['PFS'] = data['PFS'].fillna(0)

# Compute all durations in one pass and build the column once.  Growing a
# DataFrame row by row with DataFrame.append is quadratic and the method no
# longer exists in pandas >= 2.0.
pfs_days = pd.Series(
    [_pfs_days(start, pfs) for start, pfs in zip(data['ICI starting time'], data['PFS'])],
    index=data.index,
    dtype='float64',
)

# Note: data problem - negative durations occur; record them as 0.
pfs_days = pfs_days.clip(lower=0)

# 0 is recorded as censored (NaN).  The result is stored on `data`; previously
# the replaced series was only displayed and the frame kept its zeros.
# Direct column assignment also keeps the cell idempotent on re-run.
data['PFSprocessed'] = pfs_days.replace(0, np.nan)
data['PFSprocessed']

0         NaN
1       280.0
2         NaN
3       544.0
4         NaN
        ...  
295     125.0
296    1125.0
297     162.0
298     256.0
299     334.0
Name: PFSprocessed, Length: 300, dtype: float64

In [4]:
# # 查看数据类型
# for row_index in range(300):
#     # 查看指定行的数据类型
#     value = data.loc[row_index, 'OS']

#     if isinstance(value, datetime):
#         data_type = 'datetime'
#     else:
#         data_type = type(value)

#     print(f"第 {row_index} 行的数据类型为：{data_type}")
    

# Missing content is recorded as: outcome not observed (0).
# Assign the filled column back; a chained `inplace=True` call on a column
# selection may act on a temporary and leave `data` unchanged.
data['OS'] = data['OS'].fillna(0)
# Map one raw 'OS' cell to an event indicator based on its type.
def convert_data_type(value):
    """Convert a raw OS cell to 0 (no event observed) or 1 (event observed).

    Args:
        value: A single cell value.

    Returns:
        0 for a value equal to 0, for any string, and for NaN;
        1 for a datetime or any other real number (Python or numpy int/float);
        the value itself, unchanged, for any other type.
    """
    if value == 0 or isinstance(value, str):
        return 0
    if isinstance(value, datetime):
        return 1
    # Accept floats and numpy scalars as well as plain ints: a numeric column
    # that contained missing values is stored as float, and those cells would
    # otherwise fall through and leak raw numbers into the indicator.
    if isinstance(value, (int, float, np.integer, np.floating)):
        # NaN is the only real number that is not equal to itself; it means
        # "missing" and is therefore treated as not observed.
        return 0 if value != value else 1
    return value  # other types are left unchanged

# Derive the event-indicator column from the raw OS column, then report how
# many rows fall into each outcome.
os_indicator = data['OS'].apply(convert_data_type)
data['OSprocessed'] = os_indicator
outcome_counts = data['OSprocessed'].value_counts()
print('数据的结局情况：', outcome_counts)

数据的结局情况： 0    230
1     70
Name: OSprocessed, dtype: int64


In [5]:
# Write the processed DataFrame to an Excel workbook, without the index column.
processed_path = 'data_processed.xlsx'
data.to_excel(processed_path, index=False)