In [2]:
import pandas as pd

# 示例数据
data = {
    'Age': [23, 45, 67, 34, 22, 18, 90, 55, 43, 29, 31, 77, 85, 62, 49, 15, 5, 3, 99, 100]
}
df = pd.DataFrame(data)

# 创建年龄段区间
bins = list(range(0, 105, 5))  # 0-100，每5岁一个区间
labels = [f'{i+1}-{i+5}' for i in range(0, 100, 5)]  # 调整labels数量

# 将年龄分段，左开右闭
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

# 统计每个区间的个数
age_group_counts = df['Age_Group'].value_counts().sort_index()

# 计算每个区间的百分比
total_count = len(df)
age_group_percentages = (age_group_counts / total_count) * 100

# 合并统计结果
result = age_group_counts.astype(str) + ' (' + age_group_percentages.round(2).astype(str) + '%)'

# 输出统计结果
print(result)

Age_Group
1-5       2 (10.0%)
6-10       0 (0.0%)
11-15      1 (5.0%)
16-20      1 (5.0%)
21-25     2 (10.0%)
26-30      1 (5.0%)
31-35     2 (10.0%)
36-40      0 (0.0%)
41-45     2 (10.0%)
46-50      1 (5.0%)
51-55      1 (5.0%)
56-60      0 (0.0%)
61-65      1 (5.0%)
66-70      1 (5.0%)
71-75      0 (0.0%)
76-80      1 (5.0%)
81-85      1 (5.0%)
86-90      1 (5.0%)
91-95      0 (0.0%)
96-100    2 (10.0%)
Name: count, dtype: object


In [3]:
import pandas as pd

# 读取 Excel 文件
file_path = "/home/ubuntu/graduation-project/data/Extract/gemma2-9b_final_processed_20250106_211148_final.xlsx"
df = pd.read_excel(file_path)

# 创建年龄段区间
bins = list(range(0, 105, 5))  # 0-100，每5岁一个区间
labels = [f'{i+1}-{i+5}' for i in range(0, 100, 5)]  # 调整labels数量

# 将年龄分段，左开右闭
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

# 按 disease 分组
grouped = df.groupby('disease')

# 创建一个空的 DataFrame 来存储结果
result_df = pd.DataFrame()

# 统计每个分组的年龄段个数和百分比
for disease, group in grouped:
    age_group_counts = group['Age_Group'].value_counts().sort_index()
    total_count = len(group)
    age_group_percentages = (age_group_counts / total_count) * 100
    result = age_group_counts.astype(str) + ' (' + age_group_percentages.round(2).astype(str) + '%)'
    
    # 将结果添加到 result_df 中
    result_df[disease] = result

# 输出统计结果到新的 Excel 文件
output_path = "/home/ubuntu/graduation-project/data/statasticsgemma2-9b_age_group_statistics.xlsx"
result_df.to_excel(output_path, index=True)

print(f"统计结果已保存至: {output_path}")

统计结果已保存至: /home/ubuntu/graduation-project/data/statasticsgemma2-9b_age_group_statistics.xlsx


In [4]:
import pandas as pd

# 文件路径
source_file = "/home/ubuntu/graduation-project/data/EMR/gemma2-9b.xlsx"
target_file = "/home/ubuntu/graduation-project/data/Extract/outcome/gemma2-9b_final.xlsx"

# 读取源文件和目标文件
source_df = pd.read_excel(source_file)
target_df = pd.read_excel(target_file)

# 确保两者行数一致
if len(source_df) != len(target_df):
    raise ValueError("源文件和目标文件的行数不一致，无法直接复制disease列")

# 将disease列从源文件复制到目标文件
target_df['disease'] = source_df['disease']

# 保存修改后的目标文件
output_file = "/home/ubuntu/graduation-project/data/Extract/outcome/gemma2-9b_final_modified.xlsx"
target_df.to_excel(output_file, index=False)

print(f"disease列已更新，修改后的文件已保存至: {output_file}")

disease列已更新，修改后的文件已保存至: /home/ubuntu/graduation-project/data/Extract/outcome/gemma2-9b_final_modified.xlsx


In [5]:
import pandas as pd
import os
import glob

# 源文件路径
source_file = "/home/ubuntu/graduation-project/data/EMR/gemma2-9b.xlsx"
source_df = pd.read_excel(source_file)

# 目标目录路径
target_dir = "/home/ubuntu/graduation-project/data/Extract/cn_outcome"
target_files = glob.glob(os.path.join(target_dir, "*.xlsx"))

# 确保源文件中有disease列
if 'disease' not in source_df.columns:
    raise ValueError("源文件中没有disease列")

# 遍历目标目录下所有xlsx文件
for target_file in target_files:
    target_df = pd.read_excel(target_file)
    
    # 确保行数一致
    if len(source_df) != len(target_df):
        print(f"文件 {target_file} 行数不一致，跳过")
        continue
    
    # 修改disease列
    target_df['disease'] = source_df['disease']
    
    # 获取文件名和模型名称
    base_name = os.path.basename(target_file)
    model_name, _ = os.path.splitext(base_name)
    
    # 保存修改后的文件
    output_file = os.path.join(target_dir, f"{model_name}_v2.0.xlsx")
    target_df.to_excel(output_file, index=False)
    print(f"文件 {output_file} 已保存")

print("所有文件处理完成")

文件 /home/ubuntu/graduation-project/data/Extract/outcome/qwen2-0.5b_final_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/llama3.2-1b_final_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/gemma2-9b_final_modified_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/llama3.2-3b_final_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/gemma2-9b_final_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/qwen2-1.5b_final_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/llama3.1-8b_final_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/qwen2-7b_final_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/yi1.5-6b_final_v2.0.xlsx 已保存
文件 /home/ubuntu/graduation-project/data/Extract/outcome/GLMA-9B_final_v2.0.xlsx 已保存
所有文件处理完成


In [2]:
import pandas as pd
import os
from pathlib import Path

def process_age_statistics(input_dir, output_dir):
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    for file in os.listdir(input_dir):
        if file.endswith('.xlsx'):
            try:
                input_path = os.path.join(input_dir, file)
                # 读取Excel文件，确保Age列为数值类型
                df = pd.read_excel(input_path)
                
                # 检查Age列是否存在
                if 'Age' not in df.columns:
                    print(f"警告: {file} 中未找到Age列")
                    continue
                
                # 转换Age列为数值类型，无效值填充为NaN
                df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
                
                # 删除Age为空的行
                df = df.dropna(subset=['Age'])
                
                # 创建年龄段区间
                bins = list(range(0, 105, 5))
                labels = [f'{i}-{i+4}' for i in range(0, 100, 5)]
                
                # 将年龄分段
                df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)
                
                # 按疾病分组统计
                grouped = df.groupby('disease')
                result_df = pd.DataFrame()
                
                for disease, group in grouped:
                    age_counts = group['Age_Group'].value_counts().sort_index()
                    total = len(group)
                    percentages = (age_counts / total) * 100
                    result = age_counts.astype(str) + ' (' + percentages.round(2).astype(str) + '%)'
                    result_df[disease] = result
                
                output_filename = f"age_statistics_{os.path.splitext(file)[0]}.xlsx"
                output_path = os.path.join(output_dir, output_filename)
                result_df.to_excel(output_path, index=True)
                print(f"已处理: {file}")
                print(f"结果已保存至: {output_path}")
                
            except Exception as e:
                print(f"处理文件 {file} 时出错: {str(e)}")
                continue

# 执行处理
input_directory = "/home/ubuntu/graduation-project/data/cn-outcome_v3"
output_directory = "/home/ubuntu/graduation-project/data/cn-outcome_age"
process_age_statistics(input_directory, output_directory)

已处理: llama3.1-8b_v3.0.xlsx
结果已保存至: /home/ubuntu/graduation-project/data/cn-outcome_age/age_statistics_llama3.1-8b_v3.0.xlsx
已处理: qwen2-0.5_v3.0.xlsx
结果已保存至: /home/ubuntu/graduation-project/data/cn-outcome_age/age_statistics_qwen2-0.5_v3.0.xlsx
已处理: llama3.2-3b_v3.0.xlsx
结果已保存至: /home/ubuntu/graduation-project/data/cn-outcome_age/age_statistics_llama3.2-3b_v3.0.xlsx
已处理: gemma2-9b_v3.0.xlsx
结果已保存至: /home/ubuntu/graduation-project/data/cn-outcome_age/age_statistics_gemma2-9b_v3.0.xlsx
已处理: yi1.5-6b_v3.0.xlsx
结果已保存至: /home/ubuntu/graduation-project/data/cn-outcome_age/age_statistics_yi1.5-6b_v3.0.xlsx
已处理: qwen2-7b_v3.0.xlsx
结果已保存至: /home/ubuntu/graduation-project/data/cn-outcome_age/age_statistics_qwen2-7b_v3.0.xlsx
已处理: llama3.2-1b_v3.0.xlsx
结果已保存至: /home/ubuntu/graduation-project/data/cn-outcome_age/age_statistics_llama3.2-1b_v3.0.xlsx
已处理: GLMA-9B_v3.0.xlsx
结果已保存至: /home/ubuntu/graduation-project/data/cn-outcome_age/age_statistics_GLMA-9B_v3.0.xlsx
已处理: qwen2-1.5b_v3.0.xlsx
结果已保存至: /h

In [4]:
import pandas as pd

# 读取 Excel 文件
file_path = "/home/ubuntu/graduation-project/data/cn-outcome_v3/gemma2-9b_v3.0.xlsx"
df = pd.read_excel(file_path)

# 创建年龄段区间
bins = list(range(0, 105, 5))  # 0-100，每5岁一个区间
labels = [f'{i+1}-{i+5}' for i in range(0, 100, 5)]  # 调整labels数量

# 将年龄分段，左开右闭
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

# 按 disease 分组
grouped = df.groupby('disease')

# 创建一个空的 DataFrame 来存储结果
result_df = pd.DataFrame()

# 统计每个分组的年龄段个数和百分比
for disease, group in grouped:
    age_group_counts = group['Age_Group'].value_counts().sort_index()
    total_count = len(group)
    age_group_percentages = (age_group_counts / total_count) * 100
    result = age_group_counts.astype(str) + ' (' + age_group_percentages.round(2).astype(str) + '%)'
    
    # 将结果添加到 result_df 中
    result_df[disease] = result

# 输出统计结果到新的 Excel 文件
output_path = "/home/ubuntu/graduation-project/data/statasticsgemma2-9b_age_group_statistics.xlsx"
# result_df.to_excel(output_path, index=True)

print(f"统计结果已保存至: {output_path}")

统计结果已保存至: /home/ubuntu/graduation-project/data/statasticsgemma2-9b_age_group_statistics.xlsx


In [6]:
import pandas as pd
import os
import glob

# 创建输出目录
output_dir = "/home/ubuntu/graduation-project/data/cn-outcome_age_v4"
os.makedirs(output_dir, exist_ok=True)

# 获取所有Excel文件
input_dir = "/home/ubuntu/graduation-project/data/cn-outcome_v3.2"
excel_files = glob.glob(os.path.join(input_dir, "*.xlsx"))

# 创建年龄段区间
bins = list(range(0, 105, 5))
labels = [f'{i+1}-{i+5}' for i in range(0, 100, 5)]

def process_age_column(df, column_name):
    # 将年龄分段
    df[f'{column_name}_Group'] = pd.cut(df[column_name], bins=bins, labels=labels, 
                                       right=True, include_lowest=True)
    
    # 按disease分组统计
    result_df = pd.DataFrame()
    for disease, group in df.groupby('disease'):
        age_counts = group[f'{column_name}_Group'].value_counts().sort_index()
        total_count = len(group)
        age_percentages = (age_counts / total_count) * 100
        result = age_counts.astype(str) + ' (' + age_percentages.round(2).astype(str) + '%)'
        result_df[disease] = result
    
    return result_df

# 处理每个文件
for file_path in excel_files:
    file_name = os.path.basename(file_path)
    base_name = os.path.splitext(file_name)[0]
    
    # 读取Excel文件
    df = pd.read_excel(file_path)
    
    # 处理Age列
    age_stats = process_age_column(df, 'Age')
    age_output = os.path.join(output_dir, f'{base_name}_age_statistics.xlsx')
    age_stats.to_excel(age_output, index=True)
    
    # 处理Age_Chinese列
    age_chinese_stats = process_age_column(df, 'Age_Chinese')
    age_chinese_output = os.path.join(output_dir, f'{base_name}_age_chinese_statistics.xlsx')
    age_chinese_stats.to_excel(age_chinese_output, index=True)
    
    print(f"已处理文件: {file_name}")
    print(f"Age统计结果保存至: {age_output}")
    print(f"Age_Chinese统计结果保存至: {age_chinese_output}")

已处理文件: yi1.5-6b_v3.2.xlsx
Age统计结果保存至: /home/ubuntu/graduation-project/data/cn-outcome_age_v4/yi1.5-6b_v3.2_age_statistics.xlsx
Age_Chinese统计结果保存至: /home/ubuntu/graduation-project/data/cn-outcome_age_v4/yi1.5-6b_v3.2_age_chinese_statistics.xlsx
已处理文件: gemma2-9b_v3.2.xlsx
Age统计结果保存至: /home/ubuntu/graduation-project/data/cn-outcome_age_v4/gemma2-9b_v3.2_age_statistics.xlsx
Age_Chinese统计结果保存至: /home/ubuntu/graduation-project/data/cn-outcome_age_v4/gemma2-9b_v3.2_age_chinese_statistics.xlsx
已处理文件: qwen2-7b_v3.2.xlsx
Age统计结果保存至: /home/ubuntu/graduation-project/data/cn-outcome_age_v4/qwen2-7b_v3.2_age_statistics.xlsx
Age_Chinese统计结果保存至: /home/ubuntu/graduation-project/data/cn-outcome_age_v4/qwen2-7b_v3.2_age_chinese_statistics.xlsx
已处理文件: qwen2-0.5_v3.2.xlsx
Age统计结果保存至: /home/ubuntu/graduation-project/data/cn-outcome_age_v4/qwen2-0.5_v3.2_age_statistics.xlsx
Age_Chinese统计结果保存至: /home/ubuntu/graduation-project/data/cn-outcome_age_v4/qwen2-0.5_v3.2_age_chinese_statistics.xlsx
已处理文件: GLMA-9B_v3.

gemma2-9b_v3.2_age_statistics.xlsx
GLMA-9B_v3.2_age_chinese_statistics.xlsx
llama3.2-1b_v3.2_age_statistics.xlsx
qwen2-1.5b_v3.2_age_statistics.xlsx
GLMA-9B_v3.2_age_statistics.xlsx
llama3.2-1b_v3.2_age_chinese_statistics.xlsx
qwen2-7b_v3.2_age_chinese_statistics.xlsx
qwen2-0.5_v3.2_age_statistics.xlsx
qwen2-7b_v3.2_age_statistics.xlsx
qwen2-1.5b_v3.2_age_chinese_statistics.xlsx
yi1.5-6b_v3.2_age_statistics.xlsx
gemma2-9b_v3.2_age_chinese_statistics.xlsx
llama3.2-3b_v3.2_age_statistics.xlsx
yi1.5-6b_v3.2_age_chinese_statistics.xlsx
llama3.1-8b_v3.2_age_chinese_statistics.xlsx
qwen2-0.5_v3.2_age_chinese_statistics.xlsx
llama3.1-8b_v3.2_age_statistics.xlsx
llama3.2-3b_v3.2_age_chinese_statistics.xlsx


In [None]:
import pandas as pd

# 读取CSV文件
df = pd.read_csv('age.csv')

# 显示数据
print(df)

# 保存数据到新的CSV文件
df.to_csv('saved_data.csv', index=False)

print("数据已保存到 saved_data.csv")