# WHO死亡率数据初步探索
## MSAI小组项目 - 第一部分

In [None]:
# 导入必要的库
import sys
sys.path.append('..')

from src.data_processing import WHODataProcessor
from src.statistical_analysis import ClassicalStatistics

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display defaults: show every DataFrame column, use seaborn's
# 'whitegrid' theme, and set the default figure size and font size.
pd.options.display.max_columns = None
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

## 1. 数据加载和处理

In [None]:
# Create the WHO data processor, pointing it at the raw Excel workbook.
# The path is relative, so it is resolved against the current working directory.
processor = WHODataProcessor('../data/raw/ghe2021_deaths_global_new2.xlsx')

# Load the raw data and print its shape.
# NOTE(review): the result exposes `.shape`, so it is presumably a pandas
# DataFrame — confirm in src.data_processing.
raw_data = processor.load_data()
print(f"原始数据形状: {raw_data.shape}")

In [None]:
# Build the processed table `df` that every later cell analyses.
# NOTE(review): process_data() takes no arguments here, so it presumably works
# on the data read by load_data() — confirm in src.data_processing.
df = processor.process_data()
# Last expression in the cell, so the notebook renders the first 10 rows.
df.head(10)

## 2. 数据概览

In [None]:
# Dataset overview: row count, number of distinct causes, and the age-group
# labels present, followed by the dtype of every column.
print(
    "数据集信息:",
    f"总记录数: {len(df)}",
    f"死因数量: {df['cause_name'].nunique()}",
    f"年龄组: {df['age_group'].unique()}",
    f"\n数据类型:",
    sep="\n",
)
print(df.dtypes)

In [None]:
# Count the missing entries in each column of the processed table.
print("缺失值统计:")
missing_per_column = df.isna().sum()
print(missing_per_column)

## 3. 描述性统计

In [None]:
# Headline totals taken from the processor's summary mapping: overall deaths,
# deaths by sex, and the male-to-female ratio (two decimal places).
summary = processor.get_summary_stats()

print(
    f"总死亡人数: {summary['total_deaths']:,.0f}",
    f"男性死亡: {summary['male_deaths']:,.0f}",
    f"女性死亡: {summary['female_deaths']:,.0f}",
    f"\n男女比例: {summary['male_deaths']/summary['female_deaths']:.2f}",
    sep="\n",
)

## 4. 可视化分析

In [None]:
# Top 10 causes of death by total deaths (both sexes), largest first.
top_causes = df.groupby('cause_name')['both_sexes'].sum().nlargest(10)

plt.figure(figsize=(12, 6))
# A horizontal bar chart draws the first row at the bottom of the axis, so
# plot the rows in reverse order to put the leading cause at the top.
# `top_causes` itself stays sorted largest-first for any later use.
top_causes.iloc[::-1].plot(kind='barh')
plt.title('Top 10 Leading Causes of Death (2021)', fontsize=14)
# NOTE(review): the plotted values are the raw `both_sexes` sums with no
# rescaling in this cell — confirm the column really is in millions.
plt.xlabel('Number of Deaths (millions)')
plt.tight_layout()
plt.show()

In [None]:
# Total deaths (both sexes) per age group, drawn as a vertical bar chart.
# groupby() sorts its keys, so the bars follow the sorted order of the labels.
# NOTE(review): if `age_group` holds plain strings that order is lexicographic,
# not chronological — confirm the column's dtype.
age_dist = df.groupby('age_group')['both_sexes'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
age_dist.plot.bar(ax=ax, color='steelblue')
ax.set_title('Deaths by Age Group', fontsize=14)
ax.set_xlabel('Age Group')
ax.set_ylabel('Number of Deaths')
# Tilt the tick labels so neighbouring age-group names do not overlap.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Mean male/female death ratio for every (cause, age group) combination.
gender_ratio = df.pivot_table(
    values='male_female_ratio',
    index='cause_name',
    columns='age_group',
    aggfunc='mean'
)

# The 15 causes with the most deaths (both sexes), largest first.
top_15_causes = df.groupby('cause_name')['both_sexes'].sum().nlargest(15).index
# pivot_table() leaves out a cause whose ratio is missing in every age group,
# and .loc[] raises KeyError for labels that are absent. reindex() keeps the
# same row order and shows such a cause as an empty (NaN) row instead.
gender_ratio_top = gender_ratio.reindex(top_15_causes)

plt.figure(figsize=(12, 8))
# Diverging palette centred on 1, i.e. equal male and female deaths.
sns.heatmap(gender_ratio_top, annot=True, fmt='.2f', cmap='RdBu_r', center=1)
plt.title('Male/Female Death Ratio by Cause and Age Group', fontsize=14)
plt.tight_layout()
plt.show()

## 5. 统计检验

In [None]:
# Run the project's statistical tests on the processed table.
# The summary cell reads results['gender_ttest'], results['age_anova'] and
# results['chi_square'], so `results` is expected to be a mapping with those keys.
stats_analyzer = ClassicalStatistics(df)
results = stats_analyzer.run_all_tests()

## 6. 结果总结

In [None]:
# One row per test: (display label, key in `results`, key of its statistic).
test_specs = (
    ('Gender T-Test', 'gender_ttest', 't_statistic'),
    ('Age Group ANOVA', 'age_anova', 'f_statistic'),
    ('Chi-Square Independence', 'chi_square', 'chi2_statistic'),
)

# Collect each test's statistic, p-value and significance flag into one table.
summary_table = pd.DataFrame([
    {
        'Test': label,
        'Statistic': results[result_key][stat_key],
        'P-Value': results[result_key]['p_value'],
        'Significant': results[result_key]['significant'],
    }
    for label, result_key, stat_key in test_specs
])

# Print the table without the integer row index.
print("\n统计检验结果总结:")
print(summary_table.to_string(index=False))