## 统计分析

通过指定需要统计分析的字段，得到每个特征的p_value，所有的p_value都是基于T检验（t-test）计算的。支持通过`group`指定不同的分组，例如按train、val、test等分组分别统计。

对于两大类不同的特征，统计方式如下：

1. 离散特征，统计数量以及占比。
2. 连续特征，统计均值、标准差。

In [1]:
import pandas as pd
import os
from onekey_algo import OnekeyDS as okds

# Clinical records table.
clinic_data = pd.read_csv(r'clinic.csv')

# Label table: IDs get the image-file suffix and lose any '-m' / '-f' marker
# so that they can be joined with the other tables on `ID`.
mydir = r'D:\20220421-FANBIN'
labelf = os.path.join(mydir, 'label.csv')
label_data = pd.read_csv(labelf)
# Series.map is used instead of DataFrame.applymap, which is deprecated since
# pandas 2.1; Series.map behaves the same on every pandas version.
label_data['ID'] = label_data['ID'].map(
    lambda x: f"{x}.nii.gz".replace('-m', '').replace('-f', ''))

# Grade string -> integer class index. 'V' and 'VI' share class 5.
# NOTE(review): presumably the two grades are merged on purpose -- confirm.
class_mapping = {'I': 0, "IIA": 1, "IIB": 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 5}
# Indexing the dict (rather than passing it to map) keeps the KeyError on an
# unexpected grade instead of silently producing NaN.
label_data['label'] = label_data['label'].map(lambda x: class_mapping[x])

# Model predictions for both cohorts, tagged with their cohort name.
train_data = pd.read_csv('results/RandomForest_train.csv')
train_data['group'] = 'train'
test_data = pd.read_csv('results/RandomForest_test.csv')
test_data['group'] = 'test'
data = pd.concat([train_data, test_data], axis=0)
data['ID'] = data['ID'].map(lambda x: x.replace('-m', '').replace('-f', ''))

# The train/test assignment is taken from the prediction files here; in a real
# project define the split yourself.
clinic_data['ID'] = clinic_data['ID'].map(lambda x: f"{x}.nii.gz")
# Keep only the first clinical record per ID.
clinic_data = clinic_data.drop_duplicates(subset=['ID'], keep='first')
# Inner join: only IDs present in both the clinical table and the predictions survive.
clinic_data = pd.merge(clinic_data, data, on='ID', how='inner')

combined_data = pd.merge(label_data, clinic_data, on='ID', how='inner')
print(combined_data.columns)
combined_data

Index(['ID', 'label', 'Gender', 'Age', 'label-0', 'label-1', 'label-2',
       'label-3', 'label-4', 'label-5', 'group'],
      dtype='object')


Unnamed: 0,ID,label,Gender,Age,label-0,label-1,label-2,label-3,label-4,label-5,group
0,17026055.nii.gz,1,M,56,0.1,0.9,0.0,0.0,0.0,0.0,train
1,17098412.nii.gz,5,F,67,0.2,0.0,0.0,0.0,0.0,0.8,test
2,17113727.nii.gz,3,F,56,1.0,0.0,0.0,0.0,0.0,0.0,train
3,17141660.nii.gz,1,F,52,0.7,0.0,0.2,0.0,0.0,0.1,train
4,18001036.nii.gz,0,M,65,0.1,0.8,0.1,0.0,0.0,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...
115,yuqingyan.nii.gz,5,F,55,0.0,0.6,0.2,0.1,0.0,0.1,train
116,zhanghuachen.nii.gz,0,M,30,1.0,0.0,0.0,0.0,0.0,0.0,train
117,zhanghuayi.nii.gz,0,M,48,0.3,0.0,0.7,0.0,0.0,0.0,train
118,zhangyong.nii.gz,5,M,45,0.0,0.0,0.1,0.0,0.0,0.9,train


In [2]:
from onekey_algo.custom.utils import map2numerical

# Encode the `Gender` column as numbers directly inside combined_data.
# NOTE(review): `info` presumably holds the category -> number mapping; confirm.
info = map2numerical(
    combined_data,
    mapping_columns=['Gender'],
    inplace=True,
)
combined_data

Unnamed: 0,ID,label,Gender,Age,label-0,label-1,label-2,label-3,label-4,label-5,group
0,17026055.nii.gz,1,1,56,0.1,0.9,0.0,0.0,0.0,0.0,train
1,17098412.nii.gz,5,0,67,0.2,0.0,0.0,0.0,0.0,0.8,test
2,17113727.nii.gz,3,0,56,1.0,0.0,0.0,0.0,0.0,0.0,train
3,17141660.nii.gz,1,0,52,0.7,0.0,0.2,0.0,0.0,0.1,train
4,18001036.nii.gz,0,1,65,0.1,0.8,0.1,0.0,0.0,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...
115,yuqingyan.nii.gz,5,0,55,0.0,0.6,0.2,0.1,0.0,0.1,train
116,zhanghuachen.nii.gz,0,1,30,1.0,0.0,0.0,0.0,0.0,0.0,train
117,zhanghuayi.nii.gz,0,1,48,0.3,0.0,0.7,0.0,0.0,0.0,train
118,zhangyong.nii.gz,5,1,45,0.0,0.0,0.1,0.0,0.0,0.9,train


In [3]:
# IDs that have a prediction but did not survive the inner join with the
# clinical table (clinic_data was already merged with `data` earlier).
set(data['ID']).difference(clinic_data['ID'])

{'00176749.nii.gz',
 '201803010095.nii.gz',
 'chuping-R.nii.gz',
 'liganghua.nii.gz'}

### 输出格式
支持两种输出格式，分别对应`pretty`参数的`True`和`False`：当为`True`时，输出表格（DataFrame）；反之则输出dict数据。

```python
def clinic_stats(data: DataFrame, stats_columns: Union[str, List[str]], label_column: str = 'label',
                 group_column: str = None, continuous_columns: Union[str, List[str]] = None,
                 pretty: bool = True) -> Union[dict, DataFrame]:
    """Compute per-feature statistics and p-values, broken down by label.

    Args:
        data: Input data.
        stats_columns: Name(s) of the columns to compute statistics for.
        label_column: Label column for binary classification, defaults to `label`.
            NOTE(review): the examples in this notebook pass a label with more
            than two classes -- confirm the binary restriction.
        group_column: Column used to split the statistics into groups, e.g.
            to separate the training, test and validation groups.
        continuous_columns: Which columns are continuous variables; for these
            the mean and spread are reported (the dict output uses the keys
            `mean` and `std`).
        pretty: bool, whether to format the result for presentation.

    Returns:
        The statistics as a DataFrame or as a dict (JSON-like); this notebook
        describes `pretty=True` as the table form and `pretty=False` as the
        dict form.

    """
```

In [8]:
from onekey_algo.custom.components.stats import clinic_stats
# Statistics over the whole dataset (no group split), returned as a table.
stats = clinic_stats(
    combined_data,
    stats_columns=['Age', 'Gender'],
    label_column='label',
    group_column=None,
    continuous_columns=['Age'],
    pretty=True,
)
# Save the table as GBK-encoded CSV, with header row and without the index.
stats.to_csv('stats.csv', header=True, index=False, encoding='gbk')
stats

Unnamed: 0,feature_name,-label=ALL,-label=0,-label=1,-label=2,-label=3,-label=4,-label=5,pvalue
0,Age,49.1083±11.4435,48.1224±11.7697,50.0000±9.2482,52.2500±14.0280,52.5714±2.2254,40.7778±13.8002,51.1034±11.3148,0.623909
1,Gender,,,,,,,,0.277561
2,0,52(0.4333),16(0.3265),10(0.5556),3(0.3750),5(0.7143),5(0.5556),13(0.4483),
3,1,68(0.5667),33(0.6735),8(0.4444),5(0.6250),2(0.2857),4(0.4444),16(0.5517),


In [5]:
# Same statistics, split by the `group` column and returned as a raw dict.
clinic_stats(
    combined_data,
    stats_columns=['Age', 'Gender'],
    label_column='label',
    group_column='group',
    continuous_columns=['Age'],
    pretty=False,
)

{'Age': {'test': {'mean': 48.96774193548387,
   'std': 11.847598549826436,
   '__pvalue__': 0.22406284497183407,
   'mean | label=5': 49.666666666666664,
   'std | label=5': 11.413442367080435,
   'mean | label=1': 53.0,
   'std | label=1': 10.64581294844754,
   'mean | label=0': 50.86666666666667,
   'std | label=0': 10.50759362601751,
   'mean | label=4': 29.666666666666668,
   'std | label=4': 14.224392195567912,
   'mean | label=3': 52.0,
   'std | label=3': 1.0},
  'train': {'mean': 49.157303370786515,
   'std': 11.367716325794662,
   '__pvalue__': 0.1882081098007302,
   'mean | label=1': 49.142857142857146,
   'std | label=1': 9.062663412698727,
   'mean | label=3': 53.0,
   'std | label=3': 2.943920288775949,
   'mean | label=0': 46.911764705882355,
   'std | label=0': 12.235981829706123,
   'mean | label=4': 46.333333333333336,
   'std | label=4': 10.614455552060438,
   'mean | label=2': 52.25,
   'std | label=2': 14.02803315813426,
   'mean | label=5': 51.47826086956522,
   's