In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
female_data = pd.read_excel(r'女胎.xlsx')
female_data.columns

Index(['序号', '孕妇代码', '年龄', '身高', '体重', '末次月经', 'IVF妊娠', '检测日期', '检测抽血次数',
       '检测孕周', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', '唯一比对的读段数', 'GC含量',
       '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'Unnamed: 20',
       'Unnamed: 21', 'X染色体浓度', '13号染色体的GC含量', '18号染色体的GC含量', '21号染色体的GC含量',
       '被过滤掉读段数的比例', '染色体的非整倍体', '怀孕次数', '生产次数', '胎儿是否健康'],
      dtype='object')

In [4]:
female_data['染色体的非整倍体'] = female_data['染色体的非整倍体'].apply(lambda x: 1 if pd.notna(x) and x != '' else 0)

In [5]:
female_data['孕妇BMI'] = pd.to_numeric(female_data['孕妇BMI'], errors='coerce')

# 方法 1：使用最频繁的日期填补
mean_value = female_data['孕妇BMI'].mean()  # 找到最常见的日期
female_data['孕妇BMI'] = female_data['孕妇BMI'].fillna(mean_value)
print(female_data['孕妇BMI'].isnull().sum())

0


In [6]:
bins = [25, 30, 35, 40] 
labels = ['[25,30)', '[30,35)', '[35,40)']  # 标签
female_data['BMI_group'] = pd.cut(female_data['孕妇BMI'], bins=bins, labels=labels, right=False)

In [21]:
from scipy.stats import spearmanr
for group in labels:
    group_data = female_data[female_data['BMI_group'] == group]  # 按照BMI组筛选数据
    spearman_corr_group = group_data[['年龄', '身高', '体重',
       '检测孕周', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', '唯一比对的读段数', 'GC含量',
       '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'X染色体浓度', '13号染色体的GC含量', '18号染色体的GC含量', '21号染色体的GC含量',
       '被过滤掉读段数的比例']].apply(
        lambda x: spearmanr(x, group_data['染色体的非整倍体'])[0], axis=0)  # 计算相关性
    print(f"\nBMI组 {group} 的相关性：")
    print(spearman_corr_group)


BMI组 [25,30) 的相关性：
年龄              0.050858
身高              0.080063
体重             -0.010194
检测孕周            0.225415
孕妇BMI          -0.150456
原始读段数          -0.075212
在参考基因组上比对的比例   -0.055506
重复读段的比例         0.052878
唯一比对的读段数       -0.080467
GC含量            0.042368
13号染色体的Z值      -0.047623
18号染色体的Z值       0.027260
21号染色体的Z值       0.094918
X染色体的Z值        -0.067330
X染色体浓度         -0.290010
13号染色体的GC含量    -0.121193
18号染色体的GC含量    -0.126448
21号染色体的GC含量    -0.170459
被过滤掉读段数的比例      0.054192
dtype: float64

BMI组 [30,35) 的相关性：
年龄             -0.078312
身高             -0.094401
体重             -0.108342
检测孕周            0.020069
孕妇BMI          -0.044871
原始读段数          -0.025446
在参考基因组上比对的比例    0.024662
重复读段的比例        -0.036138
唯一比对的读段数       -0.016323
GC含量           -0.029937
13号染色体的Z值      -0.035140
18号染色体的Z值       0.041199
21号染色体的Z值       0.055027
X染色体的Z值        -0.068570
X染色体浓度         -0.284258
13号染色体的GC含量     0.080972
18号染色体的GC含量     0.075912
21号染色体的GC含量     0.023522
被过滤掉读段数的比例     -0.05

In [22]:
feature_columns = ['年龄', '身高', '体重',
       '检测孕周', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', '唯一比对的读段数', 'GC含量',
       '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'X染色体浓度', '13号染色体的GC含量', '18号染色体的GC含量', '21号染色体的GC含量',
       '被过滤掉读段数的比例']

In [114]:
feature_columns_1 = ['身高', '体重',
       '检测孕周', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', '唯一比对的读段数', 'GC含量',
       '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'X染色体浓度', '13号染色体的GC含量', '18号染色体的GC含量', '21号染色体的GC含量',
       '被过滤掉读段数的比例', '染色体的非整倍体']

In [23]:
bmi_group_25_30 = female_data[(female_data['孕妇BMI'] >= 25) & (female_data['孕妇BMI'] < 30)]

# 获取特征列和目标变量
X = bmi_group_25_30[['年龄', '身高', '体重',
       '检测孕周', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', '唯一比对的读段数', 'GC含量',
       '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'X染色体浓度', '13号染色体的GC含量', '18号染色体的GC含量', '21号染色体的GC含量',
       '被过滤掉读段数的比例']]  # 特征列
Y = bmi_group_25_30['染色体的非整倍体']  # 目标变量，替换为实际的目标列名

# 计算Spearman相关性系数和p-value
for feature in feature_columns:
    X = bmi_group_25_30[feature]  # 单独选择特征列
    corr, p_val = spearmanr(X, Y)  # 计算相关性和p-value
    print(f"Feature: {feature}")
    print(f"Spearman Correlation: {corr}")
    print(f"p-value: {p_val}")
    print("\n")

Feature: 年龄
Spearman Correlation: 0.050857518701418455
p-value: 0.5506772203934521


Feature: 身高
Spearman Correlation: 0.08006347234538112
p-value: 0.3470425671849714


Feature: 体重
Spearman Correlation: -0.010193566688149568
p-value: 0.9048525689379189


Feature: 检测孕周
Spearman Correlation: 0.22541497571596164
p-value: 0.0074109950806751914


Feature: 孕妇BMI
Spearman Correlation: -0.15045554173713438
p-value: 0.0760022641825039


Feature: 原始读段数
Spearman Correlation: -0.07521214168771453
p-value: 0.3771307569384421


Feature: 在参考基因组上比对的比例
Spearman Correlation: -0.05550590369093343
p-value: 0.5148083023282839


Feature: 重复读段的比例
Spearman Correlation: 0.052878405291362625
p-value: 0.5349347925002548


Feature: 唯一比对的读段数
Spearman Correlation: -0.08046713848685616
p-value: 0.3446090245773573


Feature: GC含量
Spearman Correlation: 0.042368411693079366
p-value: 0.6191618204927657


Feature: 13号染色体的Z值
Spearman Correlation: -0.04762340849222099
p-value: 0.5763302865555262


Feature: 18号染色体的Z值
Spearm

In [24]:
bmi_group_30_35 = female_data[(female_data['孕妇BMI'] >= 30) & (female_data['孕妇BMI'] < 35)]

# 获取特征列和目标变量
X = bmi_group_30_35[['年龄', '身高', '体重',
       '检测孕周', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', '唯一比对的读段数', 'GC含量',
       '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'X染色体浓度', '13号染色体的GC含量', '18号染色体的GC含量', '21号染色体的GC含量',
       '被过滤掉读段数的比例']]  # 特征列
Y = bmi_group_30_35['染色体的非整倍体']  # 目标变量，替换为实际的目标列名

# 计算Spearman相关性系数和p-value
for feature in feature_columns:
    X = bmi_group_30_35[feature]  # 单独选择特征列
    corr, p_val = spearmanr(X, Y)  # 计算相关性和p-value
    print(f"Feature: {feature}")
    print(f"Spearman Correlation: {corr}")
    print(f"p-value: {p_val}")
    print("\n")

Feature: 年龄
Spearman Correlation: -0.07831233895686271
p-value: 0.1300829854203541


Feature: 身高
Spearman Correlation: -0.09440104663029356
p-value: 0.0678434475340505


Feature: 体重
Spearman Correlation: -0.10834224412174044
p-value: 0.03597488405300632


Feature: 检测孕周
Spearman Correlation: 0.02006926455394677
p-value: 0.6984740197908771


Feature: 孕妇BMI
Spearman Correlation: -0.044871182259074716
p-value: 0.3862366782972557


Feature: 原始读段数
Spearman Correlation: -0.025446390208012582
p-value: 0.6232838017121657


Feature: 在参考基因组上比对的比例
Spearman Correlation: 0.024662327764628438
p-value: 0.6340302456928086


Feature: 重复读段的比例
Spearman Correlation: -0.03613815079961451
p-value: 0.48536237682430683


Feature: 唯一比对的读段数
Spearman Correlation: -0.016322754503178938
p-value: 0.7527202560488465


Feature: GC含量
Spearman Correlation: -0.029936929656485395
p-value: 0.5633179245102726


Feature: 13号染色体的Z值
Spearman Correlation: -0.03514025314439833
p-value: 0.49750172134251125


Feature: 18号染色体的Z值
Sp

In [25]:
bmi_group_35_40 = female_data[(female_data['孕妇BMI'] >= 35) & (female_data['孕妇BMI'] < 40)]

# 获取特征列和目标变量
X = bmi_group_35_40[['年龄', '身高', '体重',
       '检测孕周', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', '唯一比对的读段数', 'GC含量',
       '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'X染色体浓度', '13号染色体的GC含量', '18号染色体的GC含量', '21号染色体的GC含量',
       '被过滤掉读段数的比例']]  # 特征列
Y = bmi_group_35_40['染色体的非整倍体']  # 目标变量，替换为实际的目标列名

# 计算Spearman相关性系数和p-value
for feature in feature_columns:
    X = bmi_group_35_40[feature]  # 单独选择特征列
    corr, p_val = spearmanr(X, Y)  # 计算相关性和p-value
    print(f"Feature: {feature}")
    print(f"Spearman Correlation: {corr}")
    print(f"p-value: {p_val}")
    print("\n")

Feature: 年龄
Spearman Correlation: -0.023905621321472115
p-value: 0.8365024116005199


Feature: 身高
Spearman Correlation: -0.005314031143503216
p-value: 0.9634154270226263


Feature: 体重
Spearman Correlation: -0.034299177476656305
p-value: 0.7671244200368815


Feature: 检测孕周
Spearman Correlation: -0.09748700703540134
p-value: 0.39896862836982405


Feature: 孕妇BMI
Spearman Correlation: -0.07640420971762293
p-value: 0.5089685704040666


Feature: 原始读段数
Spearman Correlation: 0.21063720666668628
p-value: 0.06594596188385642


Feature: 在参考基因组上比对的比例
Spearman Correlation: -0.06319116200000588
p-value: 0.5850819402884173


Feature: 重复读段的比例
Spearman Correlation: -0.1974723812500184
p-value: 0.08516623779843062


Feature: 唯一比对的读段数
Spearman Correlation: 0.1974723812500184
p-value: 0.08516623779843062


Feature: GC含量
Spearman Correlation: 0.10268563825000956
p-value: 0.37417671836257826


Feature: 13号染色体的Z值
Spearman Correlation: -0.1369141843333461
p-value: 0.23508274301013904


Feature: 18号染色体的Z值
Spear

In [26]:
significant_feature = ['孕妇BMI', '检测孕周', '体重', '重复读段的比例', 'GC含量', '唯一比对的读段数', '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值']

In [27]:
significant_feature_1 = ['孕妇BMI', '检测孕周', '体重', '重复读段的比例', 'GC含量', '唯一比对的读段数', '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', '染色体的非整倍体']

In [28]:
spearman_corr, p_values = spearmanr(female_data[significant_feature], female_data['染色体的非整倍体'])

In [29]:
spearman_corr_with_target = spearman_corr[:, -1]  # 获取与目标变量的 Spearman 相关系数
p_values_with_target = p_values[:, -1]  # 获取与目标变量的 p 值

In [30]:
corr_df = pd.DataFrame({
    'Feature': significant_feature_1,
    'Spearman Correlation': spearman_corr[0, :],
    'p-value': p_values[0, :].round(5)
})



In [31]:
# 计算显著性分数 N_sigi，p 值小于 0.05 的特征
corr_df['N_sigi'] = (corr_df['p-value'] < 0.05).astype(int)

# 计算高相关性分数 N_high，相关系数的绝对值大于 0.15 的特征
corr_df['N_high'] = (np.abs(corr_df['Spearman Correlation']) >= 0.15).astype(int)

# 综合得分 S_i = |r_i| + 0.3 * N_sigi + 0.2 * N_high
corr_df['S_i'] = np.abs(corr_df['Spearman Correlation']) + 0.3 * corr_df['N_sigi'] + 0.2 * corr_df['N_high']

# 排序并显示特征及其得分
sorted_features = corr_df.sort_values(by='S_i', ascending=False)
print(sorted_features[['Feature', 'Spearman Correlation', 'p-value', 'N_sigi', 'N_high', 'S_i']])

     Feature  Spearman Correlation  p-value  N_sigi  N_high       S_i
0      孕妇BMI              1.000000  0.00000       1       1  1.500000
2         体重              0.769966  0.00000       1       1  1.269966
1       检测孕周              0.229999  0.00000       1       1  0.729999
7  18号染色体的Z值              0.155943  0.00012       1       1  0.655943
5   唯一比对的读段数             -0.115136  0.00457       1       0  0.415136
8  21号染色体的Z值              0.095258  0.01910       1       0  0.395258
4       GC含量             -0.089434  0.02783       1       0  0.389434
6  13号染色体的Z值              0.074575  0.06679       0       0  0.074575
3    重复读段的比例              0.059134  0.14629       0       0  0.059134
9   染色体的非整倍体             -0.034411  0.39817       0       0  0.034411
