In [181]:
import os
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import BayesianRidge

warnings.filterwarnings('ignore')

import pandas as pd


In [215]:
# Directory holding the per-cohort CSV exports. Kept in one constant so the
# location only has to be edited in a single place.
DATA_DIR = r"C:\Users\user\Documents\RCC_Project\kidneytumor_project\Preprocessing\dataset"

# One CSV export per cohort, in the same order as `dataset_labels` below.
file_names = [
    "AML_output_3.csv",
    "ccRCC_output_3.csv",
    "chRCC_output_3.csv",
    "pRCC_output_3.csv",
    "RO_output_3.csv",
]
file_paths = [os.path.join(DATA_DIR, name) for name in file_names]

# Label attached to every row of the file at the same position in `file_paths`
# (AML and RO -> 'benign', chRCC and pRCC -> 'nccRCC').
dataset_labels = ['benign', 'ccRCC', 'nccRCC', 'nccRCC', 'benign']

# The CSVs are deliberately NOT read here: the following cell rebuilds
# `dataframes` from scratch while attaching the labels, so a read in this cell
# would parse every file a second time only to be thrown away.

In [212]:
# Load every cohort CSV and tag each of its rows with the cohort label.
if len(file_paths) != len(dataset_labels):
    # zip() stops at the shorter input, which would silently drop a cohort.
    raise ValueError(
        f"{len(file_paths)} files but {len(dataset_labels)} labels; they must pair up one-to-one"
    )

dataframes = []
for file, label in zip(file_paths, dataset_labels):
    # index_col=0 keeps the exported row-index column out of the data. Read as
    # an ordinary column it would survive the common-column filter, be
    # auto-detected as numeric and end up scaled like a lab value.
    # NOTE(review): assumes the first column of every export is a saved row
    # index - confirm against the CSV files.
    df = pd.read_csv(file, index_col=0)
    df['Dataset_Type'] = label
    dataframes.append(df)

In [218]:
# Column labels that occur in every loaded frame.
reference_columns = dataframes[0].columns
common_columns = set(reference_columns).intersection(
    *(set(frame.columns) for frame in dataframes[1:])
)

# A set has no order, so take the column order from the first frame.
ordered_common_columns = [name for name in reference_columns if name in common_columns]

# Independent copies restricted to the shared columns, identically ordered.
filtered_dataframes = []
for frame in dataframes:
    filtered_dataframes.append(frame[ordered_common_columns].copy())

In [219]:
# Column-name synonyms: key = variant column label, value = label to use instead.
# The mapping is passed to DataFrame.rename(columns=...), so only exact,
# whole-label matches are renamed. Keys that differ only in full-width vs ASCII
# parentheses or in spacing are therefore distinct entries - do not "normalise"
# them.
#
# Several keys share one target (e.g. the two syphilis entries, the CD4/CD8/CD3
# variants, the two IFN-gamma spellings). If more than one of those keys is
# present in the same frame, the rename produces duplicate column labels.
synonym_dict = {
    '丙肝抗体(C)': '丙肝抗体',
    '乙肝E抗体': '乙肝e抗体(YP)',
    '乙肝E抗原': '乙肝e抗原(YP)',
    '乙肝核心抗体': '乙肝核心抗体(YP)',
    '乙肝表面抗体': '乙肝表面抗体(YP)',
    '乙肝表面抗原': '乙肝表面抗原(YP)',
    '梅毒螺旋体抗体': '梅毒确诊试验',
    '梅毒甲苯胺红不加热血清试验(TRUST)': '梅毒确诊试验',  # NOTE(review): two differently named syphilis tests collapse into one label - confirm intended
    'Th淋巴细胞(CD3+CD4+)': 'Th淋巴细胞CD4',
    'Th淋巴细胞（CD3+CD4+）': 'Th淋巴细胞CD4',
    'Ts淋巴细胞  CD8': 'Ts淋巴细胞CD8',  # key contains two consecutive spaces
    'Ts淋巴细胞(CD3+CD8+)': 'Ts淋巴细胞CD8',
    'CD8+CD38+': 'Ts淋巴细胞CD8',  # NOTE(review): key names a CD8+CD38+ population, target is the CD8 column - confirm same measurement
    'T淋巴细胞（CD3+）': 'T淋巴细胞(CD3+)',
    'T淋巴细胞CD3': 'T淋巴细胞(CD3+)',
    'B淋巴细胞（CD3-CD19+）': 'B淋巴细胞(CD3-CD19+)',
    'B淋巴细胞CD19': 'B淋巴细胞(CD3-CD19+)',
    '淋巴细胞绝对值（CD45+）': '淋巴细胞绝对值',
    '淋巴细胞绝对值（CD45）': '淋巴细胞绝对值',
    '自然杀伤细胞(CD3-CD16+CD56+)': '自然杀伤细胞CD56+CD16',
    'CD3+HLA-DR+': 'CD3+HLA-DR+/CD3+(%)',
    'CD8+HLA-DR+': 'CD8+HLA-DR+/CD8+(%)',
    'INF-r': 'γ干扰素',  # presumably a misspelling of IFN-γ seen in some exports - verify
    'IFN-γ': 'γ干扰素',
    'IFN-α': 'α干扰素',
    'IL-6': '白细胞介素-6',
    '白介素-6(IL-6)': '白细胞介素-6',
    'IL-12P70': '白细胞介素-12p70',
    'IL-1β': '白细胞介素-1β',
    'IL-5': '白细胞介素-5',
    'IL-8': '白细胞介素-8',
    'C反应蛋白': 'C-反应蛋白',
    '肌酸激酶同工酶(质量法)': '肌酸激酶同工酶',
    '肌酸激酶': '肌酸激酶(CK)',
    '真菌': '酵母菌',
    '尿沉渣上皮细胞': '尿上皮细胞计数',
    '尿沉渣白细胞': '尿沉渣白细胞计数',
    '尿沉渣红细胞': '尿沉渣红细胞计数',
    '病理管型': '病理性管型',
    '白细胞': '镜检白细胞',
    '管型': '镜检管型',
    '红细胞': '镜检红细胞',
    '总蛋白': '总蛋白(TP)',
    '白球比例': '白球比例(A:G)',
    'eGFR-EPIcysc': '胱抑素C(CysC)',  # NOTE(review): key looks like an eGFR value, target is the cystatin C column - confirm
    '尿白蛋白肌酐比': 'ACR比值',
    '尿素氮': '尿素氮(BUN)',
    '葡萄糖(GLU)': '空腹血糖(GLU)',
    '血小板最大聚集率': '血小板最大聚集率(AA)',
    '血小板粘附率': '血小板粘附率(AA)',
    '异常红细胞形态检测': '异常红细胞形态检测(AA)',
    '异常血小板形态检测': '异常血小板形态检测(AA)',
    '血小板计数初始值': '血小板计数初始值(AA)',
    '红细胞平均体积初始': '红细胞计数初始值(AA)',  # NOTE(review): key says mean volume (平均体积) but target says count (计数) - confirm
    '红细胞计数初始值': '红细胞计数初始值(ADP)',  # NOTE(review): suffix is (ADP) here while the sibling entries use (AA) - confirm
}

In [220]:
# Rename variant column labels to their canonical form in every frame.
# NOTE(review): the frames were already cut down to identically named shared
# columns before this rename, so it cannot recover a test that appears under
# different names in different files - confirm this ordering is intended.
standardized_dfs = []
for frame in filtered_dataframes:
    standardized_dfs.append(frame.rename(columns=synonym_dict))

In [221]:
# Where renaming produced repeated column labels, keep only the first
# occurrence of each label. The list is updated in place.
standardized_dfs[:] = [
    frame.loc[:, ~frame.columns.duplicated()] for frame in standardized_dfs
]

In [222]:
# Stack all cohorts into one frame with a fresh 0..n-1 row index.
merged_df = pd.concat(standardized_dfs, ignore_index=True)

# Drop every column whose fraction of missing values exceeds the threshold.
missing_threshold = 0.25
missing_fraction = merged_df.isnull().mean()

# .copy() makes `filtered_df` an independent frame. Later cells modify it in
# place (replace(inplace=True), column assignment); doing that on a bare .loc
# selection is the chained-assignment pattern pandas warns about, and the
# warning is currently hidden by the global warnings filter.
filtered_df = merged_df.loc[:, missing_fraction <= missing_threshold].copy()

In [189]:
# Snapshot of the frame as it stands at this point, written to the current
# working directory. utf-8-sig adds a BOM to the file.
filtered_df.to_csv(
    'df_before_robust_clean_value.csv',
    index=False,
    encoding='utf-8-sig',
)

In [223]:
import pandas as pd
import numpy as np
import re

# ------------------- STEP 1: Basic Symbol Cleaning -------------------
# Substring (regex) substitutions: strip the '<' / '>' comparison signs and
# spell out the '╋' symbol as 阳性.
symbol_substitutions = {'<': '', '>': '', '╋': '阳性'}
filtered_df.replace(symbol_substitutions, regex=True, inplace=True)

# Whole-cell substitutions: '未查见' and empty strings become the number 0.
# NOTE(review): this applies to every column, so an empty cell in a text
# column also becomes 0 - confirm that is wanted.
whole_cell_substitutions = {'未查见': 0, '': 0}
filtered_df.replace(whole_cell_substitutions, inplace=True)

# ------------------- STEP 2: Auto-Detect Column Types -------------------

# Check numeric percentage in columns
def is_numeric(val):
    """Return True if ``str(val)``, stripped of surrounding whitespace, parses as a float.

    Missing values count as numeric: a float NaN stringifies to ``'nan'``,
    which ``float()`` accepts.
    """
    try:
        float(str(val).strip())
    except ValueError:
        # Only a failed parse means "not numeric". A bare ``except`` would also
        # swallow KeyboardInterrupt / SystemExit and hide unrelated errors.
        return False
    return True

# A column is treated as numeric when more than 80% of its cells parse as a
# float; every other column is treated as categorical.
numeric_cols = []
categorical_cols = []
for column_name in filtered_df.columns:
    numeric_share = filtered_df[column_name].apply(is_numeric).mean()
    if numeric_share > 0.8:
        numeric_cols.append(column_name)
    else:
        categorical_cols.append(column_name)

# ------------------- STEP 3: Clean Numeric Columns -------------------
def clean_numeric_value(x):
    """Coerce one raw cell of a mostly-numeric column to a float.

    Rules, applied in order:

    1. Missing input (``None``, ``pd.NA``, float NaN) stays missing and is
       returned as ``np.nan`` instead of being turned into 0.0.
    2. Text that already parses as a float is returned as that value, so the
       sign and exponent survive ("-1.5" -> -1.5, "1e-3" -> 0.001). A
       non-finite result (nan / inf) is returned as ``np.nan``.
    3. An integer range "a-b" is reduced to its lower bound "a".
    4. Comma-separated text is reduced to its last item.
    5. Every character other than digits and "." is removed and the rest is
       parsed; if nothing is left or the parse fails the result is 0.0.

    Parameters
    ----------
    x : any scalar
        The raw cell value.

    Returns
    -------
    float
        The parsed value, ``np.nan`` for missing input, or 0.0 for text that
        holds no usable number.
    """
    if x is None or x is pd.NA or (isinstance(x, float) and np.isnan(x)):
        return np.nan

    text = str(x).strip()

    # Fast path: keep values that are already valid numbers untouched. The
    # character stripping further down would delete "-" and "e".
    try:
        parsed = float(text)
    except ValueError:
        parsed = None
    if parsed is not None:
        return parsed if np.isfinite(parsed) else np.nan

    if re.match(r'^\d+\-\d+$', text):
        text = text.split('-')[0].strip()
    if ',' in text:
        text = text.split(',')[-1].strip()

    digits_only = re.sub(r'[^\d.]', '', text)
    if not digits_only:
        return 0.0
    try:
        return float(digits_only)
    except ValueError:
        # e.g. "1.2.3": digits are present but do not form a single number.
        return 0.0

# Run the numeric cleaner over every cell of each column detected as numeric.
for column_name in numeric_cols:
    raw_values = filtered_df[column_name]
    filtered_df[column_name] = raw_values.apply(clean_numeric_value)

# ------------------- STEP 4: Clean Categorical Columns -------------------
def clean_categorical_value(x):
    """Normalise one raw cell of a categorical column.

    Returns
    -------
    ``np.nan``
        for missing input (``None``, ``pd.NA``, float NaN) and for blank text,
        so missing cells are not turned into the literal string ``'nan'``.
    ``'阴性'``
        if the text contains ``阴性``, ``-`` or ``neg``.
    ``'阳性'``
        if the text contains ``阳性``, ``+``, ``±`` or ``abnorm``.
    ``'正常'``
        if the text contains ``正常`` or ``norm``.
    str
        otherwise, the whitespace-stripped text unchanged.

    The checks run in the order listed and the first match wins. ASCII
    keywords are matched case-insensitively ("Neg", "Normal").
    """
    if x is None or x is pd.NA or (isinstance(x, float) and np.isnan(x)):
        return np.nan

    text = str(x).strip()
    if text == '':
        return np.nan

    lowered = text.lower()
    if '阴性' in text or '-' in text or 'neg' in lowered:
        return '阴性'
    # 'abnorm' is tested before 'norm' so "abnormal" keeps mapping to 阳性.
    if '阳性' in text or '+' in text or '±' in text or 'abnorm' in lowered:
        return '阳性'
    # A "norm"/"normal" reading belongs with 正常; it used to be reported as 阳性.
    if '正常' in text or 'norm' in lowered:
        return '正常'
    return text  # No pattern matched: keep the stripped original.

# Run the categorical cleaner over every cell of each non-numeric column.
for column_name in categorical_cols:
    raw_values = filtered_df[column_name]
    filtered_df[column_name] = raw_values.apply(clean_categorical_value)

# ------------------- STEP 5: Re-Classify Columns -------------------
# Rebuild both column lists from the dtypes the cleaning actually produced,
# replacing the earlier heuristic classification.
numeric_cols = list(filtered_df.select_dtypes(include=['float64', 'int64']).columns)
categorical_cols = list(filtered_df.select_dtypes(include=['object']).columns)


In [224]:
# Standardise the numeric columns with a scaler fitted on those same columns,
# writing the result back over the original values.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(filtered_df[numeric_cols])
filtered_df[numeric_cols] = scaled_values

# Last expression of the cell: display the categorical columns for inspection.
filtered_df[categorical_cols]

Unnamed: 0,ABO血型,HIV Ag/Ab,Rh(C)血型,Rh(D)血型,Rh(E)血型,TEG凝血因子活性,TEG纤溶系统,TEG纤维蛋白原水平,TEG血小板功能,丙型肝炎病毒核心抗原,...,尿潜血,尿白细胞酯酶,尿胆原,尿胆红素,尿葡萄糖,尿蛋白质,尿酮体,尿颜色,抗体筛选,检测类型
0,B,阴性,阳性,阳性,阳性,正常,正常,正常,正常,阴性,...,阳性,阴性,阳性,阴性,阴性,阳性,阴性,琥珀色,阴性,普通检测
1,B,阴性,阳性,阳性,阳性,正常,正常,正常,正常,阴性,...,阳性,阴性,阳性,阴性,阴性,阴性,阴性,稻黄色,阴性,普通检测
2,B,阴性,阴性,阳性,阳性,正常,正常,正常,正常,阴性,...,阴性,阴性,阳性,阴性,阴性,阴性,阴性,稻黄色,阴性,普通检测
3,B,阴性,阳性,阳性,阴性,正常,正常,正常,正常,阴性,...,阴性,阴性,阳性,阴性,阳性,阴性,阴性,浅黄色,阴性,普通检测
4,A,阴性,阳性,阳性,阳性,正常,正常,正常,增高,阴性,...,阴性,阴性,阳性,阴性,阴性,阴性,阴性,稻黄色,阴性,普通检测
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,O,阴性,阳性,阳性,阴性,正常,正常,增高,正常,阴性,...,阳性,阴性,阳性,阴性,阴性,阳性,阴性,,阴性,普通检测
773,A,阴性,阳性,阳性,阴性,,,,,阴性,...,阴性,阴性,阳性,阴性,阴性,阴性,阴性,,阴性,
774,A,阴性,阳性,阳性,阴性,正常,正常,正常,正常,,...,阴性,阴性,阳性,阴性,阴性,阳性,阴性,稻黄色,,普通检测
775,A,阴性,阳性,阳性,阳性,正常,正常,正常,正常,阴性,...,阴性,阳性,阳性,阴性,阴性,阴性,阴性,稻黄色,阴性,普通检测


In [230]:
output_dir = r'C:\Users\user\Documents\RCC_Project\kidneytumor_project\Preprocessing\output'

# to_csv does not create missing folders, so make sure the target exists
# first; exist_ok=True makes this a no-op when it is already there.
os.makedirs(output_dir, exist_ok=True)

# Save the DataFrame to a CSV file (utf-8-sig adds a BOM to the file).
filtered_df.to_csv(os.path.join(output_dir, 'cleaned_output.csv'), encoding='utf-8-sig', index=False)