In [1]:
import pandas as pd

# Clinical patient table (tab-separated); the first column is used as the row index.
patient_path = '../brca_tcga_pub2015/data_clinical_patient.txt'
patient_df = pd.read_table(patient_path, index_col=0)

In [19]:
# Clinical columns that describe survival outcomes.
survival_list = ['Overall Survival Status',
       'Overall Survival (Months)', 'Disease Free Status',
       'Disease Free (Months)']

survival_status = patient_df[survival_list]

# Rows 0-3 are skipped. NOTE(review): presumably these are extra header /
# metadata rows carried over from the clinical export -- confirm against the file.
# rename() is used without inplace=True so that each result is a fresh frame
# rather than an in-place modification of a slice of `survival_status`
# (which can raise SettingWithCopyWarning on some pandas versions).
surv_sta = (
    survival_status.iloc[4:, :1]
    .rename(columns={survival_status.columns[0]: 'surv_status'})
)
surv_length = (
    survival_status.iloc[4:, 1:2]
    .rename(columns={survival_status.columns[1]: 'surv_length'})
)

In [2]:
# Clinical columns holding the T / N / M codes and the overall disease stage.
stage_list = ['American Joint Committee on Cancer Tumor Stage Code',
       'Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code',
       'American Joint Committee on Cancer Metastasis Stage Code',
       'Neoplasm Disease Stage American Joint Committee on Cancer Code',]

stage_info = patient_df[stage_list]

# Keep only the fourth column (overall stage) and skip rows 0-3.
# NOTE(review): presumably those rows are extra header / metadata rows -- confirm.
# rename() without inplace=True returns a new frame instead of mutating a
# slice of `stage_info` (avoids SettingWithCopyWarning on some pandas versions).
overall_stages = (
    stage_info.iloc[4:, 3:4]
    .rename(columns={stage_info.columns[3]: 'overall_stage'})
)

# Remove patients whose stage is one of the placeholder strings.
drop_indices = overall_stages[overall_stages['overall_stage'].isin(['[Not Available]', '[Discrepancy]'])].index
overall_stages = overall_stages.drop(index=drop_indices)

In [3]:
# Collapse each stage label (including sub-stages such as IA / IIB / IIIC)
# into a single integer code per major stage.
# NOTE(review): 'Stage X' receives its own code 5 and is therefore treated as
# a fifth class by the models below -- confirm that this is intended.
stage_mapping = {
    label: code
    for code, labels in [
        (1, ('Stage I', 'Stage IA', 'Stage IB')),
        (2, ('Stage II', 'Stage IIA', 'Stage IIB')),
        (3, ('Stage III', 'Stage IIIA', 'Stage IIIB', 'Stage IIIC')),
        (4, ('Stage IV',)),
        (5, ('Stage X',)),
    ]
    for label in labels
}

# Labels that are not keys of stage_mapping become NaN in the new column.
overall_stages['overall_stage_simplified'] = overall_stages['overall_stage'].map(stage_mapping)

In [4]:
# Bulk RNA-seq expression table (tab-separated); the first column is the row index.
exp_path = '../cancer_data/2015_bulk_rna_seq.txt'
exp = pd.read_table(exp_path, index_col=0)

# Trim the last three characters from every row label so the labels can be
# intersected with the stage table's index in the next cell.
# NOTE(review): presumably this removes a sample-type suffix -- confirm.
exp.index = [row_label[:-3] for row_label in exp.index]

In [5]:
# Restrict both tables to the identifiers they share, in the same order.
common_index = exp.index.intersection(overall_stages.index)
if common_index.empty:
    raise ValueError(
        "No identifiers are shared between the expression table and the stage table; "
        "check that both indexes use the same ID format."
    )

overall_stages = overall_stages.loc[common_index]
exp = exp.loc[common_index]

# Later cells pair features and labels purely by row position
# (exp.values vs. the label column's .values), so the two indexes must match
# row for row. Duplicate labels in either table would break that pairing.
if not exp.index.equals(overall_stages.index):
    raise ValueError(
        "Expression rows and stage rows are not aligned after selecting the common "
        "identifiers; check both tables for duplicate index labels."
    )

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Extract the feature matrix and the target vector
X = exp.values  # feature matrix
y = overall_stages['overall_stage_simplified'].values  # target labels

# Split the data: 20% held out for testing, fixed seed for reproducibility
# NOTE(review): the split is not stratified although the classes are heavily
# imbalanced -- consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training split only
# and then applied to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
# zero_division=0 reports 0.00 for classes that are never predicted (the same
# numbers as the default) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))


Confusion Matrix:
[[ 2 18  3  0  0]
 [13 71 16  0  0]
 [ 2 23  6  0  0]
 [ 0  2  1  0  0]
 [ 0  3  1  0  0]]

Classification Report:
              precision    recall  f1-score   support

           1       0.12      0.09      0.10        23
           2       0.61      0.71      0.65       100
           3       0.22      0.19      0.21        31
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         4

    accuracy                           0.49       161
   macro avg       0.19      0.20      0.19       161
weighted avg       0.44      0.49      0.46       161



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.neural_network import MLPClassifier

# Create and train the MLP model on the standardized training split
mlp_model = MLPClassifier(max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred_mlp = mlp_model.predict(X_test)

# Evaluate the model
print("MLP Classifier Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_mlp))
print("\nMLP Classifier Classification Report:")
# zero_division=0 reports 0.00 for classes that are never predicted (the same
# numbers as the default) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred_mlp, zero_division=0))


MLP Classifier Confusion Matrix:
[[ 3 16  4  0  0]
 [13 64 21  2  0]
 [ 5 18  8  0  0]
 [ 0  2  1  0  0]
 [ 1  3  0  0  0]]

MLP Classifier Classification Report:
              precision    recall  f1-score   support

           1       0.14      0.13      0.13        23
           2       0.62      0.64      0.63       100
           3       0.24      0.26      0.25        31
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         4

    accuracy                           0.47       161
   macro avg       0.20      0.21      0.20       161
weighted avg       0.45      0.47      0.46       161



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the random forest model (100 trees, fixed seed)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Classifier Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("\nRandom Forest Classifier Classification Report:")
# zero_division=0 reports 0.00 for classes that are never predicted (the same
# numbers as the default) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred_rf, zero_division=0))


Random Forest Classifier Confusion Matrix:
[[ 1 20  2  0  0]
 [ 0 99  1  0  0]
 [ 0 30  1  0  0]
 [ 0  3  0  0  0]
 [ 0  4  0  0  0]]

Random Forest Classifier Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.04      0.08        23
           2       0.63      0.99      0.77       100
           3       0.25      0.03      0.06        31
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         4

    accuracy                           0.63       161
   macro avg       0.38      0.21      0.18       161
weighted avg       0.59      0.63      0.50       161



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
from sklearn.svm import SVC

# Create and train the SVC model with default kernel settings
svc_model = SVC(random_state=42)
svc_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred_svc = svc_model.predict(X_test)

# Evaluate the model
print("SVC Classifier Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svc))
print("\nSVC Classifier Classification Report:")
# zero_division=0 reports 0.00 for classes that are never predicted (the same
# numbers as the default) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred_svc, zero_division=0))


SVC Classifier Confusion Matrix:
[[  0  23   0   0   0]
 [  0 100   0   0   0]
 [  0  31   0   0   0]
 [  0   3   0   0   0]
 [  0   4   0   0   0]]

SVC Classifier Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        23
           2       0.62      1.00      0.77       100
           3       0.00      0.00      0.00        31
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         4

    accuracy                           0.62       161
   macro avg       0.12      0.20      0.15       161
weighted avg       0.39      0.62      0.48       161



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
