In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Logistic Regression

In [8]:
# Read the clinical table and reduce every PatientID string to its last three
# characters (the label table loaded later is trimmed the same way).
clinical_info = pd.read_csv("clinical_info.csv").assign(
    PatientID=lambda frame: frame['PatientID'].str[-3:]
)
clinical_info.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Overall.Stage,gender,two-year.survival
0,4,70,2,1,II,male,dead
1,5,80,4,2,IIIb,male,dead
2,6,73,3,1,IIIa,male,dead
3,7,81,2,2,IIIa,male,dead
4,8,71,2,2,IIIa,male,dead


In [9]:
# Print column dtypes and non-null counts of the table as loaded.
clinical_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          100 non-null    object
 1   age                100 non-null    int64 
 2   clinical.T.Stage   100 non-null    int64 
 3   Clinical.N.Stage   100 non-null    int64 
 4   Overall.Stage      100 non-null    object
 5   gender             100 non-null    object
 6   two-year.survival  100 non-null    object
dtypes: int64(3), object(4)
memory usage: 5.6+ KB


In [10]:
# Ordinal encoding of the overall stage: later stages get larger integers.
STAGE_CODES = {"I": 1, "II": 2, "IIIa": 3, "IIIb": 4}
stage_encoded = clinical_info['Overall.Stage'].map(STAGE_CODES)

# Series.map() turns every value missing from the dict into NaN without any
# warning. That also happens when this cell is re-run on the already encoded
# column, so stop loudly instead of silently wiping the column.
if stage_encoded.isna().any():
    unmapped_stages = clinical_info.loc[stage_encoded.isna(), 'Overall.Stage'].unique()
    raise ValueError(
        "Unexpected Overall.Stage values (unknown label, or column already encoded?): "
        f"{list(unmapped_stages)}"
    )

clinical_info['Overall.Stage'] = stage_encoded
clinical_info['Overall.Stage'].value_counts().sort_index()

1    17
2    15
3    30
4    38
Name: Overall.Stage, dtype: int64

Overall.Stage는 I, II, IIIa, IIIb로 갈수록 생존율이 낮으므로, 1, 2, 3, 4로 mapping해준다.

In [11]:
# Binary encoding of gender: male -> 1, female -> 0.
GENDER_CODES = {"male": 1, "female": 0}
gender_encoded = clinical_info['gender'].map(GENDER_CODES)

# Series.map() yields NaN for any value that is not a key of the dict
# (including a second run of this cell on already encoded data); fail loudly
# rather than continuing with a corrupted column.
if gender_encoded.isna().any():
    unmapped_genders = clinical_info.loc[gender_encoded.isna(), 'gender'].unique()
    raise ValueError(
        "Unexpected gender values (unknown label, or column already encoded?): "
        f"{list(unmapped_genders)}"
    )

clinical_info['gender'] = gender_encoded
clinical_info['gender'].value_counts().sort_index()

0    24
1    76
Name: gender, dtype: int64

gender도 male은 1, female은 0으로 mapping해준다.

In [12]:
# Binary encoding of the target: dead -> 1, survived -> 0.
SURVIVAL_CODES = {"dead": 1, "survived": 0}
survival_encoded = clinical_info['two-year.survival'].map(SURVIVAL_CODES)

# Series.map() yields NaN for any value that is not a key of the dict
# (including a second run of this cell on already encoded data). A NaN target
# would silently corrupt the later model fit, so fail loudly here.
if survival_encoded.isna().any():
    unmapped_outcomes = clinical_info.loc[survival_encoded.isna(), 'two-year.survival'].unique()
    raise ValueError(
        "Unexpected two-year.survival values (unknown label, or column already encoded?): "
        f"{list(unmapped_outcomes)}"
    )

clinical_info['two-year.survival'] = survival_encoded
clinical_info['two-year.survival'].value_counts()

1    68
0    32
Name: two-year.survival, dtype: int64

목표변수인 two-year.survival은 2년 내 생존 여부이다. 따라서 사망(dead)은 1, 생존(survived)은 0으로 mapping한다.

In [13]:
# Display the current column labels (still containing "." and "-").
clinical_info.columns

Index(['PatientID', 'age', 'clinical.T.Stage', 'Clinical.N.Stage',
       'Overall.Stage', 'gender', 'two-year.survival'],
      dtype='object')

In [14]:
# Column labels containing "." or "-" are not valid bare identifiers in the
# formula strings used for the Logit models, so both characters are replaced
# with "_".
# The new labels are derived from the existing ones instead of being assigned
# from a hard-coded positional list: a positional list silently mislabels
# columns if the CSV column order ever changes. This form is also safe to
# re-run, because already-clean labels are left untouched.
clinical_info.columns = clinical_info.columns.str.replace(r"[.\-]", "_", regex=True)
clinical_info.columns

Index(['PatientID', 'age', 'clinical_T_Stage', 'Clinical_N_Stage',
       'Overall_Stage', 'gender', 'two_year_survival'],
      dtype='object')

formula를 이용한 회귀분석 시 변수명에 .이나 -가 있으면 안 되므로 모두 _로 바꿔준다.

In [15]:
# Re-check dtypes and non-null counts after the encoding and renaming steps.
clinical_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          100 non-null    object
 1   age                100 non-null    int64 
 2   clinical_T_Stage   100 non-null    int64 
 3   Clinical_N_Stage   100 non-null    int64 
 4   Overall_Stage      100 non-null    int64 
 5   gender             100 non-null    int64 
 6   two_year_survival  100 non-null    int64 
dtypes: int64(6), object(1)
memory usage: 5.6+ KB


In [16]:
# Read the label file, keep only its PatientID column, and trim every ID to
# its last three characters.
df = pd.DataFrame(pd.read_csv('./label.csv', sep=',')[['PatientID']])
df['PatientID'] = df['PatientID'].str[-3:]

In [17]:
# Draw 80% of the ID rows with a fixed seed; the rows whose index was not
# drawn form the test set.
training_data = df.sample(frac=0.8, random_state=42)
testing_data = df.loc[~df.index.isin(training_data.index)]

In [18]:
# Attach the clinical columns to each ID split via an inner join on PatientID.
clinical_info_train = training_data.merge(clinical_info, how='inner', on='PatientID')
clinical_info_test = testing_data.merge(clinical_info, how='inner', on='PatientID')

train 데이터와 test 데이터를 8:2 비율로 분할한다.

In [19]:
# Peek at the first PatientIDs that ended up in the training split.
clinical_info_train['PatientID'].head()

0    250
1    131
2    200
3    114
4    109
Name: PatientID, dtype: object

In [20]:
# Peek at the first PatientIDs that ended up in the test split.
clinical_info_test['PatientID'].head()

0    005
1    006
2    030
3    042
4    045
Name: PatientID, dtype: object

## CASE 1) age, clinical.T.Stage, Clinical.N.Stage, gender

In [21]:
# Case 1 leaves out the overall stage; every other column is kept.
clinical_info_case1_train = clinical_info_train.drop(columns=["Overall_Stage"])
clinical_info_case1_test = clinical_info_test.drop(columns=["Overall_Stage"])
clinical_info_case1_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 79
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          80 non-null     object
 1   age                80 non-null     int64 
 2   clinical_T_Stage   80 non-null     int64 
 3   Clinical_N_Stage   80 non-null     int64 
 4   gender             80 non-null     int64 
 5   two_year_survival  80 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 4.4+ KB


case 1의 경우 age와 T, N, gender로만 분석할 예정이므로 나머지 설명변수는 분석에서 제외한다.

In [22]:
# Write the case 1 train and test frames to CSV without the index column.
case1_exports = {
    "clinical_info_case1_train.csv": clinical_info_case1_train,
    "clinical_info_case1_test.csv": clinical_info_case1_test,
}
for csv_path, split_frame in case1_exports.items():
    split_frame.to_csv(csv_path, index=False)

흉부영상 CT 데이터와 PatientID를 맞추기 위해 train 데이터와 test 데이터를 따로 저장한다.

In [23]:
# Fit a logistic regression of two-year survival on age, T stage, N stage and
# gender using the case 1 training frame, then print the fit summary.
case1_formula = """two_year_survival ~ age + clinical_T_Stage + Clinical_N_Stage + gender"""
log_model = Logit.from_formula(formula=case1_formula, data=clinical_info_case1_train)
log_result = log_model.fit()
print(log_result.summary())

Optimization terminated successfully.
         Current function value: 0.594579
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:      two_year_survival   No. Observations:                   80
Model:                          Logit   Df Residuals:                       75
Method:                           MLE   Df Model:                            4
Date:                Tue, 25 Jan 2022   Pseudo R-squ.:                 0.04268
Time:                        00:04:18   Log-Likelihood:                -47.566
converged:                       True   LL-Null:                       -49.687
Covariance Type:            nonrobust   LLR p-value:                    0.3744
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.2599      2.478      0.105      0.916      -4.597       5.117
age        

In [24]:
# Predicted probabilities for the case 1 test rows.
case1_features = ['age', 'clinical_T_Stage', 'Clinical_N_Stage', 'gender']
y_pred = log_result.predict(clinical_info_case1_test[case1_features])

# Probabilities strictly above 0.5 are labelled 1, everything else 0.
y_pred_class = y_pred.gt(0.5).astype(int)
y_pred_class.head()

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [25]:
# Compare the case 1 test labels with the thresholded predictions and report
# accuracy plus the confusion matrix.
case1_truth = clinical_info_case1_test['two_year_survival']
case1_accuracy = accuracy_score(case1_truth, y_pred_class)
case1_confusion = confusion_matrix(case1_truth, y_pred_class)
print("Accuracy: {0:.3f}\n".format(case1_accuracy))
print("Confusion Matrix: \n{}".format(case1_confusion))

Accuracy: 0.600

Confusion Matrix: 
[[ 1  6]
 [ 2 11]]


모델의 정분류율은 60.0%이다.

## CASE 2) age, Overall.Stage, gender

In [26]:
# Case 2 uses the overall stage in place of the separate T and N stages, so
# those two columns are dropped from both splits.
case2_excluded = ["clinical_T_Stage", "Clinical_N_Stage"]
clinical_info_case2_train = clinical_info_train.drop(columns=case2_excluded)
clinical_info_case2_test = clinical_info_test.drop(columns=case2_excluded)
clinical_info_case2_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 79
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          80 non-null     object
 1   age                80 non-null     int64 
 2   Overall_Stage      80 non-null     int64 
 3   gender             80 non-null     int64 
 4   two_year_survival  80 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 3.8+ KB


case 2의 경우 age, Overall.Stage, gender로만 분석할 예정이므로 나머지 설명변수는 분석에서 제외한다.

In [27]:
# Write the case 2 train and test frames to CSV without the index column.
case2_exports = {
    "clinical_info_case2_train.csv": clinical_info_case2_train,
    "clinical_info_case2_test.csv": clinical_info_case2_test,
}
for csv_path, split_frame in case2_exports.items():
    split_frame.to_csv(csv_path, index=False)

흉부영상 CT 데이터와 PatientID를 맞추기 위해 train 데이터와 test 데이터를 따로 저장한다.

In [28]:
# Fit a logistic regression of two-year survival on age, overall stage and
# gender using the case 2 training frame, then print the fit summary.
case2_formula = """two_year_survival ~ age + Overall_Stage + gender"""
log_model = Logit.from_formula(formula=case2_formula, data=clinical_info_case2_train)
log_result = log_model.fit()
print(log_result.summary())

Optimization terminated successfully.
         Current function value: 0.596005
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:      two_year_survival   No. Observations:                   80
Model:                          Logit   Df Residuals:                       76
Method:                           MLE   Df Model:                            3
Date:                Tue, 25 Jan 2022   Pseudo R-squ.:                 0.04038
Time:                        00:04:26   Log-Likelihood:                -47.680
converged:                       True   LL-Null:                       -49.687
Covariance Type:            nonrobust   LLR p-value:                    0.2601
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         2.1235      2.624      0.809      0.418      -3.019       7.266
age              -0.

In [29]:
# Predicted probabilities for the case 2 test rows.
case2_features = ['age', 'Overall_Stage', 'gender']
y_pred = log_result.predict(clinical_info_case2_test[case2_features])

# Probabilities strictly above 0.5 are labelled 1, everything else 0.
y_pred_class = y_pred.gt(0.5).astype(int)
y_pred_class.head()

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [30]:
# Compare the case 2 test labels with the thresholded predictions and report
# accuracy plus the confusion matrix.
case2_truth = clinical_info_case2_test['two_year_survival']
case2_accuracy = accuracy_score(case2_truth, y_pred_class)
case2_confusion = confusion_matrix(case2_truth, y_pred_class)
print("Accuracy: {0:.3f}\n".format(case2_accuracy))
print("Confusion Matrix: \n{}".format(case2_confusion))

Accuracy: 0.650

Confusion Matrix: 
[[ 1  6]
 [ 1 12]]


모델의 정분류율은 65.0%이다.