In [1]:
# load and summarize the proteomics training dataset
import numpy as np
import pandas as pd
import warnings
import random

# Seed every RNG this notebook can reach from a single constant.
# The stdlib generator alone is not enough: scikit-learn falls back to NumPy's
# global generator whenever an estimator or splitter gets random_state=None.
SEED = 7
random.seed(SEED)
np.random.seed(SEED)

from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import ElasticNet
from sksurv.linear_model import CoxnetSurvivalAnalysis as CoxPH
from sksurv.metrics import concordance_index_censored
from sklearn.exceptions import FitFailedWarning
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Read the training protein table; rows are keyed by the `eid` column.
proteins = pd.read_csv("Data/Processed/Full/full_train.csv", index_col="eid")

# Print the dimensions first (the recorded run shows 1428 protein columns),
# then a preview of the first rows.
for summary in (proteins.shape, proteins.head()):
    print(summary)

(28487, 1428)
          AARSD1  ABHD14B    ABL1   ACAA1    ACAN     ACE2   ACOX1    ACP5  \
eid                                                                          
1000041  0.34760  -0.1879 -0.3896  0.1511  0.5124 -0.28310  0.2759 -0.3092   
1000086  0.76670   0.1949 -0.5907 -2.0024  0.1910  0.05565 -0.5210 -0.0292   
1000181 -0.43050  -0.3222 -0.3300 -0.5217 -0.4388 -0.33185 -0.3997 -0.4358   
1000345  0.31910   0.5591  0.4769 -0.2783  0.2849  0.16560  0.0844 -0.0938   
1000374 -0.19775  -1.1510 -0.5480  0.6309  0.1449  0.76155  0.1626 -0.3390   

           ACP6   ACTA2  ...   WNT9A    WWP2    XCL1      XG  XPNPEP2  \
eid                      ...                                            
1000041 -0.2001 -0.3444  ... -0.1501  0.1150 -1.1485 -0.1583   1.0802   
1000086 -0.3807 -0.3239  ... -0.1429 -0.4454  0.2656 -0.1033   1.5979   
1000181  0.0848  0.0293  ... -0.0308 -0.0307  0.3940  0.1194  -0.8755   
1000345  0.0633 -0.0207  ...  0.0618  0.3204 -0.3574 -0.5261  -0.3112   
1

In [2]:
trainset_allprot = proteins

# Outcome table, re-indexed so its rows follow the row order of the protein table.
mort = pd.read_csv("Data/Processed/Full/mort_full_train.csv", index_col="eid")
mort = mort.reindex(trainset_allprot.index)

# Protein table with the `sex` column attached (joined on eid).
trainset_allprot_sex = trainset_allprot.merge(mort[["sex"]], on="eid")

# Inspect the outcome table before discarding the leftover CSV row counter.
print(mort.head())
print(mort.shape)
mort = mort.drop(columns="Unnamed: 0")
print(trainset_allprot.head())

# Split outcomes on the `sex` code: 1 goes to *_male, 0 goes to *_female.
mort_male = mort[mort["sex"].eq(1)]
mort_female = mort[mort["sex"].eq(0)]

# Keep the protein rows whose eid is present in the matching outcome subset.
trainset_male = trainset_allprot[trainset_allprot.index.isin(mort_male.index)]
trainset_female = trainset_allprot[trainset_allprot.index.isin(mort_female.index)]

         Unnamed: 0  died  censorage  sex
eid                                      
1000041           3     0  77.678987  1.0
1000086           7     0  60.144422  1.0
1000181          17     0  68.170431  0.0
1000345          33     0  59.466804  1.0
1000374          36     0  64.004791  0.0
(28487, 4)
          AARSD1  ABHD14B    ABL1   ACAA1    ACAN     ACE2   ACOX1    ACP5  \
eid                                                                          
1000041  0.34760  -0.1879 -0.3896  0.1511  0.5124 -0.28310  0.2759 -0.3092   
1000086  0.76670   0.1949 -0.5907 -2.0024  0.1910  0.05565 -0.5210 -0.0292   
1000181 -0.43050  -0.3222 -0.3300 -0.5217 -0.4388 -0.33185 -0.3997 -0.4358   
1000345  0.31910   0.5591  0.4769 -0.2783  0.2849  0.16560  0.0844 -0.0938   
1000374 -0.19775  -1.1510 -0.5480  0.6309  0.1449  0.76155  0.1626 -0.3390   

           ACP6   ACTA2  ...   WNT9A    WWP2    XCL1      XG  XPNPEP2  \
eid                      ...                                            
10

In [None]:
 # Features: protein rows of the female subset.
 X = trainset_female
 # Outcome as a structured array of (event flag, time) records built from
 # the `died` and `censorage` columns.
 # NOTE(review): `censorage` looks like an age in years in the recorded output
 # (e.g. 77.68), although the field is named "Survival_in_days" - confirm units.
 y_tmp = list(map(tuple, mort_female[["died", "censorage"]].to_numpy()))
 y = np.array(y_tmp, dtype=np.dtype([("Status", "?"), ("Survival_in_days", "<f8")]))

# Elastic-net Cox model selection over (l1_ratio, alpha).
#
# For each l1_ratio:
#   1. fit once on all of X, y to obtain the alpha path (`alphas_`);
#   2. grid-search every alpha on that path with 5-fold cross-validation;
#   3. record the selected alpha and its mean CV test score in `scores`;
#   4. if that score beats the best seen so far, keep the refitted model's
#      coefficients in `best_coefs`.
l1_ratios = np.arange(0.1, 1, 0.01)

# Loop-invariant setup, done once instead of on every iteration.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FitFailedWarning)
# With a fixed integer random_state the splitter yields the same folds on every
# call, so a single instance can be shared by all grid searches.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

C = 0  # best mean CV test score seen so far
best_l1_ratio = None  # l1_ratio that produced `C` (None until a score beats 0)
best_coefs = pd.DataFrame(0, index=X.columns, columns=["coefficient"])
# scores[i] = (selected alpha, mean CV test score) for l1_ratios[i]
scores = np.zeros((len(l1_ratios), 2))

for i, l1_ratio in enumerate(l1_ratios):
    # Full-data fit, used only to derive the alpha grid for this l1_ratio.
    coxnet_pipe = make_pipeline(StandardScaler(), CoxPH(l1_ratio=l1_ratio, alpha_min_ratio=0.01))
    coxnet_pipe.fit(X, y)
    estimated_alphas = coxnet_pipe.named_steps["coxnetsurvivalanalysis"].alphas_

    # Each candidate is a single-element alpha list; a fit that fails is
    # scored 0.5 (error_score) instead of aborting the search.
    gcv = GridSearchCV(
        make_pipeline(StandardScaler(), CoxPH(l1_ratio=l1_ratio)),
        param_grid={"coxnetsurvivalanalysis__alphas": [[v] for v in estimated_alphas]},
        cv=cv,
        error_score=0.5,
        n_jobs=1,
    ).fit(X, y)

    scores[i, 0] = gcv.best_params_["coxnetsurvivalanalysis__alphas"][0]
    # best_score_ is the mean test score of the rank-1 candidate.
    scores[i, 1] = gcv.best_score_

    if scores[i, 1] > C:
        C = scores[i, 1]
        best_l1_ratio = l1_ratio
        best_model = gcv.best_estimator_.named_steps["coxnetsurvivalanalysis"]
        best_coefs = pd.DataFrame(best_model.coef_, index=X.columns, columns=["coefficient"])

In [None]:
 # Write the per-l1_ratio (alpha, score) table to disk in NumPy's .npy format.
 np.save("scores_women2.npy", scores)

In [None]:
# Persist the selected model's coefficient table (indexed by the columns of X) as CSV.
best_coefs.to_csv(path_or_buf="coefs_w_allprot2.csv")