In [48]:
from pathlib import Path

import pandas as pd

# Directory holding the TCGA BLCA downloads. Every input file below lives
# here, so pointing the notebook at another machine means editing one line.
DATA_DIR = Path(r"C:\Users\trent\OneDrive\Documents\Big Desktop Files\TCGA")


def _load_feature_matrix(path, suffix=""):
    """Read a gzipped, tab-separated matrix and return it transposed.

    The file must contain a ``sample`` column. Its values (with ``suffix``
    appended) become the column labels of the result, and every other column
    of the file becomes one row.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the gzip-compressed TSV file.
    suffix : str, default ""
        Text appended to every ``sample`` value so that labels coming from
        different data types stay distinct once the matrices are merged.

    Returns
    -------
    pandas.DataFrame
        The transposed matrix, indexed by the file's remaining column names.
    """
    frame = pd.read_csv(path, sep="\t", compression="gzip")
    # Cast to str before concatenating: a label that pandas parsed as a number
    # or as NaN would otherwise raise a TypeError on `+ suffix`.
    frame["sample"] = frame["sample"].astype(str) + suffix
    return frame.set_index("sample").T


# Load MC3 gene-level mutation data
mc3_file = DATA_DIR / "mc3_gene_level_BLCA_mc3_gene_level.txt.gz"
mc3_df = _load_feature_matrix(mc3_file, suffix="_mut")
print("MC3 Gene-Level Mutation Data:")
print("Shape:", mc3_df.shape)
print(mc3_df.head())

# Load survival data, discarding rows that have no OS.time value
survival_file = DATA_DIR / "survival_BLCA_survival.txt"
survival_df = pd.read_csv(survival_file, sep="\t").dropna(subset=["OS.time"])
print("\nSurvival Data:")
print("Shape:", survival_df.shape)
print(survival_df.head())

# Load HiSeqV2 sample map
hi_seq_file = DATA_DIR / "TCGA.BLCA.sampleMap_HiSeqV2.gz"
hi_seq_df = _load_feature_matrix(hi_seq_file, suffix="_exp")
print("\nHiSeqV2 Sample Map:")
print("Shape:", hi_seq_df.shape)
print(hi_seq_df.head())

# Load HumanMethylation450 sample map (labels are left unsuffixed)
methylation_file = DATA_DIR / "TCGA.BLCA.sampleMap_HumanMethylation450.gz"
methylation_df = _load_feature_matrix(methylation_file)
# After the transpose each column is one feature; drop every feature that is
# missing for at least one row so the matrix contains no NaN.
methylation_df = methylation_df.dropna(axis=1, how="any")
print("\nHumanMethylation450 Sample Map:")
print("Shape:", methylation_df.shape)
print(methylation_df.head())


MC3 Gene-Level Mutation Data:
Shape: (411, 40543)
sample           UBE2Q2_mut  CHMP1B_mut  PSMA2P1_mut  SHQ1P1_mut  CPHL1P_mut  \
TCGA-2F-A9KO-01           0           0            0           0           0   
TCGA-2F-A9KP-01           0           0            0           0           0   
TCGA-2F-A9KQ-01           0           0            0           0           0   
TCGA-2F-A9KR-01           0           0            0           0           0   
TCGA-2F-A9KT-01           0           0            0           0           0   

sample           SSXP10_mut  REM1_mut  TCOF1_mut  NSRP1_mut  OPA6_mut  ...  \
TCGA-2F-A9KO-01           0         0          0          0         0  ...   
TCGA-2F-A9KP-01           0         0          0          0         0  ...   
TCGA-2F-A9KQ-01           0         0          0          0         0  ...   
TCGA-2F-A9KR-01           0         0          0          0         0  ...   
TCGA-2F-A9KT-01           0         0          0          0         0  ...   



In [49]:
# Combine the three feature matrices on their shared row index. Both steps are
# inner joins, so only index labels present in all three frames are kept.
merged_df = (
    mc3_df
    .merge(hi_seq_df, how="inner", left_index=True, right_index=True)
    .merge(methylation_df, how="inner", left_index=True, right_index=True)
)

merged_df

sample,UBE2Q2_mut,CHMP1B_mut,PSMA2P1_mut,SHQ1P1_mut,CPHL1P_mut,SSXP10_mut,REM1_mut,TCOF1_mut,NSRP1_mut,OPA6_mut,...,cg19358568,cg27295654,cg03116837,cg15678817,cg14483317,cg11692435,cg10230711,cg16651827,cg18138552,cg07883722
TCGA-2F-A9KO-01,0,0,0,0,0,0,0,0,0,0,...,0.5823,0.8671,0.6482,0.5051,0.0347,0.7364,0.6628,0.7584,0.0273,0.9609
TCGA-2F-A9KP-01,0,0,0,0,0,0,0,0,0,0,...,0.2709,0.5413,0.4315,0.6861,0.0286,0.8113,0.0458,0.5738,0.0279,0.9203
TCGA-2F-A9KQ-01,0,0,0,0,0,0,0,0,0,0,...,0.9243,0.7512,0.2901,0.8303,0.0335,0.1899,0.0489,0.3427,0.0340,0.8079
TCGA-2F-A9KR-01,0,0,0,0,0,0,0,0,0,0,...,0.3893,0.8841,0.7948,0.5876,0.0317,0.9816,0.0456,0.5828,0.0242,0.7690
TCGA-2F-A9KT-01,0,0,0,0,0,0,0,0,0,0,...,0.5010,0.8779,0.3522,0.6134,0.0501,0.9683,0.0469,0.4204,0.0241,0.9373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZF-AA56-01,0,0,0,0,0,0,0,0,0,0,...,0.6599,0.8637,0.6874,0.4645,0.0329,0.9430,0.0414,0.6740,0.0336,0.9319
TCGA-ZF-AA58-01,0,0,0,0,0,0,0,0,0,0,...,0.9413,0.8615,0.4693,0.3838,0.0377,0.9856,0.0514,0.5706,0.0273,0.9496
TCGA-ZF-AA5H-01,0,0,0,0,0,0,0,0,0,0,...,0.8848,0.7841,0.3457,0.4516,0.0964,0.9729,0.0396,0.3772,0.0277,0.9238
TCGA-ZF-AA5N-01,0,0,0,0,0,0,0,0,0,0,...,0.9541,0.4782,0.9141,0.6951,0.0456,0.9875,0.0972,0.3394,0.0358,0.8883


In [50]:
# Keep only the survival rows whose OS column equals 1.
filtered_survival_df = survival_df.loc[survival_df["OS"].eq(1)]

# Attach OS.time to the feature matrix: merged_df's index is matched against
# the survival table's 'sample' column (inner join), and 'sample' is then
# promoted to the index of the result.
survival_columns = filtered_survival_df[['sample', 'OS.time']]
joined_df = merged_df.merge(
    survival_columns,
    how='inner',
    left_index=True,
    right_on='sample',
).set_index('sample')

# Preview the first rows of the joined frame.
print(joined_df.head())

                 UBE2Q2_mut  CHMP1B_mut  PSMA2P1_mut  SHQ1P1_mut  CPHL1P_mut  \
sample                                                                         
TCGA-2F-A9KO-01           0           0            0           0           0   
TCGA-2F-A9KP-01           0           0            0           0           0   
TCGA-2F-A9KR-01           0           0            0           0           0   
TCGA-2F-A9KW-01           0           0            0           0           0   
TCGA-4Z-AA7N-01           0           0            0           0           0   

                 SSXP10_mut  REM1_mut  TCOF1_mut  NSRP1_mut  OPA6_mut  ...  \
sample                                                                 ...   
TCGA-2F-A9KO-01           0         0          0          0         0  ...   
TCGA-2F-A9KP-01           0         0          0          0         0  ...   
TCGA-2F-A9KR-01           0         0          0          0         0  ...   
TCGA-2F-A9KW-01           0         0          0 

In [51]:
# import necessary libraries
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split into training and testing sets: every column except OS.time is a
# feature, OS.time is the regression target; 20% is held out with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    joined_df.drop("OS.time", axis=1),
    joined_df["OS.time"],
    test_size=0.2,
    random_state=42,
)

# create a Gradient Boosting Regressor model
gbm = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=3, random_state=42, verbose=2)

# train the model on the training set
gbm.fit(X_train, y_train)

# predict on the testing set
y_pred = gbm.predict(X_test)

# Root mean squared error, computed as sqrt(MSE). The `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean Squared Error:", rmse)


      Iter       Train Loss   Remaining Time 
         1      287380.3012          140.54m
         2      249822.5100          140.96m
         3      214310.3744          140.02m
         4      187643.1873          139.95m
         5      160104.6107          139.49m
         6      137085.6564          139.19m
         7      117883.4233          138.99m
         8      101584.6357          138.65m
         9       88236.6455          138.44m
        10       77495.2479          138.29m
        11       68203.8661          138.03m
        12       60131.3335          137.87m
        13       52629.4212          137.60m
        14       46649.5577          137.41m
        15       40247.4253          137.07m
        16       34908.0950          136.68m
        17       30963.2773          136.62m
        18       27118.0579          136.40m
        19       23327.0225          136.01m
        20       21046.1263          136.03m
        21       18651.0523          135.85m
        2

In [54]:
import lazypredict
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split

# Target is OS.time; every remaining column of joined_df is used as a feature.
y = joined_df['OS.time']
X = joined_df.drop(columns=['OS.time'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit LazyRegressor on the split and print the table of models it returns.
reg = LazyRegressor(custom_metric=None, ignore_warnings=True, verbose=0)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
print(models)


^C


  0%|          | 0/42 [00:00<?, ?it/s]ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\trent\\anaconda3\\envs\\Springboard\\Lib\\site-packages\\scipy\\.libs\\lib_arpack-.UUYM56TM26ZLP3SG3B64BL3XKPFDMZRY.gfortran-win_amd64.dll'
Consider using the `--user` option or check the permissions.



Collecting click==7.1.2
  Using cached click-7.1.2-py2.py3-none-any.whl (82 kB)
Collecting numpy==1.19.1
  Using cached numpy-1.19.1-cp38-cp38-win_amd64.whl (13.0 MB)
Collecting PyYAML==5.3.1
  Using cached PyYAML-5.3.1-cp38-cp38-win_amd64.whl (219 kB)
Collecting pandas==1.0.5
  Using cached pandas-1.0.5-cp38-cp38-win_amd64.whl (8.9 MB)
Collecting scipy==1.5.4
  Using cached scipy-1.5.4-cp38-cp38-win_amd64.whl (31.4 MB)
Collecting six==1.15.0
  Using cached six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting tqdm==4.56.0
  Using cached tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
Collecting joblib==1.0.0
  Using cached joblib-1.0.0-py3-none-any.whl (302 kB)
Installing collected packages: numpy, six, scipy, joblib, tqdm, PyYAML, pandas, click
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.4
    Uninstalling numpy-1.21.4:
      Successfully uninstalled numpy-1.21.4
  Attempting uninstall: six
    Found existing installation: six 1.16.0
    Uninstalling six-1.16.0:
 

100%|██████████| 42/42 [3:51:33<00:00, 330.79s/it]   

                                Adjusted R-Squared  \
Model                                                
SGDRegressor               80057891486699503616.00   
Lars                         995427962164160000.00   
PassiveAggressiveRegressor                    1.00   
DecisionTreeRegressor                         1.00   
KernelRidge                                   1.00   
ExtraTreeRegressor                            1.00   
GaussianProcessRegressor                      1.00   
OrthogonalMatchingPursuit                     1.00   
XGBRegressor                                  1.00   
LinearSVR                                     1.00   
LGBMRegressor                                 1.00   
BaggingRegressor                              1.00   
MLPRegressor                                  1.00   
LassoLars                                     1.00   
LassoCV                                       1.00   
Lasso                                         1.00   
ElasticNet                  




In [47]:
# Shape of the survival table after removing every row that has a missing
# value in any column.
# NOTE(review): the output recorded under this cell is an OS.time Series, not
# a shape tuple — it looks stale; re-run the cell to confirm.
survival_df.dropna(axis=0, how="any").shape

sample
TCGA-BT-A20J-01     579.0
TCGA-DK-A1AB-01     508.0
TCGA-XF-A9SP-01     454.0
TCGA-BT-A2LB-01     492.0
TCGA-E7-A97Q-01     246.0
TCGA-BL-A3JM-01     205.0
TCGA-GV-A3QH-01     258.0
TCGA-ZF-AA52-01    1077.0
TCGA-GU-A42P-01     332.0
TCGA-XF-A9ST-01     128.0
TCGA-BT-A20R-01     154.0
TCGA-XF-AAME-01    2828.0
TCGA-CU-A5W6-01      56.0
TCGA-GV-A3QG-01       NaN
TCGA-BL-A13J-01      81.0
TCGA-GU-A42R-01     577.0
TCGA-GU-A767-01     144.0
TCGA-FD-A5BT-01     328.0
TCGA-4Z-AA81-01    1270.0
TCGA-GU-AATO-01     324.0
TCGA-BT-A0YX-01     400.0
TCGA-E7-A97P-01     437.0
TCGA-DK-AA75-01     340.0
TCGA-ZF-A9RN-01     615.0
TCGA-ZF-AA4R-01    1036.0
TCGA-ZF-A9R0-01     680.0
TCGA-GV-A3QF-01     617.0
TCGA-E7-A541-01     778.0
TCGA-FD-A6TD-01     386.0
TCGA-DK-A3WX-01     321.0
TCGA-BT-A20X-01     251.0
TCGA-UY-A78M-01     690.0
TCGA-XF-A9SX-01     719.0
TCGA-BT-A2LD-01     623.0
TCGA-5N-A9KI-01      76.0
TCGA-CU-A72E-01     413.0
Name: OS.time, dtype: float64