In [1]:
import os
os.chdir(r'/root/qbio_490_tritong/analysis_data')
# Load the CCRCC Data
import cptac
# cptac.list_datasets() --> check what dataset available
# cptac.download(dataset="Hnscc")
neck = cptac.Hnscc()

Loading hnscc v2.0..                      



                           

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Protein
protein_data = neck.get_proteomics()
protein_data.columns = protein_data.columns.get_level_values(0)

# RNA
rna_data = neck.get_transcriptomics()
# clinical
clinical_data = neck.get_clinical()

# clinical_data.shape = 192 x 30
# rna_data.shape = 162 x 38456
# protein_data.shape = 178 x 11744

# Find the overlapping patients between rna, clinical, and protein = (151, )
patient_shared = np.intersect1d(np.intersect1d(clinical_data.index, rna_data.index), protein_data.index)

clinical_data = clinical_data.loc[patient_shared, :] # 151 x 30
rna_data = rna_data.loc[patient_shared, :] # 151 x 38456
protein_data = protein_data.loc[patient_shared, :] # 151 x 11744


In [28]:
gene_shared = np.intersect1d(rna_data.columns, protein_data.columns)
rna_shared = rna_data.loc[:,gene_shared] # 151 x 11700
protein_shared = protein_data.loc[:,gene_shared] # 151 x 11700



In [81]:
clinical_data.columns

alc_col = ["alcohol_consum", "num_yrs_alc_con", ]
cig_col = ["num_smoke_per_day", "smoking_history", "smoking_second_hand"]
clinical_data["num_yrs_alc_con"].unique()

array([nan, '30', '32', '31', '20', '5', '15', 'Unknown', '8'],
      dtype=object)

In [91]:
### alcohol_consum
# 'Lifelong non-drinker'
# 'Consumed alcohol in the past, but currently a non-drinker'
# 'Alcohol consumption more than 2 drinks per day for men and more than 1 drink per day for women'
# 'Alcohol consumption equal to or less than 2 drinks per day for men and 1 drink or less per day for women'
#  nan
# 'Alcohol consumption history not available'

drinkers = ['Alcohol consumption more than 2 drinks per day for men and more than 1 drink per day for women',
           'Alcohol consumption equal to or less than 2 drinks per day for men and 1 drink or less per day for women',
           'Consumed alcohol in the past, but currently a non-drinker'] 

(59, 30)

In [69]:
# Find smoking history options
clinical_data["smoking_history"].unique()
# Current reformed smoker, years unknown
# Current smoker: Includes daily and non-daily smokers
# Current reformed smoker within past 15 years
# nan
# Current reformed smoker, more than 15 years
# Lifelong non-smoker: Less than 100 cigarettes smoked in lifetime
# Smoking history not available

In [73]:
# Verify that non_smokers/NA data also do not have num_smoke data or exposure to second hand smoke 
non_smokers = ["nan", "Lifelong non-smoker: Less than 100 cigarettes smoked in lifetime",
              "Smoking history not available"]
non_s = clinical_data[clinical_data["smoking_history"].isin(non_smokers)]
non_s["smoking_second_hand"].unique()

array(['No or minimal exposure to secondhand smoke',
       'Exposure to secondhand smoke history not available'], dtype=object)

In [114]:
# Filter out to find cig_only patients
smokers = ["Current reformed smoker, years unknown",
          "Current smoker: Includes daily and non-daily smokers",
          "Current reformed smoker within past 15 years",
          "Current reformed smoker, more than 15 years"]
# patient can have NA or Unknown in num_smoke_per_day and still be smoker, must combine with history
target_smoker = clinical_data[~pd.isna(clinical_data["num_smoke_per_day"]) | 
                       (clinical_data["smoking_history"].isin(smokers))]
# of the target_smoker, filter out those that also have drinking history
cig_patients = target_smoker[~target_smoker["alcohol_consum"].isin(drinkers)]
# cig_only.shape 13x30
cig_shared_protein = np.intersect1d(protein_shared.index, cig_patients.index)

cig_protein = protein_shared.loc[cig_shared_protein, :]

In [115]:
# Filter out to find alc_only patients
target_drinker = clinical_data[~pd.isna(clinical_data["num_yrs_alc_con"]) |
                          (clinical_data["alcohol_consum"].isin(drinkers))]
alc_patients = target_drinker[~target_drinker["smoking_history"].isin(smokers)]
# alc_only.shape 5x30
alc_shared_protein = np.intersect1d(protein_shared.index, alc_patients.index)

alc_protein = protein_shared.loc[alc_shared_protein, :]

In [117]:
# Filter out for patients that use both
both = target_smoker[target_smoker["alcohol_consum"].isin(drinkers)]
# both.shape 54x30
both_shared_protein = np.intersect1d(protein_shared.index, both.index)

both_protein = protein_shared.loc[both_shared_protein, :]

Name,A1BG,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADAT,AAED1,...,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDA,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00987,28.152905,,29.374443,27.872815,,24.288701,24.731336,24.530248,,,...,,24.575403,22.715326,,,19.438877,22.350913,27.694608,24.194437,17.471477
C3L-00994,28.348186,18.058554,30.252145,25.85458,,24.150865,24.325959,21.295667,,19.525432,...,,24.531751,22.636623,19.005228,,,22.780357,28.117156,24.429272,20.055226
C3L-00995,28.004445,,29.267877,28.182014,,24.292617,24.87866,19.727526,,18.660637,...,,24.719581,22.47525,,,19.361772,22.385058,28.565526,24.713502,
C3L-01138,28.293267,,29.229332,26.794084,,24.687007,25.28584,22.001161,,18.979301,...,,24.663664,23.301144,18.960358,,19.800705,22.347328,27.737254,24.360021,20.123649
C3L-01237,28.216073,,29.100412,26.924027,17.784901,24.152636,24.575984,22.549388,,18.644749,...,,24.586011,22.240152,19.241825,,19.347706,22.792216,28.202721,24.522072,19.868345
C3L-02617,27.452281,19.320859,28.290845,26.369306,,24.521894,24.965669,21.91234,,,...,,25.372397,23.407563,18.341602,,20.793089,22.721036,27.508147,24.483897,19.939257
C3L-02621,27.70104,,28.806067,26.457258,,24.331286,24.750323,22.075945,,,...,,24.806326,22.389692,,,20.392383,22.980954,27.645104,24.772278,22.296522
C3L-02651,27.61614,,29.169265,27.510986,,24.250423,25.134866,19.930743,,18.823073,...,,24.714068,22.889145,,,19.792461,22.477972,27.901461,24.668646,
C3L-03378,27.756237,,28.530226,26.229378,,24.465975,25.217638,22.73788,,,...,,24.775112,22.639795,19.665135,,19.745192,22.552668,27.776796,24.37994,19.865736
C3L-04354,27.754403,,28.416281,25.578132,,24.360845,25.045709,21.652455,,,...,,24.582068,22.349202,,,20.234859,23.023475,27.838958,24.975694,21.605573


In [118]:
# Find top 5 differentially expressed protein - alc-only vs. cig-only
np.abs(alc_protein.mean() - 
       cig_protein.mean()).sort_values(ascending=False)
# GAGE13        6.580280
# DMRTA2        3.777250
# BPIFA2        3.715079
# TNFSF18       3.452488
# HIST2H2AA4    3.361846

Name
GAGE13        6.580280
DMRTA2        3.777250
BPIFA2        3.715079
TNFSF18       3.452488
HIST2H2AA4    3.361846
                ...   
ZRANB3             NaN
ZSCAN25            NaN
ZSCAN31            NaN
ZSWIM9             NaN
ZXDA               NaN
Length: 11700, dtype: float64

In [119]:
# Find top 5 differentially expressed protein - both vs. cig-only
np.abs(both_protein.mean() - 
       cig_protein.mean()).sort_values(ascending=False)
# GAGE13        4.418050
# LCE2B         4.161238
# PHF24         3.351066
# HIST2H2AA4    3.308987
# CLEC18B       2.780023

Name
GAGE13        4.418050
LCE2B         4.161238
PHF24         3.351066
HIST2H2AA4    3.308987
CLEC18B       2.780023
                ...   
ZNF836             NaN
ZNF862             NaN
ZRANB1             NaN
ZSCAN31            NaN
ZXDA               NaN
Length: 11700, dtype: float64

In [120]:
# Find top 5 differentially expressed protein - both vs. alc-only
np.abs(both_protein.mean() - 
       alc_protein.mean()).sort_values(ascending=False)
# AKR1B15    6.425557
# KRT71      4.560080
# BPIFA2     3.400785
# SPINK6     3.245316
# NTS        2.999724

Name
AKR1B15    6.425557
KRT71      4.560080
BPIFA2     3.400785
SPINK6     3.245316
NTS        2.999724
             ...   
ZRANB3          NaN
ZSCAN25         NaN
ZSCAN31         NaN
ZSWIM9          NaN
ZXDA            NaN
Length: 11700, dtype: float64

In [None]:
# combine protein and rna data to predict smoking/alcohol usage?