In [80]:
# load libraries
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats

In [81]:
# input locations (these two names hold file paths, not dataframes)
train_df = "train.csv"
train_targets = "train_targets.csv"

# expression table: drop the first column (cell line labels), keep the rest
X = pd.read_csv(train_df).iloc[:, 1:]

# drug response: only the AAC column is used downstream
y = pd.read_csv(train_targets).loc[:, 'AAC']

In [82]:
# sanity check: features and targets should have the same number of rows
print(X.shape, y.shape, sep='\n')

(742, 19920)
(742,)


In [83]:
# Pearson correlation between each gene's expression and drug response (y).
#
# A gene whose expression takes a single value has zero variance, so its
# correlation is undefined. Skip such columns explicitly instead of letting the
# division by a zero standard deviation happen inside the correlation routine;
# they are kept in the result with NaN, so `correlations` still has one row per
# column of X (NaN never passes a `>` / `<` threshold comparison).
is_variable = X.nunique() > 1
gene_corr = X.loc[:, is_variable].corrwith(y)

# one row per gene, single 'Correlation' column, same row order as X.columns
correlations = gene_corr.reindex(X.columns).to_frame(name='Correlation')
print(correlations.head())

  c /= stddev[:, None]
  c /= stddev[None, :]


         Correlation
A1BG       -0.163252
A1CF       -0.057703
A2M        -0.048846
A2ML1       0.169574
A3GALT2    -0.116942


In [84]:
# correlation magnitude a gene must exceed to count as associated with response
thres = 0.2

# tally univariable associations on each side of the threshold
gene_corr = correlations['Correlation']
num_pred_sensitivity = gene_corr.gt(thres).sum()    # r >  thres
num_pred_resistance = gene_corr.lt(-thres).sum()    # r < -thres

print('Selected threshold:', thres)
print('Num genes with correlation above threshold:', num_pred_sensitivity)
print('Num genes with correlation below threshold:', num_pred_resistance)
print('Total num genes:', num_pred_sensitivity + num_pred_resistance)

Selected threshold: 0.2
Num genes with correlation above threshold: 291
Num genes with correlation below threshold: 185
Total num genes: 476


In [85]:
# genes whose |correlation| with response exceeds the selected threshold
passes_thres = correlations['Correlation'].abs().gt(thres)
genes_keep = correlations.index[passes_thres.to_numpy()]
print(len(genes_keep))

# restrict the training dataframe to those genes (all rows kept)
X_subset = X.loc[:, genes_keep]
print(X_subset.shape)

476
(742, 476)


In [86]:
# gene-by-gene Pearson correlation matrix over the genes kept so far
corr_mat = X_subset.corr(min_periods=1, method='pearson')
corr_mat.shape

(476, 476)

In [87]:
# |Pearson r| above which two genes are treated as highly correlated.
# Deliberately NOT stored in `thres`: that name holds the univariable filter
# threshold, and overwriting it here would change the result of the earlier
# filter cells if they are re-run.
pair_thres = 0.8

# set diagonal to 0 so a gene is never reported as correlated with itself.
# Work on an explicit copy: the array behind `.values` may be read-only
# (pandas Copy-on-Write) and is not guaranteed to be a view of the frame.
offdiag = corr_mat.to_numpy(copy=True)
np.fill_diagonal(offdiag, 0)
corr_mat = pd.DataFrame(offdiag, index=corr_mat.index, columns=corr_mat.columns)

# boolean matrix: True where a pair of genes exceeds the threshold
corr_pairs = (corr_mat.abs() > pair_thres)
corr_pairs.head()

Unnamed: 0,AC003688.2,AC005324.3,AC005943.1,AC008687.8,AC008758.6,AC011473.4,ACSF2,ADGRF1,ADGRF4,ADRB2,...,ZEB1,ZHX3,ZNF165,ZNF428,ZNF441,ZNF576,ZNF668,ZNF709,ZNF775,ZSCAN26
AC003688.2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AC005324.3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AC005943.1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AC008687.8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AC008758.6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [88]:
# Keep only the strict upper triangle so each unordered pair (i, j) with i < j
# is considered exactly once.
upper_pairs = np.triu(corr_pairs.to_numpy(dtype=bool), k=1)

# For every highly correlated pair, flag the first gene of the pair (row i) for
# removal; the second gene (column j) stays unless it is itself the first gene
# of another flagged pair.
correlated_genes = set(corr_mat.columns[upper_pairs.any(axis=1)])

print('Highly correlated genes:', correlated_genes)
print('Num correlated genes:', len(correlated_genes))

Highly correlated genes: {'MT-CYB', 'ARHGAP8', 'MT-ATP6', 'MT-ND1', 'MT-ND5', 'MT-ND2', 'MT-ND4L', 'ARHGEF35', 'MT-CO3', 'KRT14', 'MT-ND3', 'AL591806.4', 'MT-CO1', 'MT-CO2', 'DUOX1', 'MT-ND4', 'S100A8', 'MT-ATP8', 'ANXA8'}
Num correlated genes: 19


In [89]:
print('Original number of genes (columns):', X_subset.shape[1])

# remove correlated genes
# errors='ignore' keeps this cell re-runnable: X_subset is overwritten here, so
# on a second run the flagged columns are already gone and a plain drop would
# raise KeyError. The labels come from X_subset's own correlation matrix, so a
# missing label can only mean "already dropped".
X_subset = X_subset.drop(columns=sorted(correlated_genes), errors='ignore')
print('Number of genes (columns) remaining:', X_subset.shape[1])

Original number of genes (columns): 476
Number of genes (columns) remaining: 457


In [None]:
# save file
out_path = Path('../procdata/train_subset.csv')

# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout does not fail on the very last cell
out_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: write only the gene columns, not the row index
X_subset.to_csv(out_path, index=False)