# miRNA-target relationships in cancer

In [1]:
import csv
import datetime
import datalab.bigquery as bq
import google.datalab.storage as storage
import io
import logging
import math as m
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import time

## Utils

### Function definitions

In [2]:
def read_file(bucket, filepath, **kwargs):
  """Load a delimited text object from a storage bucket into a DataFrame.

  The object is fetched with the `%gcs read` line magic, which is asked to
  place the content in the notebook variable `csv_data`; that content is then
  wrapped in a `BytesIO` and parsed by `pd.read_csv`.

  Args:
    bucket: bucket handle exposing `.object(filepath).uri`.
    filepath: path of the object inside the bucket.
    **kwargs: forwarded unchanged to `pd.read_csv`.

  Returns:
    The DataFrame produced by `pd.read_csv`.
  """
  object_uri = bucket.object(filepath).uri
  magic_args = 'read --object ' + object_uri + ' --variable csv_data'
  get_ipython().run_line_magic('gcs', magic_args)
  raw_buffer = io.BytesIO(csv_data)
  return pd.read_csv(raw_buffer, **kwargs)

In [3]:
def write_df_to_csv(df, index_label, csv_filepath):
  """Serialise `df` to CSV and copy the file to `csv_filepath` with gsutil.

  Args:
    df: DataFrame to write.
    index_label: header written for the index column (passed to `to_csv`).
    csv_filepath: copy destination, e.g. a `gs://bucket/path.csv` URL.

  Raises:
    subprocess.CalledProcessError: if `gsutil cp` exits with a non-zero status.
    OSError: if the `gsutil` executable cannot be started.
  """
  import os
  import shutil
  import subprocess
  import tempfile

  # A private scratch directory avoids clobbering a shared 'temp.csv' in the
  # working directory and guarantees the (possibly huge) file is removed.
  scratch_dir = tempfile.mkdtemp()
  try:
    local_path = os.path.join(scratch_dir, 'temp.csv')
    df.to_csv(local_path, index_label=index_label)
    # An argument list (no shell) passes the destination verbatim, and a failed
    # copy raises instead of being silently ignored.
    try:
      output = subprocess.check_output(['gsutil', 'cp', local_path, csv_filepath],
                                       stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
      print(err.output.decode('utf-8', 'replace'))
      raise
    print(output.decode('utf-8', 'replace'))
  finally:
    shutil.rmtree(scratch_dir, ignore_errors=True)

In [18]:
def write_series_to_csv(series, index_label, csv_filepath):
  """Serialise `series` to a header-less CSV and copy it to `csv_filepath`.

  The file is written without a header row on every pandas version: the
  default of `Series.to_csv(header=...)` changed between pandas releases, and
  this notebook reads the file back with `header=None`. Because no header is
  written, `index_label` has no effect on the output; it is still accepted and
  forwarded so existing callers keep working.

  Args:
    series: Series to write (index in the first column, values in the second).
    index_label: forwarded to `Series.to_csv`; unused while no header is written.
    csv_filepath: copy destination, e.g. a `gs://bucket/path.csv` URL.

  Raises:
    subprocess.CalledProcessError: if `gsutil cp` exits with a non-zero status.
    OSError: if the `gsutil` executable cannot be started.
  """
  import os
  import shutil
  import subprocess
  import tempfile

  # A private scratch directory avoids clobbering a shared 'temp.csv' in the
  # working directory and guarantees the file is removed afterwards.
  scratch_dir = tempfile.mkdtemp()
  try:
    local_path = os.path.join(scratch_dir, 'temp.csv')
    series.to_csv(local_path, header=False, index_label=index_label)
    # An argument list (no shell) passes the destination verbatim, and a failed
    # copy raises instead of being silently ignored.
    try:
      output = subprocess.check_output(['gsutil', 'cp', local_path, csv_filepath],
                                       stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
      print(err.output.decode('utf-8', 'replace'))
      raise
    print(output.decode('utf-8', 'replace'))
  finally:
    shutil.rmtree(scratch_dir, ignore_errors=True)

### Set up logging

In [4]:
# Root logger (getLogger() without a name); configured by setup_file_logger() and used by log().
logger = logging.getLogger()

In [5]:
def setup_file_logger(log_file):
    """Route the module-level `logger` to `log_file` at INFO level.

    Handlers already attached to `logger` are detached and closed first, so
    calling this again (e.g. re-executing the notebook cell) neither duplicates
    log lines nor leaks the previously opened log file.

    Args:
        log_file: path of the file that receives the log records.
    """
    # Iterate over a copy because the handler list is mutated in the loop.
    for stale_handler in logger.handlers[:]:
        logger.removeHandler(stale_handler)
        stale_handler.close()
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)
    
def log(message):
  """Print `message` and also record it on the module-level `logger` at INFO level."""
  print(message)
  logger.info(message)

# Send log records to 'mirtar.log' (a relative path).
setup_file_logger('mirtar.log')

## Preprocess data

In [6]:
# Handle to the 'yfl-mirna' storage bucket; passed to the read_file() calls in this notebook.
bucket = storage.Bucket('yfl-mirna')

### miRTar data

#### Read miRTar data

In [12]:
# miRTarBase table read from the bucket; the file is decoded as ISO-8859-1.
mirtar_data = read_file(bucket, 'data/miRTar/miRTarBase_MTI.csv', encoding='ISO-8859-1')

In [13]:
# Map the miRTarBase column headers to identifier-style names.
colnames_from_to = {
    'miRTarBase ID': 'mirtarbaseID',
    'Species (miRNA)': 'miRNA_species',
    'Target Gene': 'targetID',
    'Target Gene (Entrez ID)': 'target_entrezID',
    'Species (Target Gene)': 'target_species',
    'Support Type': 'support_type',
    'References (PMID)': 'PMID_references',
}
# Index the table by miRNA name, then keep only rows whose target is human.
mirtar_data = mirtar_data.rename(columns=colnames_from_to).set_index('miRNA')
is_human_target = mirtar_data['target_species'] == 'Homo sapiens'
mirtar_data = mirtar_data[is_human_target]

In [14]:
# Number of rows whose 'targetID' value is missing.
mirtar_data['targetID'].isna().sum()

0

In [15]:
# Number of rows whose 'target_entrezID' value is missing.
mirtar_data['target_entrezID'].isna().sum()

0

#### Optional, good to know: Identify any inconsistent target IDs in miRTarBase

In [48]:
# targetID -> set of Entrez IDs paired with it in miRTarBase (filled by the loop in the next cell).
mirtarbase_target_ID_to_entrezIDs = {}
# Entrez ID -> set of targetIDs paired with it in miRTarBase (filled by the same loop).
mirtarbase_target_entrezID_to_IDs = {}

In [None]:
# Record every (targetID, Entrez ID) pairing seen in miRTarBase, in both directions.
# The two columns are iterated directly: iterrows() would build a Series per row,
# which is needlessly slow for a table of this size.
for target_symbol, target_entrez in zip(mirtar_data['targetID'], mirtar_data['target_entrezID']):
    mirtarbase_target_ID_to_entrezIDs.setdefault(target_symbol, set()).add(target_entrez)
    mirtarbase_target_entrezID_to_IDs.setdefault(target_entrez, set()).add(target_symbol)

In [None]:
# targetIDs that miRTarBase pairs with more than one Entrez ID.
{
    targetID: entrezIDs
    for targetID, entrezIDs in mirtarbase_target_ID_to_entrezIDs.items()
    if len(entrezIDs) > 1
}

In [None]:
# Entrez IDs that miRTarBase pairs with more than one targetID.
{
    entrezID: targetIDs
    for entrezID, targetIDs in mirtarbase_target_entrezID_to_IDs.items()
    if len(targetIDs) > 1
}

### mRNA data

#### Read sample mRNA expression data

In [11]:
# Tab-separated mRNA expression table read from the bucket.
mRNA_data = read_file(bucket, 'data/mRNA/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv', delimiter='\t')

In [12]:
# Rename the 'gene_id' column to 'mRNA' and use it as the row index.
mRNA_data = mRNA_data.rename(index=str, columns={'gene_id': 'mRNA'}).set_index('mRNA')

In [13]:
# Each row label has the form '<ID>|<entrezID>'; split it into its two parts.
mRNA_ID_pairs = mRNA_data.index.map(lambda gene_label: gene_label.split('|'))
mRNA_IDs = mRNA_ID_pairs.map(lambda id_pair: id_pair[0])
mRNA_entrezIDs = mRNA_ID_pairs.map(lambda id_pair: int(id_pair[1]))

In [14]:
# Lookup table keyed by Entrez ID: the original row label and the ID part of that label.
mRNA_entrezID_to_IDs = pd.DataFrame(
    index=mRNA_ID_pairs.map(lambda id_pair: int(id_pair[1])),
)
mRNA_entrezID_to_IDs['mRNA_data_ID'] = mRNA_data.index
mRNA_entrezID_to_IDs['ID'] = mRNA_IDs

In [15]:
# Lookup table keyed by the ID part of the label: the original row label and the Entrez ID.
mRNA_ID_to_IDs = pd.DataFrame(
    index=mRNA_ID_pairs.map(lambda id_pair: id_pair[0]),
)
mRNA_ID_to_IDs['mRNA_data_ID'] = mRNA_data.index
mRNA_ID_to_IDs['entrezID'] = mRNA_entrezIDs

In [16]:
# Re-key the expression rows by Entrez ID; the index is named 'entrezID'.
mRNA_data.index = mRNA_entrezIDs.rename('entrezID')

#### Transpose mRNA_data to get samples as rows

In [17]:
sample_mRNAs = mRNA_data.T
# Keep only the first four '-'-separated fields of each sample label.
sample_mRNAs.index = sample_mRNAs.index.map(lambda barcode: '-'.join(barcode.split('-')[:4]))
# Several labels can collapse to the same truncated value; keep the first row of each.
sample_mRNAs = (
    sample_mRNAs
    .reset_index()
    .drop_duplicates(subset='index', keep='first')
    .set_index('index')
)

#### Optional, good to know: Check for mRNAs with ambiguous IDs

In [94]:
def add_to_dict(d, key, val):
    """Add `val` to the set stored under `key` in `d`, creating the set if needed.

    Args:
        d: dict mapping keys to sets; modified in place.
        key: key whose set receives `val`.
        val: hashable value to add.
    """
    d.setdefault(key, set()).add(val)

In [95]:
# mRNA_data_ID -> set of miRTarBase 'targetID|entrezID' strings that disagree with it.
ambiguous_mRNAs = {}

In [96]:
# Entrez IDs of miRTarBase targets matched in the mRNA data by neither Entrez ID nor targetID.
tars_not_in_data = set()

In [None]:
# Loop-invariant lookups, built once. Dict membership is O(1), whereas `in` on the
# NumPy index arrays is a linear scan repeated for every miRTarBase row.
mRNA_entrezID_to_ID = dict(zip(mRNA_entrezID_to_IDs.index, mRNA_entrezID_to_IDs['ID']))
mRNA_entrezID_to_data_ID = dict(zip(mRNA_entrezID_to_IDs.index, mRNA_entrezID_to_IDs['mRNA_data_ID']))
# One ID can label several expression rows, so keep every matching row label.
mRNA_ID_to_data_IDs = {}
for mRNA_ID, mRNA_data_ID in zip(mRNA_ID_to_IDs.index, mRNA_ID_to_IDs['mRNA_data_ID']):
    mRNA_ID_to_data_IDs.setdefault(mRNA_ID, []).append(mRNA_data_ID)

for targetID, target_entrezID in zip(mirtar_data['targetID'], mirtar_data['target_entrezID']):
    row_IDs = targetID + '|' + str(target_entrezID)
    if target_entrezID in mRNA_entrezID_to_ID:
        # Same Entrez ID but a different ID in the expression data.
        if targetID != mRNA_entrezID_to_ID[target_entrezID]:
            add_to_dict(ambiguous_mRNAs, mRNA_entrezID_to_data_ID[target_entrezID], row_IDs)
    elif targetID in mRNA_ID_to_data_IDs:
        # The Entrez ID is absent from the expression data here, so it necessarily
        # differs from the Entrez ID of every row that carries this targetID.
        for mRNA_data_ID in mRNA_ID_to_data_IDs[targetID]:
            add_to_dict(ambiguous_mRNAs, mRNA_data_ID, row_IDs)
    else:
        tars_not_in_data.add(target_entrezID)

In [None]:
# Number of distinct target Entrez IDs collected in tars_not_in_data.
len(tars_not_in_data)

In [None]:
# Total number of flagged 'targetID|entrezID' strings, summed over all mRNAs.
# (len() of the values view would only repeat the key count shown in the next cell.)
sum(len(row_ID_set) for row_ID_set in ambiguous_mRNAs.values())

In [None]:
# Number of expression-data mRNAs with at least one conflicting miRTarBase entry.
len(ambiguous_mRNAs)

### miRNA data

#### Read sample miRNA expression data

In [35]:
# miRNA expression table read from the bucket (comma-separated, read_csv defaults).
miRNA_data = read_file(bucket, 'data/miRNA/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.csv')

In [36]:
# Rename the 'Genes' column to 'miRNA' and use it as the row index.
miRNA_data = miRNA_data.rename(index=str, columns={'Genes': 'miRNA'}).set_index('miRNA')
# Replace the textual 'Correction' column with a boolean 'Corrected' column.
miRNA_data['Corrected'] = miRNA_data.pop('Correction') == 'Corrected'

In [37]:
# Move the 'Corrected' flag out of the expression table into its own one-column frame.
miRNA_corrections = miRNA_data.pop('Corrected').to_frame()

In [38]:
# True when no miRNA label occurs more than once in the index.
miRNA_data.index.is_unique

True

#### Transpose miRNA_data to get samples as rows

In [22]:
sample_miRNAs = miRNA_data.T
# Keep only the first four '-'-separated fields of each sample label.
sample_miRNAs.index = sample_miRNAs.index.map(lambda barcode: '-'.join(barcode.split('-')[:4]))
# Several labels can collapse to the same truncated value; keep the first row of each.
sample_miRNAs = (
    sample_miRNAs
    .reset_index()
    .drop_duplicates(subset='index', keep='first')
    .set_index('index')
)

In [23]:
# Number of miRNA columns; used later to slice the miRNA-by-mRNA block out of correlation matrices.
miRNAs_num = sample_miRNAs.shape[1]

### Compute number of targets for miRNAs and write to file

In [45]:
# Rows per miRNA in miRTarBase.
# NOTE(review): this counts table rows; if one miRNA-target pair can span several
# rows, it over-counts distinct targets - confirm against the miRTarBase layout.
miRNA_target_counts = mirtar_data.groupby('miRNA').size()
# Keep only miRNAs that also appear in the expression data.
shared_miRNAs = miRNA_data.index.intersection(miRNA_target_counts.index)
miRNA_target_counts = miRNA_target_counts.loc[shared_miRNAs]
# index_label is the header for the index column, so pass the label itself
# rather than the whole Index object.
write_series_to_csv(miRNA_target_counts, 'miRNA', 'gs://yfl-mirna/data/miRTar/miRNA-target-counts.csv')

In [26]:
# Read the header-less counts file back, name its two columns, and index it by miRNA.
miRNA_target_counts = read_file(bucket, 'data/miRTar/miRNA-target-counts.csv', header=None)
miRNA_target_counts = (
    miRNA_target_counts
    .rename(columns={0: 'miRNA', 1: 'targets_count'})
    .set_index('miRNA')
)

### Sample metadata

In [24]:
# Tab-separated sample information table read from the bucket.
sample_metadata = read_file(bucket, 'data/sample/PanCanAtlas_miRNA_sample_information_list.txt', delimiter='\t')

In [25]:
# Rename the 'id' column to 'sample' and use it as the row index.
sample_metadata = sample_metadata.rename(index=str, columns={'id': 'sample'}).set_index('sample')
# Keep only the first four '-'-separated fields of each sample label.
sample_metadata.index = sample_metadata.index.map(lambda barcode: '-'.join(barcode.split('-')[:4]))
# Several labels can collapse to the same truncated value; keep the first row of each.
sample_metadata = (
    sample_metadata
    .reset_index()
    .drop_duplicates(subset='sample', keep='first')
    .set_index('sample')
)

### Merging mRNA and miRNA expression data and sample metadata

In [29]:
# Join metadata, mRNA and miRNA tables on the sample index (merge's default inner join).
samples = (
    sample_metadata
    .merge(sample_mRNAs, left_index=True, right_index=True)
    .merge(sample_miRNAs, left_index=True, right_index=True)
)

## Definitions

### Data

#### Considering only samples of type 1 (`Sample_Type == 1`)

In [32]:
# Rows whose Sample_Type is 1.
is_type1_sample = samples['Sample_Type'] == 1
type1_samples = pd.DataFrame(samples[is_type1_sample])
# Expression values only: drop every column that came from the sample metadata.
type1_sample_mirtars = type1_samples.drop(columns=sample_metadata.columns)
# Transposed view of the same values.
type1_mirtar_data = type1_sample_mirtars.T

In [33]:
# log2(x + 2) of every expression value, computed as one vectorised NumPy call
# instead of a Python-level math.log per cell.
# Note: values <= -2 now yield NaN/-inf (with a NumPy warning) rather than a ValueError.
type1_sample_disease_mirtars = np.log2(type1_samples.drop(columns=sample_metadata.columns) + 2)
type1_sample_disease_mirtars['Disease'] = type1_samples.Disease

In [29]:
# Upload the transformed expression table (with its 'Disease' column); the index column is labelled 'sample'.
write_df_to_csv(type1_sample_disease_mirtars, 'sample', 'gs://yfl-mirna/data/miRTar/type1-sample_disease_miRNAmRNA-exprs.csv')

Copying file://temp.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\ [1 files][  3.0 GiB/  3.0 GiB]   24.8 MiB/s                                   
Operation completed over 1 objects/3.0 GiB.                                      


#### Log transform

In [36]:
# log2(x + 2) of every expression value, computed as one vectorised NumPy call
# instead of a Python-level math.log per cell.
# Note: values <= -2 now yield NaN/-inf (with a NumPy warning) rather than a ValueError.
type1_sample_mirtars_log = np.log2(type1_sample_mirtars + 2)

#### Number of samples for each tumor type

In [34]:
# Sample count per 'Disease' value, ordered by the disease label.
tumor_types_and_counts = type1_samples['Disease'].value_counts().sort_index()
tumor_types, tumor_type_counts = tumor_types_and_counts.index, tumor_types_and_counts.values
tumor_types_num = len(tumor_types)

#### Samples grouped by tumor type

In [37]:
# Drop the metadata columns that are not the grouping key, then group by 'Disease'.
non_grouping_metadata = ['Sample_Type', 'Protocol', 'Platform']
sample_mirtars_groupedby_tumor_type = (
    type1_samples
    .drop(columns=non_grouping_metadata)
    .groupby('Disease')
)

In [38]:
# Group the log-transformed values by each sample's 'Disease' label, taken from
# type1_samples and aligned on the shared sample index. Grouping by the external
# Series avoids adding a 'Disease' column and deleting it again while the GroupBy
# object still references the mutated frame.
sample_mirtars_log_groupedby_tumor_type = type1_sample_mirtars_log.groupby(type1_samples.Disease)

## Variation in miRNAs and mRNAs

### Pearson and Spearman correlations

#### Across all samples

In [32]:
# Missing-value count per mRNA row, then the labels of rows with none missing.
mRNA_na_counts = mRNA_data.isnull().sum(axis=1)
mRNAs_nomissing = mRNA_na_counts.index[(mRNA_na_counts == 0).values]

In [38]:
# (rows, columns) of the mRNA expression table.
mRNA_data.shape

(20531, 11069)

In [34]:
# Number of mRNAs without any missing value.
mRNAs_nomissing.size

16335

In [33]:
# Correlation matrix over all miRNA and mRNA columns (columns are the variables).
mirtar_corrs_np = np.corrcoef(
    type1_sample_mirtars[sample_miRNAs.columns].values,
    type1_sample_mirtars[mRNAs_nomissing].values,
    rowvar=False,
)
# Keep only the miRNA-by-mRNA block: rows are miRNAs, columns are mRNAs.
mirtar_corrs = pd.DataFrame(
    mirtar_corrs_np[:miRNAs_num, miRNAs_num:],
    index=sample_miRNAs.columns,
    columns=mRNAs_nomissing,
)
#log('miRTar corrs')
#write_df_to_csv(mirtar_corrs, 'miRNA', 'gs://yfl-mirna/explore/miRTar/pearson-corrs/data/mirtar-corrs.csv')

In [123]:
# Correlation matrix over all log-transformed miRNA and mRNA columns (columns are the variables).
mirtar_log_corrs_np = np.corrcoef(
    type1_sample_mirtars_log[sample_miRNAs.columns].values,
    type1_sample_mirtars_log[mRNAs_nomissing].values,
    rowvar=False,
)
# Keep only the miRNA-by-mRNA block: rows are miRNAs, columns are mRNAs.
mirtar_log_corrs = pd.DataFrame(
    mirtar_log_corrs_np[:miRNAs_num, miRNAs_num:],
    index=sample_miRNAs.columns,
    columns=mRNAs_nomissing,
)
log('logged miRTar corrs')
write_df_to_csv(mirtar_log_corrs, 'miRNA', 'gs://yfl-mirna/explore/miRTar/pearson-corrs/data/mirtar-log-corrs.csv')

logged miRTar corrs
Copying file://temp.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][238.3 MiB/238.3 MiB]                                                
Operation completed over 1 objects/238.3 MiB.                                    


In [80]:
# miRNA columns followed by the mRNA columns without missing values, joined on the sample index.
type1_sample_mirtars_nomissing = type1_sample_mirtars[sample_miRNAs.columns].merge(
    type1_sample_mirtars[mRNAs_nomissing],
    left_index=True,
    right_index=True,
)

In [114]:
# Spearman correlations and p-values over the columns of the combined miRNA + mRNA matrix.
spearman_results = stats.spearmanr(type1_sample_mirtars_nomissing.values)

In [None]:
# Keep only the miRNA-by-mRNA block of each result matrix: rows are miRNAs, columns are mRNAs.
mirtar_spearman_corrs = pd.DataFrame(
    spearman_results.correlation[:miRNAs_num, miRNAs_num:],
    index=sample_miRNAs.columns,
    columns=mRNAs_nomissing,
)
mirtar_spearman_corr_pvals = pd.DataFrame(
    spearman_results.pvalue[:miRNAs_num, miRNAs_num:],
    index=sample_miRNAs.columns,
    columns=mRNAs_nomissing,
)
log('miRTar Spearman corrs')
write_df_to_csv(mirtar_spearman_corrs, 'miRNA', 'gs://yfl-mirna/explore/miRTar/spearman-corrs/data/mirtar-spearman-corrs.csv')
write_df_to_csv(mirtar_spearman_corr_pvals, 'miRNA', 'gs://yfl-mirna/explore/miRTar/spearman-corrs/data/mirtar-spearman-corr-pvals.csv')

#### Within cancer types

In [37]:
# Single-group preview of the filtering done inside the per-cancer-type loops.
# `type_samples` is bound explicitly to the first group so that this cell also
# runs in a fresh top-to-bottom execution, where no loop has defined it yet.
cancer_type, type_samples = next(iter(sample_mirtars_groupedby_tumor_type))
cancer_type_mRNA_data = type_samples[sample_mRNAs.columns].T
mRNA_na_counts = cancer_type_mRNA_data.isnull().sum(axis=1)
mRNAs_nomissing = mRNA_na_counts[mRNA_na_counts == 0].index

In [None]:
for cancer_type, type_samples in sample_mirtars_groupedby_tumor_type:
  # Restrict to mRNAs that have a value in every sample of this cancer type.
  cancer_type_mRNA_data = type_samples[sample_mRNAs.columns].T
  mRNA_na_counts = cancer_type_mRNA_data.isnull().sum(axis=1)
  mRNAs_nomissing = mRNA_na_counts.index[(mRNA_na_counts == 0).values]

  # Pearson correlations: keep the miRNA-by-mRNA block of the full matrix.
  mirtar_corrs_np = np.corrcoef(
      type_samples[sample_miRNAs.columns].values,
      type_samples[mRNAs_nomissing].values,
      rowvar=False,
  )
  mirtar_corrs = pd.DataFrame(
      mirtar_corrs_np[:miRNAs_num, miRNAs_num:],
      index=sample_miRNAs.columns,
      columns=mRNAs_nomissing,
  )
  log(cancer_type + ' miRTar corrs')
  write_df_to_csv(mirtar_corrs, 'miRNA', 'gs://yfl-mirna/explore/miRTar/pearson-corrs/data/mirtar-corrs_' + cancer_type + '.csv')

  # Spearman correlations and p-values over the same columns.
  type1_sample_mirtars_nomissing = type_samples[sample_miRNAs.columns].merge(
      type_samples[mRNAs_nomissing],
      left_index=True,
      right_index=True,
  )
  spearman_results = stats.spearmanr(type1_sample_mirtars_nomissing.values)
  mirtar_spearman_corrs = pd.DataFrame(
      spearman_results.correlation[:miRNAs_num, miRNAs_num:],
      index=sample_miRNAs.columns,
      columns=mRNAs_nomissing,
  )
  mirtar_spearman_corr_pvals = pd.DataFrame(
      spearman_results.pvalue[:miRNAs_num, miRNAs_num:],
      index=sample_miRNAs.columns,
      columns=mRNAs_nomissing,
  )
  log(cancer_type + ' miRTar Spearman corrs')
  write_df_to_csv(mirtar_spearman_corrs, 'miRNA', 'gs://yfl-mirna/explore/miRTar/spearman-corrs/data/mirtar-spearman-corrs_' + cancer_type + '.csv')
  write_df_to_csv(mirtar_spearman_corr_pvals, 'miRNA', 'gs://yfl-mirna/explore/miRTar/spearman-corrs/data/mirtar-spearman-corr-pvals_' + cancer_type + '.csv')

#### Pearson correlations of log-transformed data within cancer types

In [None]:
for cancer_type, type_samples in sample_mirtars_log_groupedby_tumor_type:
  # Restrict to mRNAs that have a value in every sample of this cancer type.
  cancer_type_mRNA_data = type_samples[sample_mRNAs.columns].T
  mRNA_na_counts = cancer_type_mRNA_data.isnull().sum(axis=1)
  mRNAs_nomissing = mRNA_na_counts[mRNA_na_counts == 0].index
  # Pearson correlations of the log-transformed values; keep the miRNA-by-mRNA block.
  mirtar_log_corrs_np = np.corrcoef(type_samples[sample_miRNAs.columns].values, type_samples[mRNAs_nomissing].values, rowvar=False)
  mirtar_log_corrs = pd.DataFrame(mirtar_log_corrs_np[:miRNAs_num, miRNAs_num:], sample_miRNAs.columns, mRNAs_nomissing)
  # Leading space added so the message reads e.g. 'ACC logged ...' instead of
  # 'ACClogged ...', matching the other per-cancer-type log messages.
  log(cancer_type + ' logged miRTar corrs')
  write_df_to_csv(mirtar_log_corrs, 'miRNA', 'gs://yfl-mirna/explore/miRTar/pearson-corrs/data/mirtar-log-corrs_' + cancer_type + '.csv')

  c /= stddev[:, None]
  c /= stddev[None, :]


ACClogged miRTar corrs
Copying file://temp.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\ [1 files][284.8 MiB/284.8 MiB]                                                
Operation completed over 1 objects/284.8 MiB.                                    
BLCAlogged miRTar corrs
Copying file://temp.csv [Content-Type=text/csv]...
==

## Appendix for possible future use

#### Drop mRNAs not in both dataset and miRTarBase

In [26]:
# Entrez IDs present both in miRTarBase and in the expression data. They are read
# straight from the table, so this cell does not depend on the optional cell
# that fills mirtarbase_target_entrezID_to_IDs.
mRNA_mirtar_data_intersection = set(mirtar_data.target_entrezID).intersection(mRNA_entrezIDs)
mirtar_data = mirtar_data[mirtar_data.target_entrezID.isin(mRNA_mirtar_data_intersection)]
# 'entrezID' is the index of mRNA_data (set via set_index earlier), not a column,
# so the filter has to go through the index.
mRNA_data = mRNA_data[mRNA_data.index.isin(mRNA_mirtar_data_intersection)]

#### Drop miRNAs not in both dataset and miRTarBase

In [34]:
# miRNA names that occur both in miRTarBase and in the expression data.
mirtar_miRNAs = mirtar_data.index.unique()
expression_miRNAs = miRNA_data.index.unique()
miRNA_mirtar_data_intersection = mirtar_miRNAs.intersection(expression_miRNAs).tolist()

In [35]:
# Number of miRNAs present in both miRTarBase and the expression data.
len(miRNA_mirtar_data_intersection)

738

In [58]:
# Keep only rows whose miRNA occurs in both tables.
mirtar_data = mirtar_data.loc[mirtar_data.index.isin(miRNA_mirtar_data_intersection)]
miRNA_data = miRNA_data.loc[miRNA_data.index.isin(miRNA_mirtar_data_intersection)]