Goal: Compute quantities of interest and save them to pickle files

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Import standard libraries
from importlib import reload
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import adjusted_mutual_info_score
import seaborn as sns
from scipy import sparse
import scipy.stats as ss
import csv
import pandas as pd
import networkx as nx
import community
import communities as com
import pickle
from collections import defaultdict
import operator
from scipy.sparse import csr_matrix
import itertools
import os.path
import math
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_mutual_info_score
from networkx.algorithms.community.kclique import k_clique_communities
import pybedtools
import time
from tqdm import tqdm
import random
import OmicsIntegrator as oi
import gseapy
from gseapy.plot import barplot, dotplot
from ortools.linear_solver import pywraplp
from matplotlib_venn import venn2, venn3
# Custom libraries
import utils as lu
import correlation_clustering as cc
# Reload modules in case of modifications
reload(lu)
reload(cc)

### Directories

In [None]:
# --- Input data directories ---
dir_genome = '/home/louiscam/projects/gpcr/data/genome_data/'  # genome data
epigenome_dir = '/home/louiscam/projects/gpcr/data/epigenome_data/'  # epigenomic data
dir_adhesome = '/home/louiscam/projects/gpcr/data/adhesome_data/'  # adhesome data
dir_htftarget = '/home/louiscam/projects/gpcr/data/tf_data/hTFtarget/'  # TF data
dir_tcr = '/home/louiscam/projects/gpcr/data/tcr_data/'  # TCR data
dir_proteins = '/home/louiscam/projects/gpcr/data/protein_families/'  # proteins of interest

# --- Processed data directories ---
dir_processed_hic = '/home/louiscam/projects/gpcr/save/processed_hic_data_dir/'  # processed HiC
processed_epigenome_data_dir = '/home/louiscam/projects/gpcr/save/processed_epigenome_data_dir/'

# --- Output directories (all paths keep their trailing slash because
# file names are appended with plain string concatenation) ---
saving_dir = '/home/louiscam/projects/gpcr/save/figures/'
saving_mtp2 = '/home/louiscam/projects/mtp2/data/adhesome_data/'

In [None]:
# Parameters identifying the HiC dataset used throughout the notebook
cell_type, quality = 'IMR90', 'MAPQGE30'
# Resolution as an integer and as a label
# (presumably base pairs per locus bin -- TODO confirm)
resol, resol_str = 250000, '250kb'

# Load data

### Load activity data

In [None]:
# Active/inactive loci
# Each pickle is read inside a context manager so its file handle is closed
# as soon as the object is loaded, instead of lingering until garbage collection.
with open(saving_dir+'all_active_loci.pkl', 'rb') as f:
    all_active_loci = pickle.load(f)
with open(saving_dir+'all_inactive_loci.pkl', 'rb') as f:
    all_inactive_loci = pickle.load(f)
with open(saving_dir+'active_adhesome_loci.pkl', 'rb') as f:
    active_adhesome_loci = pickle.load(f)
with open(saving_dir+'inactive_adhesome_loci.pkl', 'rb') as f:
    inactive_adhesome_loci = pickle.load(f)

In [None]:
# Active/inactive genes
# Each pickle is read inside a context manager so its file handle is closed
# as soon as the object is loaded, instead of lingering until garbage collection.
with open(saving_dir+'active_genes.pkl', 'rb') as f:
    active_genes = pickle.load(f)
with open(saving_dir+'inactive_genes.pkl', 'rb') as f:
    inactive_genes = pickle.load(f)
with open(saving_dir+'active_adhesome_genes.pkl', 'rb') as f:
    active_adhesome_genes = pickle.load(f)
with open(saving_dir+'inactive_adhesome_genes.pkl', 'rb') as f:
    inactive_adhesome_genes = pickle.load(f)

### Load location data

In [None]:
# Gene locations (hg19): one file holds the locations, the other the gene ids;
# both are handed to the project helper that assembles df_loc.
gene_id_filename = f'{dir_genome}chrom_hg19.name'
gene_locations_filename = f'{dir_genome}chrom_hg19.loc_canonical'
df_loc = lu.get_all_gene_locations(
    gene_locations_filename,
    gene_id_filename,
)

### Load adhesome data

In [None]:
# Adhesome data
adhesome_components_filename = dir_adhesome+'components.csv'
df_components = lu.load_adhesome_data(adhesome_components_filename)
# annotate_genes returns the annotated gene/locus table and a second value
# (presumably the genes it could not find in df_loc -- TODO confirm in utils)
adhesome_chr_loci, missing_adhesome = lu.annotate_genes(df_components[['geneSymbol']], df_loc, resol)
# Flag each adhesome gene as active or not
adhesome_chr_loci['active'] = adhesome_chr_loci['gene'].isin(active_adhesome_genes)
# Save the same table to both output directories; the context manager
# guarantees each file is flushed and closed once written.
for _out_dir in (saving_dir, saving_mtp2):
    with open(_out_dir+'adhesome_chr_loci.pkl', 'wb') as f:
        pickle.dump(adhesome_chr_loci, f)

### Load TF data

In [31]:
# TF data: read the hTFtarget table and rename its columns
htargettf_df = pd.read_csv(dir_htftarget+'hTFtarget.txt', header=0, sep='\t')
htargettf_df.columns = ['geneSymbol', 'target', 'tissue']
# Keep the records whose tissue field contains 'lung'
is_lung_record = htargettf_df['tissue'].str.contains('lung')
htargettf_lung_df = htargettf_df[is_lung_record]
# Annotate each distinct lung TF and flag whether it is an active gene
lung_tf_symbols = htargettf_lung_df[['geneSymbol']].drop_duplicates()
lungTF_chr_loci, missing_TF = lu.annotate_genes(lung_tf_symbols, df_loc, resol)
lungTF_chr_loci['active'] = lungTF_chr_loci['gene'].isin(active_genes)
# Number of distinct lung TFs with at least one active adhesome target
# (left as the last expression so the notebook displays it)
hits_active_adhesome = htargettf_lung_df['target'].isin(active_adhesome_genes)
len(np.unique(htargettf_lung_df.loc[hits_active_adhesome, 'geneSymbol'].values))

58

### Load TCR data
- https://www.wikipathways.org/index.php/Pathway:WP69
- http://www.netpath.org/netslim/tcr_pathway.html

In [10]:
# TCR data: one row per TCR component, read from CSV with a header row
tcr_df = pd.read_csv(f'{dir_tcr}TCR.csv', header=0, index_col=None)
n_tcr_components = len(tcr_df)
print(f'Total number of TCR intrinsic and associated components = {n_tcr_components}')

Total number of TCR intrinsic and associated components = 92


### Load genes related to mechanoregulation

In [11]:
# Proteins of interest
# Read the pickle inside a context manager so the file handle is closed promptly.
with open(dir_proteins+'mechanoreg_proteins.pkl', 'rb') as f:
    proteins_df = pickle.load(f)
# Annotate the proteins with genomic loci and flag whether each gene is active
proteins_chr_loci, missing_prot = lu.annotate_genes(proteins_df, df_loc, resol)
proteins_chr_loci['active'] = proteins_chr_loci['gene'].isin(active_genes)

In [12]:
# Mechanoreg protein to family
# Maps each entry of the first column of proteins_df to the entry of the second
# column on the same row (protein -> family, per the variable name). Pairing
# the two columns with zip avoids one positional .iloc lookup per cell; as
# before, a protein listed several times keeps the value from its last row.
prot2fam = dict(zip(proteins_df.iloc[:, 0], proteins_df.iloc[:, 1]))

# Define useful quantities

### List of blacklisted loci

In [22]:
# Create list of all blacklisted loci
# blacklist_dict maps each chromosome to a collection of blacklisted loci;
# the file is read inside a context manager so its handle is closed promptly.
with open(dir_processed_hic+'blacklist.pickle', 'rb') as f:
    blacklist_dict = pickle.load(f)
# Flatten into locus identifiers of the form 'chr_<chrom>_loc_<locus>'.
# The loci collection is iterated directly (no intermediate list copy);
# tqdm still reports progress per chromosome.
blacklist_list = [f'chr_{chrom}_loc_{locus}'
                  for chrom, loci_set in tqdm(blacklist_dict.items())
                  for locus in loci_set]
with open(dir_processed_hic+'blacklist_list.pkl', 'wb') as f:
    pickle.dump(blacklist_list, f)


100%|██████████| 22/22 [00:00<00:00, 30224.27it/s]


### Gene to chromosome dictionary

In [None]:
# Build dictionary matching each gene to its corresponding chromosome
# For a gene listed on several rows, the chromosome of its first row is used.
# Dropping duplicate symbols first and zipping the two columns gives the same
# mapping in a single pass, instead of re-filtering the whole table per gene.
# NOTE(review): a missing (NaN) geneSymbol would now become a NaN key rather
# than raising IndexError -- presumably df_loc has none; verify if in doubt.
gene2chrom = dict(zip(
    df_loc.drop_duplicates(subset='geneSymbol', keep='first')['geneSymbol'].values,
    df_loc.drop_duplicates(subset='geneSymbol', keep='first')['#chrom'].values,
))
# Context manager ensures the pickle is flushed and the file closed
with open(saving_dir+'gene2chrom.pkl', 'wb') as f:
    pickle.dump(gene2chrom, f)

### Adhesome genes/loci

In [18]:
# All adhesome genes
# Sorted, de-duplicated gene symbols and locus ids of the adhesome table
all_adhesome_genes = np.unique(adhesome_chr_loci['gene'])
all_adhesome_loci = np.unique(adhesome_chr_loci['locus_id'])
# Write each array inside a context manager so the file is flushed and closed
with open(saving_dir+'all_adhesome_genes.pkl', 'wb') as f:
    pickle.dump(all_adhesome_genes, f)
with open(saving_dir+'all_adhesome_loci.pkl', 'wb') as f:
    pickle.dump(all_adhesome_loci, f)

In [19]:
# Adhesome gene2locus and locus2gene dictionaries
# gene -> string array of the locus ids on that gene's rows
adhesome_gene2locus = {gene: adhesome_chr_loci[adhesome_chr_loci['gene']==gene]['locus_id'].values.astype(str)
              for gene in np.unique(adhesome_chr_loci['gene'])}
# locus id -> string array of the genes on that locus's rows
adhesome_locus2gene = {locus: adhesome_chr_loci[adhesome_chr_loci['locus_id']==locus]['gene'].values.astype(str)
              for locus in np.unique(adhesome_chr_loci['locus_id'])}
# Write each dictionary inside a context manager so the file is flushed and closed
with open(saving_dir+'adhesome_gene2locus.pkl', 'wb') as f:
    pickle.dump(adhesome_gene2locus, f)
with open(saving_dir+'adhesome_locus2gene.pkl', 'wb') as f:
    pickle.dump(adhesome_locus2gene, f)

In [20]:
# Active adhesome gene2locus and locus2gene dictionaries
active_adhesome_gene2locus = {gene: adhesome_chr_loci[(adhesome_chr_loci['gene']==gene) &
                                                      (adhesome_chr_loci['active']==True)]['locus_id'].values.astype(str)  
              for gene in np.unique(adhesome_chr_loci['gene'])}
active_adhesome_locus2gene = {locus: adhesome_chr_loci[(adhesome_chr_loci['locus_id']==locus) &
                                                      (adhesome_chr_loci['active']==True)]['gene'].values.astype(str) 
              for locus in np.unique(adhesome_chr_loci['locus_id'])}
pickle.dump(active_adhesome_gene2locus, open(saving_dir+'active_adhesome_gene2locus.pkl','wb'))
pickle.dump(active_adhesome_locus2gene, open(saving_dir+'active_adhesome_locus2gene.pkl','wb'))

### All active lung TF genes/loci

In [21]:
# Merge TF and target information
lungTF_chr_loci1 = lungTF_chr_loci.copy()
lungTF_chr_loci1.columns = ['chrom', 'chrom_int', 'loci', 'geneSymbol', 'locus_id', 'active']
tf_target_df = htargettf_lung_df.merge(lungTF_chr_loci1, on='geneSymbol')
tf_target_df = tf_target_df[['geneSymbol', 'chrom', 'chrom_int', 'loci', 'locus_id', 'active', 'target']]

In [22]:
# Select active TFs targeting active genes
active_tf_target_df = tf_target_df[(tf_target_df['active']) & (tf_target_df['target'].isin(active_genes))]
# TF -> set of its active targets, and target -> set of its active TFs.
# A single groupby pass replaces re-filtering the whole table once per key;
# keys still come out in sorted order and values are still sets.
active_lung_tf2target = {tf: set(targets)
                        for tf, targets in active_tf_target_df.groupby('geneSymbol')['target']}
active_lung_target2tf = {target: set(tfs)
                        for target, tfs in active_tf_target_df.groupby('target')['geneSymbol']}
# Save both dictionaries to both output directories; the context manager
# guarantees each file is flushed and closed once written.
for _out_dir in (saving_dir, saving_mtp2):
    with open(_out_dir+'active_lung_tf2target.pkl', 'wb') as f:
        pickle.dump(active_lung_tf2target, f)
    with open(_out_dir+'active_lung_target2tf.pkl', 'wb') as f:
        pickle.dump(active_lung_target2tf, f)

In [23]:
# Store active TFs targeting active genes
# Rows where the TF itself is active and its target is an active gene;
# the mask is computed once and reused for both the gene and the locus lists.
_active_tf_rows = tf_target_df[tf_target_df['target'].isin(active_genes) &
                               tf_target_df['active']]
active_lung_tf_genes = np.unique(_active_tf_rows['geneSymbol'])
active_lung_tf_loci = np.unique(_active_tf_rows['locus_id'])
# Write each array inside a context manager so the file is flushed and closed
with open(saving_dir+'active_lung_tf_genes.pkl', 'wb') as f:
    pickle.dump(active_lung_tf_genes, f)
with open(saving_dir+'active_lung_tf_loci.pkl', 'wb') as f:
    pickle.dump(active_lung_tf_loci, f)

### Lung TF genes/loci targeting adhesome

In [24]:
# Active TF genes targeting active adhesome
# Rows where the TF itself is active and its target is an active adhesome gene;
# the mask is computed once and reused for both the gene and the locus lists.
_active_adhesome_tf_rows = tf_target_df[tf_target_df['target'].isin(active_adhesome_genes) &
                                        tf_target_df['active']]
active_lung_adhesome_tf_genes = np.unique(_active_adhesome_tf_rows['geneSymbol'])
active_lung_adhesome_tf_loci = np.unique(_active_adhesome_tf_rows['locus_id'])
# Write each array inside a context manager so the file is flushed and closed
with open(saving_dir+'active_lung_adhesome_tf_genes.pkl', 'wb') as f:
    pickle.dump(active_lung_adhesome_tf_genes, f)
with open(saving_dir+'active_lung_adhesome_tf_loci.pkl', 'wb') as f:
    pickle.dump(active_lung_adhesome_tf_loci, f)

In [25]:
# Dictionary tf2target and target2tf
# Restrict the TF-target table to active adhesome TFs and active adhesome targets
active_adhesome_tf_target_df = tf_target_df[tf_target_df['geneSymbol'].isin(active_lung_adhesome_tf_genes) &
                                   tf_target_df['target'].isin(active_adhesome_genes)]
# TF -> array of its targets (one entry per table row, not de-duplicated)
adhesome_active_lung_tf2target = {tf: active_adhesome_tf_target_df[active_adhesome_tf_target_df['geneSymbol']==tf]['target'].values
             for tf in active_lung_adhesome_tf_genes}
# target -> array of the TFs on its rows (one entry per table row, not de-duplicated)
adhesome_active_lung_target2tf = {target: active_adhesome_tf_target_df[active_adhesome_tf_target_df['target']==target]['geneSymbol'].values
             for target in np.unique(active_adhesome_tf_target_df['target'])}
# Write each dictionary inside a context manager so the file is flushed and closed
with open(saving_dir+'adhesome_active_lung_tf2target.pkl', 'wb') as f:
    pickle.dump(adhesome_active_lung_tf2target, f)
with open(saving_dir+'adhesome_active_lung_target2tf.pkl', 'wb') as f:
    pickle.dump(adhesome_active_lung_target2tf, f)

### TCR genes/loci

In [26]:
# Annotate TCR genes with loci, then split genes and loci by activity.
# Gene activity comes from membership in active_genes, locus activity from
# membership in all_active_loci.
tcr_chr_loci, missing_tcr = lu.annotate_genes(tcr_df[['geneSymbol']], df_loc, resol)
tcr_chr_loci['gene_active'] = tcr_chr_loci['gene'].isin(active_genes)
tcr_chr_loci['locus_active'] = tcr_chr_loci['locus_id'].isin(all_active_loci)

# All TCR genes / loci
tcr_genes = tcr_chr_loci['gene'].unique()
tcr_loci = tcr_chr_loci['locus_id'].unique()

# Active vs. inactive subsets (the two flag columns are boolean masks)
gene_is_active = tcr_chr_loci['gene_active']
locus_is_active = tcr_chr_loci['locus_active']
active_tcr_genes = tcr_chr_loci.loc[gene_is_active, 'gene'].unique()
active_tcr_loci = tcr_chr_loci.loc[locus_is_active, 'locus_id'].unique()
inactive_tcr_genes = tcr_chr_loci.loc[~gene_is_active, 'gene'].unique()
inactive_tcr_loci = tcr_chr_loci.loc[~locus_is_active, 'locus_id'].unique()

# Summary
print(f'TCR genes missing from location data = {missing_tcr}')
print(f'Total number of TCR genes = {len(tcr_genes)}')
print(f'Total number of TCR loci = {len(tcr_loci)}')
print(f'Total number of active TCR genes = {len(active_tcr_genes)}')
print(f'Total number of active TCR loci = {len(active_tcr_loci)}')

TCR genes missing from location data = ['WAS', 'IKBKG']
Total number of TCR genes = 90
Total number of TCR loci = 113
Total number of active TCR genes = 62
Total number of active TCR loci = 103


In [51]:
# Save to pickle
# Each object goes to '<saving_dir><name>.pkl'. Writing inside a context
# manager guarantees every file is flushed and closed, rather than leaving
# seven write handles open for the garbage collector.
_tcr_outputs = {
    'tcr_chr_loci': tcr_chr_loci,
    'tcr_genes': tcr_genes,
    'tcr_loci': tcr_loci,
    'active_tcr_genes': active_tcr_genes,
    'active_tcr_loci': active_tcr_loci,
    'inactive_tcr_genes': inactive_tcr_genes,
    'inactive_tcr_loci': inactive_tcr_loci,
}
for _name, _obj in _tcr_outputs.items():
    with open(saving_dir+_name+'.pkl', 'wb') as f:
        pickle.dump(_obj, f)

# Jointly consider active adhesome genes, and active adhesome lung TFs

In [41]:
# Gene and locus dataframe for active adhesome genes and adhesome TFs
# Stack the active adhesome rows on top of the rows of the lung TFs that
# target the active adhesome.
adh_and_tf_chr_loci = pd.concat([adhesome_chr_loci[adhesome_chr_loci['active']],
                                 lungTF_chr_loci[lungTF_chr_loci['gene'].isin(active_lung_adhesome_tf_genes)]],
                                 axis=0)
# Save the same table to both output directories; the context manager
# guarantees each file is flushed and closed once written.
for _out_dir in (saving_dir, saving_mtp2):
    with open(_out_dir+'adh_and_tf_chr_loci.pkl', 'wb') as f:
        pickle.dump(adh_and_tf_chr_loci, f)

In [42]:
# gene2locus and locus2gene dictionaries for adhesome+adhesome TFs
# gene -> string array of the locus ids on that gene's rows
adh_and_tf_gene2locus = {gene: adh_and_tf_chr_loci[adh_and_tf_chr_loci['gene']==gene]['locus_id'].values.astype(str)
              for gene in np.unique(adh_and_tf_chr_loci['gene'])}
# locus id -> string array of the genes on that locus's rows
adh_and_tf_locus2gene = {locus: adh_and_tf_chr_loci[adh_and_tf_chr_loci['locus_id']==locus]['gene'].values.astype(str)
              for locus in np.unique(adh_and_tf_chr_loci['locus_id'])}
# Write each dictionary inside a context manager so the file is flushed and closed
with open(saving_dir+'adh_and_tf_gene2locus.pkl', 'wb') as f:
    pickle.dump(adh_and_tf_gene2locus, f)
with open(saving_dir+'adh_and_tf_locus2gene.pkl', 'wb') as f:
    pickle.dump(adh_and_tf_locus2gene, f)

# Jointly consider active adhesome genes, lung TFs and mechanoreg proteins

In [43]:
# Active adhesome genes
act_adhesome_chr_loci = adhesome_chr_loci[adhesome_chr_loci['active']==True]
with open(saving_dir+'act_adhesome_chr_loci.pkl', 'wb') as f:
    pickle.dump(act_adhesome_chr_loci, f)
# Active lung TFs
act_lung_TF_chr_loci = lungTF_chr_loci1[lungTF_chr_loci1['geneSymbol'].isin(active_lung_tf_genes)]
# Relabel with the adhesome table's column names so the three tables can be
# concatenated on identical columns
act_lung_TF_chr_loci.columns = act_adhesome_chr_loci.columns
with open(saving_dir+'act_lung_TF_chr_loci.pkl', 'wb') as f:
    pickle.dump(act_lung_TF_chr_loci, f)
# Active mechanoreg proteins
act_proteins_chr_loci = proteins_chr_loci[proteins_chr_loci['active']==True]
with open(saving_dir+'act_proteins_chr_loci.pkl', 'wb') as f:
    pickle.dump(act_proteins_chr_loci, f)

In [44]:
# Joint annotated dataframe
# Stack active adhesome genes, active lung TFs and active mechanoreg proteins
joint_chr_loci = pd.concat([act_adhesome_chr_loci,act_lung_TF_chr_loci,act_proteins_chr_loci], axis=0)
# Sorted, de-duplicated locus ids and gene symbols of the joint table
joint_loci = np.unique(joint_chr_loci['locus_id'])
joint_genes = np.unique(joint_chr_loci['gene'])
# Write each object inside a context manager so the file is flushed and closed
with open(saving_dir+'joint_chr_loci.pkl', 'wb') as f:
    pickle.dump(joint_chr_loci, f)
with open(saving_dir+'joint_loci.pkl', 'wb') as f:
    pickle.dump(joint_loci, f)
with open(saving_dir+'joint_genes.pkl', 'wb') as f:
    pickle.dump(joint_genes, f)

In [45]:
# gene2locus and locus2gene dictionaries
# gene -> string array of the locus ids on that gene's rows of the joint table
gene2locus = {gene: joint_chr_loci[joint_chr_loci['gene']==gene]['locus_id'].values.astype(str)
              for gene in np.unique(joint_chr_loci['gene'])}
# locus id -> string array of the genes on that locus's rows of the joint table
locus2gene = {locus: joint_chr_loci[joint_chr_loci['locus_id']==locus]['gene'].values.astype(str)
              for locus in np.unique(joint_chr_loci['locus_id'])}
# Write each dictionary inside a context manager so the file is flushed and closed
with open(saving_dir+'gene2locus.pkl', 'wb') as f:
    pickle.dump(gene2locus, f)
with open(saving_dir+'locus2gene.pkl', 'wb') as f:
    pickle.dump(locus2gene, f)

In [46]:
# Get HiC contact map
# Square locus-by-locus table over the joint loci. It is initialised with
# float zeros: writing non-integer Hi-C values into an integer frame is an
# incompatible-dtype assignment that pandas has deprecated (the FutureWarning
# filter at the top of the notebook would hide the warning).
# NOTE(review): assumes the stored Hi-C values are floats -- TODO confirm.
contacts_df = pd.DataFrame(0.0, index=joint_loci, columns=joint_loci)
# Loop over all chromosome pairs and fill contacts_df
chr_list = np.arange(1,23,1)
chr_pairs = list(itertools.combinations(chr_list, 2))
# Partition the joint loci by chromosome once instead of once per pair
loci_by_chr = {c: [locus for locus in joint_loci if ('chr_'+str(c)+'_') in locus]
               for c in chr_list}
for pair in tqdm(chr_pairs):
    chr1, chr2 = pair

    # Select loci on chr1 and chr2
    loci_chr1 = loci_by_chr[chr1]
    loci_chr2 = loci_by_chr[chr2]
    # Nothing to fill for this pair: skip loading its Hi-C file
    if not loci_chr1 or not loci_chr2:
        continue

    # Load HiC data for this chromosome pair (file closed right after reading)
    processed_hic_filename = 'hic_'+'chr'+str(chr1)+'_'+'chr'+str(chr2)+'_norm1_filter3'+'.pkl'
    with open(dir_processed_hic+processed_hic_filename, 'rb') as f:
        hic_chpair_df = pickle.load(f)

    # The integer after the third underscore of a locus id is the label used
    # to index the Hi-C table (rows for chr1, columns for chr2)
    pos_chr1 = [int(locus_id.split('_')[3]) for locus_id in loci_chr1]
    pos_chr2 = [int(locus_id.split('_')[3]) for locus_id in loci_chr2]

    # Fill the whole (chr1 loci) x (chr2 loci) submatrix in one assignment
    # instead of one scalar write per locus pair
    contacts_df.loc[loci_chr1, loci_chr2] = hic_chpair_df.loc[pos_chr1, pos_chr2].values
# Symmetrize data frame (only chr1 < chr2 blocks were filled above)
contacts_df = contacts_df+contacts_df.T
with open(saving_dir+'contacts_df.pkl', 'wb') as f:
    pickle.dump(contacts_df, f)

100%|██████████| 231/231 [00:17<00:00, 13.07it/s]
