### This script digitizes sequences in the proteome and computes cross-correlations with a user-specified kernel

In [1]:
import matplotlib as mpl
# mpl.use('Agg')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['text.usetex'] = False
mpl.rcParams['font.sans-serif'] = 'Arial'
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['image.interpolation'] = 'none'

import os, re
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pprint import pprint
import scipy
import seaborn as sns
import logging
logger = logging.Logger('catch_all')

from Bio import SeqIO
from scipy.interpolate import UnivariateSpline
import matplotlib.transforms as mtransforms
from matplotlib.patches import FancyBboxPatch

import metapredict as meta
# print(meta.__version__)

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
fasta_path = Path('uniprot-reviewed_yes+AND+proteome_up000005640.fasta')
output_path = Path('/Users/13kov/Jupyter code/TF-RNA/ARM_search')

if not output_path.is_dir():
    output_path.mkdir()

In [3]:
def calc_tat_corr(seq): 
    # For correlation task, the TAT ARM sequence is: 49-RKKRRQRRR-57 

    # Define kernel
    # 1 represents a positive AA, 0  represents a neutral AA
    ker=[1, 1, 1, 1, 1, 0, 1, 1, 1] # For Q=0
#     ker=[1, 1, 1, 1, 1, 1, 1, 1, 1] # For Q=1
    kmer = len(ker)
    
    # For filtering only on disordered regions
    disorder_scores, disorder_regions = calc_disorder(seq)
    disorder_threshold = 0.2 # Value of 0.2 suggested by Salman
    
    # Digitize the sequence based on charge to get a signal
    sig=np.zeros(len(seq)).astype(int)
    
    for j, val in enumerate(seq):
        if ((val=='R' or val=='K') and disorder_scores[j] >= disorder_threshold):
            sig[j]=1
        else:
            sig[j]=0
            
    # Compute the cross-correlation
    # Note that length of xcorr_vect will be 8 plus the length of the signal
    xcorr_vect=scipy.signal.correlate(sig, ker, mode='full', method='direct')
    
    R_sig=np.zeros(len(seq)).astype(int)
    
    for j, val in enumerate(seq):
        if ((val=='R') and disorder_scores[j] >= disorder_threshold):
            R_sig[j]=1
        else:
            R_sig[j]=0
            
    ker_R=[1, 1, 1, 1, 1, 1, 1, 1, 1]
    xcorr_R=scipy.signal.correlate(R_sig, ker_R, mode='full', method='direct')   
    xcorr_R[xcorr_R>=1]=1
    
    filt_vect=xcorr_vect*xcorr_R

    max_corr=np.max(filt_vect)
    sum_score=np.sum(filt_vect)
                
    return filt_vect, max_corr, sum_score

In [4]:
def print_seqs(filt_vect,seq,gene_name,thresh):
    # BCL11A - example of where blank sequences come up on C-terminus
    # TFAP2A - example of where truncated sequences come up on N-terminus
    out_dict={}
    coord_dict={}
    
    for ind, val in enumerate(filt_vect):
        
        if val>=thresh and ind>=8:
            
            # Need to be very careful with the indexing here to get the right output!
            sub_seq=seq[ind-8:ind+1] 
            
            key=gene_name+'/'+str(ind)
            
            # Get rid of N-terminal tail sequences where the length of the ARM is not 9
            if len(sub_seq)==9: 
                out_dict[key]=sub_seq+'/'+str(val)
                
                # Need to be careful here too
                coord_dict[key]=ind-8

    return out_dict, coord_dict

In [5]:
def calc_disorder(seq):
    predictor = meta.predict_disorder_domains(seq)
    
    score = predictor.disorder
    regions = predictor.disordered_domain_boundaries
    
    return score, regions

In [6]:
# Run TFs

TF_df = pd.read_csv('TF_list.csv',header=None).T
TF_query_list=TF_df.loc[0].tolist()
# TF_query_list = ['BCL11A'] # Use this to debug specific sequences
TF_full_df = pd.DataFrame()

TF_out = {} # Declare dictionary
TF_seq = {} # Declare dictionary
TF_coord = {} # Declare dictionary

for query in TF_query_list:
#     print(query)
    count = 0
    with open(fasta_path, 'r') as file:
        for record in SeqIO.parse(file, "fasta"):
            description = record.description
#             print(description)
            gene_name = description[(description.find("GN=")+3):description.find(" PE")]
#             print(gene_name)
            if query == gene_name:
#                 print(f'Found {query} in {record.id}')  # This helps to see if there are multiple records for the same gene
                ident = record.id
                seq = str(record.seq).replace('U', 'C') # Replace U (selenocysteine) with C (cysteine)
                count += 1
                
                # Make sure this if statement is one level down from the above
                if count > 1:
                    print(f'Found {count} entries for {query}')
                elif count < 1:
                #         raise Exception(f'Could not find entry for {query} in fasta file')
                    print(f'Could not find entry for {query} in fasta file')
                    
                else:
                    try:
                        # TAT correlation
                        xcorr_vect, max_corr, sum_score = calc_tat_corr(seq)
                        TF_out[gene_name]=max_corr # Add max cross-correlation to dictionary

                        # Print sequences
                        thresh=5
                        temp_seq,temp_coord=print_seqs(xcorr_vect,seq,gene_name,thresh)
                        TF_seq.update(temp_seq) # Append temp dict to main
                        TF_coord.update(temp_coord) # Append temp dict to main
                        
                    except BaseException as e:
                        print(f'Error thrown by {gene_name}')
                        logger.error(str(e))
                                    
# Print maximum xcorr
TF_maxcorr_df=pd.DataFrame.from_dict(TF_out,orient='index',columns=['max_corr'])
TF_maxcorr_df.to_csv('TF_maxcorr.csv')
print('TF max correlation file printed')

# Print full x-corr for sequences
TF_seq_out=pd.DataFrame.from_dict(TF_seq,orient='index',columns=['seq'])
TF_coord_out=pd.DataFrame.from_dict(TF_coord,orient='index',columns=['coord'])
TF_seq_out['gene_name']=TF_seq_out.index
TF_seq_out[['seq', 'x_corr']] = TF_seq_out['seq'].str.split('/', expand=True)
TF_seq_out[['gene_name', 'trash']] = TF_seq_out['gene_name'].str.split('/', expand=True)
TF_seq_out['coord_start']=TF_coord_out['coord']
TF_seq_out['coord_end']=TF_coord_out['coord']+9
TF_seq_out.drop('trash',axis=1,inplace=True)
TF_seq_out.set_index('gene_name',drop=True,inplace=True)
TF_seq_out.to_csv('TF_xcorr_strongarms.csv')
print('TF correlation seq file printed')

Found 2 entries for DDIT3
Found 2 entries for ZNF365
Found 2 entries for CUX1
TF max correlation file printed


KeyError: 'gene_name'

In [7]:
# Run proteome

proteome_output = {} # Declare dictionary for maxcorr
proteome_seq={}

with open(fasta_path, 'r') as file:
    # Get proteome gene list from fasta file
    for record in SeqIO.parse(file, "fasta"):
        description = record.description
        gene_name = description[(description.find("GN=")+3):description.find(" PE")]
        seq = str(record.seq).replace('U', 'C') # Replace U (selenocysteine) with C (cysteine)
          
    # To do: Include a column for the uniprot ID in the record.id  
        
        try:
            # Tat correlation
            xcorr_vect, max_corr, sum_score  = calc_tat_corr(seq)
            proteome_output[gene_name] = max_corr
            
            # Print sequences
            thresh=3
            temp=print_seqs(xcorr_vect,seq,gene_name,thresh)
            proteome_seq.update(temp) # Append temp dict to main
            
        except BaseException as e:
            print(f'Error thrown by {gene_name}')
            logger.error(str(e))

# Print maximum xcorr
proteome_intscore_df=pd.DataFrame.from_dict(proteome_output,orient='index',columns=['max_corr'])
proteome_intscore_df.to_csv('proteome_maxcorr.csv')
print('Proteome file printed')

# Print full x-corr for sequences
proteome_seq_out=pd.DataFrame.from_dict(proteome_seq,orient='index',columns=['seq'])
proteome_seq_out['gene_name']=proteome_seq_out.index
proteome_seq_out[['seq', 'x_corr']] = proteome_seq_out['seq'].str.split('/', expand=True)

# Next 2 lines are a hack to get around the '/' character being located in some gene names
proteome_seq_out['trash'] = proteome_seq_out['gene_name'].str.split('/', expand = True)[1]
proteome_seq_out['gene_name'] = proteome_seq_out['gene_name'].str.split('/', expand = True)[0]

proteome_seq_out.drop('trash',axis=1,inplace=True)
proteome_seq_out.set_index('gene_name',drop=True,inplace=True)
proteome_seq_out.to_csv('proteome_xcorr_allseq.csv')
print('Proteome correlation seq file printed')

Proteome file printed
Proteome correlation seq file printed


In [8]:
prot_df = pd.read_csv('proteome_maxcorr.csv',index_col=0)
# RBP_df = pd.read_csv('RBP_list.csv',index_col=0)
TF_df = pd.read_csv('TF_maxcorr.csv',index_col=0)

# Filter TFs out of the whole proteome df
prot_index = prot_df.index
TF_index = TF_df.index
mask1 = ~prot_index.isin(TF_index)
mask2= prot_index.isin(TF_index)
prot_filt = prot_df.loc[mask1]
TF_only=prot_df.loc[mask2]

# Filter TFs out of the RBP df
# RBP_index = RBP_df.index
# mask3 = prot_index.isin(RBP_index)
# RBP_only = prot_df.loc[mask3]

prot_filt.to_csv('proteome_maxcorr_TFfilt.csv')
# TF_only.to_csv('TF_maxcorr.csv')
# RBP_only.to_csv('RBP_intscore.csv')