### This script digitizes sequences in the proteome and computes cross-correlations with a user-specified kernel

In [1]:
import matplotlib as mpl
# Uncomment to select the Agg backend explicitly (presumably for headless,
# file-only rendering -- confirm before enabling alongside `%matplotlib inline`).
# mpl.use('Agg')
# Font type 42 is matplotlib's TrueType embedding option for PDF/PS output
# (presumably chosen so text stays editable in vector-graphics editors).
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
# Render text with matplotlib's own engine rather than an external LaTeX install.
mpl.rcParams['text.usetex'] = False
# Use Arial as the sans-serif face and make sans-serif the default family.
mpl.rcParams['font.sans-serif'] = 'Arial'
mpl.rcParams['font.family'] = 'sans-serif'
# Default resolution for figures created in this notebook.
mpl.rcParams['figure.dpi'] = 300
# Do not interpolate between pixels when drawing images.
mpl.rcParams['image.interpolation'] = 'none'

import os, re
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pprint import pprint
import scipy
import seaborn as sns

from Bio import SeqIO
from scipy.interpolate import UnivariateSpline
import matplotlib.transforms as mtransforms
from matplotlib.patches import FancyBboxPatch

import metapredict as meta

%matplotlib inline

print(meta.__version__)

  from .autonotebook import tqdm as notebook_tqdm


v2.2


In [2]:
# Input FASTA, resolved relative to the notebook's working directory. Per its
# filename it holds the reviewed UniProt entries of proteome UP000005640.
fasta_path = Path('uniprot-reviewed_yes+AND+proteome_up000005640.fasta')

# NOTE(review): machine-specific absolute path -- adjust before running on
# another machine. The cells visible in this notebook write their CSVs to the
# working directory rather than to this folder.
output_path = Path('/Users/13kov/Jupyter code/TF-RNA/ARM_search')

# parents=True creates any missing intermediate folders (a plain mkdir() raises
# FileNotFoundError in that case); exist_ok=True keeps re-running the cell safe.
output_path.mkdir(parents=True, exist_ok=True)

In [3]:
def calc_tat_corr(seq, ker=None, disorder_threshold=0.2, disorder_scores=None):
    """Cross-correlate a charge-digitized sequence with a Tat-ARM-like kernel.

    Design notes: the TAT ARM sequence is 49-RKKRRQRRR-57. There may be two
    critical R's, R52 and R53, with R53 apparently the most critical. Suggested
    kernels were [111RR0111] (non-critical R/K treated as generic + charges,
    critical R's must be R, Q neutral) and [1111R0111] (only R53 critical).
    The default kernel used here does NOT enforce a specific critical
    position; instead, windows that contain no disordered R at all are zeroed.

    Steps:
      1. Build a 0/1 signal that is 1 where the residue is R or K and its
         disorder score is >= ``disorder_threshold``.
      2. Cross-correlate that signal with ``ker`` (mode='full').
      3. Build the same kind of signal for R only, correlate it with an
         all-ones kernel of the same length, and clip to 0/1 to obtain a
         "window contains at least one disordered R" mask.
      4. Multiply (2) by (3).

    Parameters
    ----------
    seq : str
        Amino-acid sequence (one-letter codes). Must be non-empty.
    ker : sequence of int, optional
        Kernel; 1 marks a position expected to be positive, 0 a position that
        is ignored (it contributes nothing to the score). Defaults to
        ``[1, 1, 1, 1, 1, 0, 1, 1, 1]`` (Q=0). Use nine 1's for the Q=1 variant.
    disorder_threshold : float, optional
        Minimum per-residue disorder score for a residue to count.
        Default 0.2 (value suggested by Salman).
    disorder_scores : sequence of float, optional
        Precomputed per-residue disorder scores, one per residue of ``seq``.
        When omitted they are obtained from ``calc_disorder(seq)``.

    Returns
    -------
    filt_vect : numpy.ndarray
        Filtered cross-correlation, length ``len(seq) + len(ker) - 1``.
    max_corr : int
        Maximum of ``filt_vect``.
    sum_score : int
        Sum of ``filt_vect``.

    Raises
    ------
    ValueError
        If ``seq`` is empty or ``disorder_scores`` does not have exactly one
        entry per residue.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to make `scipy.signal` available as an attribute.
    from scipy import signal

    if ker is None:
        ker = [1, 1, 1, 1, 1, 0, 1, 1, 1]  # For Q=0
    ker = np.asarray(ker)

    if len(seq) == 0:
        raise ValueError('seq must be a non-empty sequence')

    # Only residues in (sufficiently) disordered regions are counted.
    if disorder_scores is None:
        disorder_scores, _ = calc_disorder(seq)
    scores = np.asarray(disorder_scores, dtype=float)
    if scores.ndim != 1 or scores.shape[0] != len(seq):
        raise ValueError('disorder_scores must contain one score per residue of seq')

    residues = np.array(list(seq))
    is_disordered = scores >= disorder_threshold

    # Digitize the sequence based on charge to get a signal.
    sig = (np.isin(residues, ['R', 'K']) & is_disordered).astype(int)

    # Compute the cross-correlation.
    # In 'full' mode the result has len(sig) + len(ker) - 1 entries
    # (i.e. 8 more than the signal for the default 9-mer kernel).
    xcorr_vect = signal.correlate(sig, ker, mode='full', method='direct')

    # Arginine-only signal; correlating it with an all-ones kernel of the same
    # length counts the disordered R's inside each window, aligned with xcorr_vect.
    R_sig = ((residues == 'R') & is_disordered).astype(int)
    ker_R = np.ones(len(ker), dtype=int)
    xcorr_R = signal.correlate(R_sig, ker_R, mode='full', method='direct')
    xcorr_R[xcorr_R >= 1] = 1  # clip counts to a 0/1 "has at least one R" mask

    # Zero out windows that contain no disordered R.
    filt_vect = xcorr_vect * xcorr_R

    max_corr = np.max(filt_vect)
    sum_score = np.sum(filt_vect)

    return filt_vect, max_corr, sum_score

In [4]:
def calc_disorder(seq):
    """Run metapredict's disorder-domain prediction on ``seq``.

    Returns a 2-tuple taken from the object returned by
    ``meta.predict_disorder_domains``: its ``disorder`` attribute followed by
    its ``disordered_domain_boundaries`` attribute.
    """
    domains = meta.predict_disorder_domains(seq)
    return domains.disorder, domains.disordered_domain_boundaries

In [5]:
# Gene names of the transcription factors to score. TF_list.csv is read without
# a header and transposed, so row 0 holds the first column of the file.
TF_df = pd.read_csv('TF_list.csv', header=None).T
TF_query_list = TF_df.loc[0].tolist()
TF_full_df = pd.DataFrame()  # not used by the cells visible here; kept in case other cells rely on it

TF_output = {}  # gene name -> maximum filtered cross-correlation

# Parse the FASTA once (instead of once per query) and keep the sequences of
# the queried genes only, in file order.
wanted_genes = set(TF_query_list)
sequences_by_gene = {}
with open(fasta_path, 'r') as file:
    for record in SeqIO.parse(file, "fasta"):
        # Take the gene name from the "GN=<name>" field of the header. A regex
        # is used because slicing between find("GN=") and find(" PE") breaks
        # when the header has no GN field or contains " PE" before it
        # (e.g. a protein name starting with "PEST").
        gn_match = re.search(r'GN=(\S+)', record.description)
        if gn_match is None:
            continue
        gene_name = gn_match.group(1)
        if gene_name not in wanted_genes:
            continue
        seq = str(record.seq).replace('U', 'C')  # Replace U (selenocysteine) with C (cysteine)
        sequences_by_gene.setdefault(gene_name, []).append(seq)

for query in TF_query_list:
    matches = sequences_by_gene.get(query, [])

    if not matches:
        # Reported after the full scan; an in-loop "count < 1" check can never fire.
        print(f'Could not find entry for {query} in fasta file')
        continue

    if len(matches) > 1:
        # Several records share this gene name; the first one in the file is scored.
        print(f'Found {len(matches)} entries for {query}')

    try:
        # TAT correlation
        xcorr_vect, max_corr, sum_score = calc_tat_corr(matches[0])
        TF_output[query] = max_corr  # Add max cross-correlation to dictionary
    except Exception as err:
        # Keep going with the remaining TFs, but show what went wrong.
        print(f'Error thrown by {query}: {err!r}')

# Maximum xcorr
TF_maxcorr_df = pd.DataFrame.from_dict(TF_output, orient='index', columns=['max_corr'])
TF_maxcorr_df.to_csv('TF_maxcorr.csv')
print('TF max correlation file printed')

Found 2 entries for DDIT3
Found 2 entries for ZNF365
Found 2 entries for CUX1
TF max correlation file printed


In [6]:
proteome_output = {}  # gene name -> maximum filtered cross-correlation

with open(fasta_path, 'r') as file:
    # Score every record of the proteome FASTA.
    for record in SeqIO.parse(file, "fasta"):
        # Take the gene name from the "GN=<name>" field of the header. A regex
        # is used because slicing between find("GN=") and find(" PE") breaks
        # when the header has no GN field or contains " PE" before it.
        # Records without a GN field are keyed by their record id instead.
        gn_match = re.search(r'GN=(\S+)', record.description)
        gene_name = gn_match.group(1) if gn_match else record.id
        seq = str(record.seq).replace('U', 'C')  # Replace U (selenocysteine) with C (cysteine)

        # To do: Include a column for the uniprot ID in the record.id

        try:
            xcorr_vect, max_corr, sum_score = calc_tat_corr(seq)
            # NOTE: records sharing a gene name overwrite each other (last one wins).
            proteome_output[gene_name] = max_corr
        except Exception as err:
            # Keep going with the remaining records, but show what went wrong.
            print(f'Error thrown by {gene_name} ({record.id}): {err!r}')

# Maximum xcorr
proteome_intscore_df = pd.DataFrame.from_dict(proteome_output, orient='index', columns=['max_corr'])
proteome_intscore_df.to_csv('proteome_maxcorr.csv')
print('Proteome file printed')

Proteome file printed


In [7]:
# Reload the score tables written by the previous cells (index = gene name).
prot_df = pd.read_csv('proteome_maxcorr.csv', index_col=0)
TF_df = pd.read_csv('TF_maxcorr.csv', index_col=0)
# RBP_df = pd.read_csv('RBP_list.csv',index_col=0)

# Split the proteome table by membership in the TF table:
# rows whose gene name is a TF, and everything else.
is_tf = prot_df.index.isin(TF_df.index)
TF_only = prot_df.loc[is_tf]
prot_filt = prot_df.loc[~is_tf]

# Same selection for RNA-binding proteins (disabled)
# RBP_index = RBP_df.index
# mask3 = prot_df.index.isin(RBP_index)
# RBP_only = prot_df.loc[mask3]

# Only the TF-depleted proteome is written out.
prot_filt.to_csv('proteome_maxcorr_TFfilt.csv')
# TF_only.to_csv('TF_maxcorr.csv')
# RBP_only.to_csv('RBP_intscore.csv')

In [8]:
# # Make figures
# fig=plt.figure(figsize = (8,10))
# ax = plt.axes()
# im=sns.violinplot(data = [TF_df['max_corr'].values,RBP_filt['max_corr'].values,prot_filt['max_corr'].values],palette="light:#4CB391", scale='width', ax=ax)
# im.set_xticklabels(['TFs','RBPs (-TFs)','Proteome (-TFs)'],fontsize=15)
# im.set_ylabel("Max Cross-Correlation", fontsize = 15)
# ax.grid(False)
# ax.set_facecolor('white')
# ax.patch.set_edgecolor('black')  
# ax.patch.set_linewidth('2') 
# ax.figure.savefig('maxcorr_violins.pdf',bbox_inches='tight')
# plt.show()

# fig=plt.figure(figsize = (8,10))
# ax = plt.axes()
# im=sns.boxplot(data = [TF_df['max_corr'].values,RBP_filt['max_corr'].values,prot_filt['max_corr'].values],ax=ax,showfliers=False)
# im.set_xticklabels(['TFs','RBPs (-TFs)','Proteome (-TFs)'],fontsize=15)
# im.set_ylabel("Max Cross-Correlation", fontsize = 15)
# ax.grid(False)
# ax.set_facecolor('white')
# ax.patch.set_edgecolor('black')  
# ax.patch.set_linewidth('2') 
# ax.figure.savefig('maxcorr_boxplots.pdf',bbox_inches='tight')
# plt.show()

# fig=plt.figure(figsize = (8,10))
# ax = plt.axes()
# im=sns.stripplot(data = [TF_df['max_corr'].values,RBP_filt['max_corr'].values,prot_filt['max_corr'].values],ax=ax)
# im.set_xticklabels(['TFs','RBPs (-TFs)','Proteome (-TFs)'],fontsize=15)
# im.set_ylabel("Max Cross-Correlation", fontsize = 15)
# ax.grid(False)
# ax.set_facecolor('white')
# ax.patch.set_edgecolor('black')  
# ax.patch.set_linewidth('2') 
# ax.figure.savefig('maxcorr_stripplots.pdf',bbox_inches='tight')
# plt.show()

In [9]:
# # https://medium.com/analytics-vidhya/love-the-ocean-love-seaborn-2e8737bef728

# df = pd.concat([TF_df,RBP_filt,prot_filt],keys=['TFs', 'RBPs (-TFs)','Proteome (-TFs)']).reset_index()

# sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# # Initialize the FacetGrid object
# n=5 # Number of ridges to plot
# pal = sns.cubehelix_palette(n, rot=-.25, light=.7)
# g = sns.FacetGrid(df, row="level_0", hue="level_0", 
#                   aspect=10, height=1.0, palette=pal)

# # Draw the densities in a few steps
# g.map(sns.kdeplot, "max_corr", clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
# g.map(plt.axhline, y=0, lw=2, clip_on=False)

# # Define and use a simple function to label the plot in axes coordinates
# def label(x, color, label):
#     ax = plt.gca()
#     ax.text(0, .2, label, fontweight="bold", color=color,
#             ha="left", va="center", transform=ax.transAxes)
# g.map(label, "max_corr")

# # Set the subplots to overlap
# g.fig.subplots_adjust(hspace=-.55)

# # Remove axes details that don't play well with overlap
# g.set_titles("")
# g.set(yticks=[])
# g.despine(bottom=True, left=True)

# g.savefig('maxcorr_ridgeplots.pdf',bbox_inches='tight')