In [1]:
# Move two directories up so the relative data paths used below resolve.
# NOTE(review): this is a relative change of directory, so re-running this cell in the
# same kernel moves up two more levels each time.
cd ../../

/Users/in-divye.singh/Documents/Projects/MIC_predictor


In [2]:
import os
import time
import ntpath
import datetime
import pandas as pd
from Bio import SeqIO
from scipy import signal
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from tqdm.auto import tqdm

from multiprocessing import Pool

In [12]:
from notebooks.utils import *

In [3]:
# One-letter codes of the 20 residues for which per-residue features are computed.
RESIDUES = list('ACDEFGHIKLMNPQRSTVWY')

# Kyte & Doolittle hydrophobicity index, keyed by one-letter residue code.
# 'U' is not in RESIDUES; it is given a neutral 0.0 here so that sequences
# containing it still map to a number.
HP = dict([
    ('A', 1.8), ('R', -4.5), ('N', -3.5), ('D', -3.5), ('C', 2.5),
    ('Q', -3.5), ('E', -3.5), ('G', -0.4), ('H', -3.2), ('I', 4.5),
    ('L', 3.8), ('K', -3.9), ('M', 1.9), ('F', 2.8), ('P', -1.6),
    ('S', -0.8), ('T', -0.7), ('W', -0.9), ('Y', -1.3), ('V', 4.2),
    ('U', 0.0),
])

In [4]:
def convolve_signal(sig, window=25):
    """Smooth a 1-D signal with a normalised Hann window.

    Parameters
    ----------
    sig : sequence of float
        The values to smooth.
    window : int, default 25
        Number of points in the Hann window.

    Returns
    -------
    numpy.ndarray
        ``sig`` convolved with the window (``mode='same'``) and divided by the
        sum of the window weights.
    """
    # The window functions live in scipy.signal.windows; the old
    # scipy.signal.hann alias is not available in newer SciPy releases.
    win = signal.windows.hann(window)
    sig = signal.convolve(sig, win, mode='same') / sum(win)
    return sig


def average(l):
    """Return the arithmetic mean of the values in ``l``.

    Raises ``ZeroDivisionError`` when ``l`` is empty.
    """
    total, count = sum(l), len(l)
    return total / count

In [5]:
class HydroPhobicIndex:
    """Container for the per-residue hydrophobicity values of one sequence.

    Attributes
    ----------
    hpilist : list
        The values passed to the constructor, stored as given.
    """

    def __init__(self, hpilist):
        # Kept by reference: no copy and no validation.
        self.hpilist = hpilist

In [6]:
def hydrophobic(df):
    """Add a ``HydroPhobicIndex`` column holding per-residue hydrophobicity values.

    Each character of ``df['Seq']`` is looked up in ``HP``; characters missing
    from ``HP`` become NaN. The resulting list is wrapped in a
    ``HydroPhobicIndex`` instance, one per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``'Seq'``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``'HydroPhobicIndex'`` column set.
    """
    missing = float('nan')
    wrapped = [
        HydroPhobicIndex([HP.get(residue, missing) for residue in seq])
        for seq in df['Seq']
    ]
    # Assign the whole column positionally in one go. Writing row by row via
    # df.loc[index, ...] is slow and would hit every row sharing a label when
    # the index is not unique.
    df['HydroPhobicIndex'] = pd.Series(wrapped, index=df.index, dtype=object)
    return df

In [7]:
def add_hydrophobic_features(df):
    """Add counts and fractions of strongly hydrophilic positions per sequence.

    The per-residue hydrophobicity values (from ``hydrophobic``) are smoothed
    with ``convolve_signal(..., window=30)``. For each cutoff in -1.5, -2.0 and
    -2.5 two columns are written: ``hpi_<CUTOFF_frac`` (share of smoothed
    positions below the cutoff) and ``hpi_<CUTOFF`` (their count).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Seq'`` column.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``hydrophobic`` with the six columns added.
    """
    df = hydrophobic(df)

    # (column label, numeric cutoff); labels are spelled out so that the
    # column names do not depend on float formatting.
    cutoffs = (('-1.5', -1.5), ('-2.0', -2.0), ('-2.5', -2.5))
    fractions = {label: [] for label, _ in cutoffs}
    counts = {label: [] for label, _ in cutoffs}

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        smoothed = convolve_signal(row['HydroPhobicIndex'].hpilist, window=30)
        n_positions = len(smoothed)
        for label, cutoff in cutoffs:
            n_below = sum(value < cutoff for value in smoothed)
            fractions[label].append(n_below / n_positions)
            counts[label].append(n_below)

    # Fractions first, then raw counts, matching the established column order.
    for label, _ in cutoffs:
        df['hpi_<' + label + '_frac'] = fractions[label]
    for label, _ in cutoffs:
        df['hpi_<' + label] = counts[label]

    return df

In [8]:
def amino_acid_analysis(df):
    """Add composition and basic physico-chemical columns for ``df['Seq']``.

    Columns written:

    * ``fraction_<R>`` for every residue ``R`` in ``RESIDUES``: occurrences of
      ``R`` divided by the sequence length.
    * ``length``: sequence length.
    * ``IEP``: ``ProteinAnalysis.isoelectric_point()``.
    * ``molecular_weight``: ``ProteinAnalysis.molecular_weight()``; NaN when the
      sequence contains 'X' or 'B'.
    * ``gravy``: ``ProteinAnalysis.gravy()``; NaN when the sequence contains
      'U', 'X' or 'B'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``'Seq'``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns above set. The three
        ``ProteinAnalysis`` columns are always created, even when every row is
        NaN for one of them.
    """
    seq_lengths = df['Seq'].str.len()
    for res in RESIDUES:
        df['fraction_' + res] = df['Seq'].str.count(res) / seq_lengths
    df['length'] = seq_lengths

    nan = float('nan')
    iep, mol_weight, gravy = [], [], []
    for seq in tqdm(df['Seq'], total=df.shape[0]):
        analysis = ProteinAnalysis(seq)
        iep.append(analysis.isoelectric_point())
        # 'X' and 'B' rows are skipped for both values below.
        ambiguous = 'X' in seq or 'B' in seq
        mol_weight.append(nan if ambiguous else analysis.molecular_weight())
        # gravy is additionally skipped when 'U' is present.
        gravy.append(nan if (ambiguous or 'U' in seq) else analysis.gravy())

    # Whole-column assignment instead of per-cell df.loc writes: faster, and
    # correct when index labels repeat.
    df['IEP'] = iep
    df['molecular_weight'] = mol_weight
    df['gravy'] = gravy

    return df

In [9]:
def add_biochemical_combinations(df):
    """Add grouped residue fractions and dimer counts on top of ``amino_acid_analysis``.

    Every group column is the sum of the listed ``fraction_<R>`` columns.
    Dimer columns hold ``df['Seq'].str.count(dimer)``, i.e. the number of
    non-overlapping occurrences of that two-letter motif.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Seq'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (produced by ``DataFrame.assign``) with the extra columns.
    """

    def fraction_total(frame, residues):
        # Add the columns left to right, in the order given.
        names = ['fraction_' + res for res in residues]
        total = frame[names[0]]
        for name in names[1:]:
            total = total + frame[name]
        return total

    df = amino_acid_analysis(df)

    df = df.assign(
        Asx=fraction_total(df, 'DN'),
        Glx=fraction_total(df, 'EQ'),
        Xle=fraction_total(df, 'IL'),
        Pos_charge=fraction_total(df, 'KRH'),
        Neg_charge=fraction_total(df, 'DE'),
        Aromatic=fraction_total(df, 'FWYH'),
        Alipatic=fraction_total(df, 'VILM'),
        Small=fraction_total(df, 'PGAS'),
        Hydrophilic=fraction_total(df, 'STHNQEDKR'),
        Hydrophobic=fraction_total(df, 'VILFWYM'),
    )

    # Added in version 2
    dimers = ('GV', 'VG', 'VP', 'PG', 'FG', 'RG', 'GR', 'GG',
              'YG', 'GS', 'SG', 'GA', 'GF', 'GD', 'DS')
    for dimer in dimers:
        df[dimer] = df['Seq'].str.count(dimer)

    df = df.assign(
        alpha_helix=fraction_total(df, 'VIYFWL'),
        beta_turn=fraction_total(df, 'NPGS'),
        beta_sheet=fraction_total(df, 'EMAL'),
        # Aromaticity according to Lobry, 1994: the relative frequency of
        # Phe + Trp + Tyr (unlike 'Aromatic' above, His is not included).
        aromaticity=fraction_total(df, 'FWY'),
    )

    return df

In [10]:
def add_lowcomplexity_features(df, n_window=20, cutoff=7):
    """Add sliding-window low-complexity features for ``df['Seq']``.

    For every position a window of roughly ``n_window`` residues is taken
    (clamped to the first/last ``n_window`` residues near the ends) and its
    complexity is the number of distinct characters in it. A position is a
    low-complexity centre when that number is ``<= cutoff``.

    Columns written:

    * ``lcs_lowest_complexity``: smallest window complexity in the sequence.
    * ``lcs_scores``: number of flagged positions.
    * ``lcs_fractions``: ``lcs_scores`` divided by the sequence length.
    * ``<R>_lcscore`` / ``<R>_lcfraction`` for every ``R`` in ``RESIDUES``:
      flagged positions holding ``R``, and that count divided by the number of
      low-complexity centres. Both are NaN unless the sequence has at least
      ``n_window`` low-complexity centres. These columns are always created,
      even when every row is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Seq'`` column. Modified in place.
    n_window : int, default 20
        Window size; half of it (integer division) is used on either side.
    cutoff : int, default 7
        Maximum number of distinct characters for a low-complexity window.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns above set.

    Raises
    ------
    ValueError
        If ``n_window`` is smaller than 2, or if a sequence is empty.
    """
    if n_window < 2:
        raise ValueError("n_window must be at least 2")
    n_halfwindow = n_window // 2
    nan = float('nan')

    lcs_lowest_complexity = []
    lcs_scores = []
    lcs_fractions = []
    residue_scores = {res: [] for res in RESIDUES}
    residue_fractions = {res: [] for res in RESIDUES}

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        seq = str(row['Seq'])
        seq_len = len(seq)
        lc_bool = [False] * seq_len
        complexities = []
        n_low_centres = 0

        for i in range(seq_len):
            if i < n_halfwindow:
                peptide = seq[:n_window]
            elif i + n_halfwindow > seq_len:
                peptide = seq[-n_window:]
            else:
                peptide = seq[i - n_halfwindow:i + n_halfwindow]
            complexity = len(set(peptide))
            complexities.append(complexity)

            if complexity <= cutoff:
                n_low_centres += 1
                # NOTE(review): only the two positions i - half and i + half
                # are flagged (a 2-tuple, not a range), and a negative
                # i - half wraps around to the end of the sequence. Kept
                # unchanged so existing feature values stay the same; confirm
                # whether flagging the whole window was intended.
                for bool_index in (i - n_halfwindow, i + n_halfwindow):
                    try:
                        lc_bool[bool_index] = True
                    except IndexError:
                        pass

        # Residues sitting on flagged positions.
        flagged = [acid for acid, is_low in zip(seq, lc_bool) if is_low]

        lcs_lowest_complexity.append(min(complexities))
        lcs_scores.append(len(flagged))
        lcs_fractions.append(len(flagged) / seq_len)

        has_enough_centres = n_low_centres >= n_window
        for res in RESIDUES:
            if has_enough_centres:
                n_res = flagged.count(res)
                # Stored as float so the column dtype does not depend on
                # whether any row is NaN.
                residue_scores[res].append(float(n_res))
                residue_fractions[res].append(n_res / n_low_centres)
            else:
                residue_scores[res].append(nan)
                residue_fractions[res].append(nan)

    # Whole-column assignment instead of per-cell df.loc writes.
    for res in RESIDUES:
        df[res + '_lcscore'] = residue_scores[res]
        df[res + '_lcfraction'] = residue_fractions[res]
    df['lcs_fractions'] = lcs_fractions
    df['lcs_scores'] = lcs_scores
    df['lcs_lowest_complexity'] = lcs_lowest_complexity

    return df

In [13]:
# Load the two raw datasets.
avp_ic50 = pd.read_csv("data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("data/raw/HA_AVP.csv")

# Stack them (only Sequence and MIC are taken from the first), keep the first
# row of each distinct Sequence, renumber the index, then apply
# sequence_filtering from notebooks.utils.
df = sequence_filtering(
    pd.concat([avp_ic50[['Sequence', 'MIC']], ha_avp], axis=0)
    .drop_duplicates(['Sequence'])
    .reset_index(drop=True)
)

In [17]:
# Positional rename to the column names the feature functions expect ('Seq').
# NOTE(review): assumes sequence_filtering() returned exactly two columns, the
# sequence first and MIC second — confirm against notebooks.utils.
df.columns = ['Seq', 'MIC']

In [18]:
# Peek at the first two rows to check the renamed columns.
df.head(2)

Unnamed: 0,Seq,MIC
0,AAQRRGRVGRNPNQVGD,442.0
1,HRILARIRQMMT,435.5


In [19]:
# Run the three feature stages in order; each one takes the frame produced by
# the previous stage and shows its own progress bar.
df = (
    df.pipe(add_hydrophobic_features)
      .pipe(add_biochemical_combinations)
      .pipe(add_lowcomplexity_features)
)

100%|██████████| 712/712 [00:00<00:00, 1594.73it/s]
100%|██████████| 712/712 [00:00<00:00, 867.44it/s]
100%|██████████| 712/712 [00:02<00:00, 258.44it/s]


In [20]:
# List the column names after feature generation.
df.columns

Index(['Seq', 'MIC', 'HydroPhobicIndex', 'hpi_<-1.5_frac', 'hpi_<-2.0_frac',
       'hpi_<-2.5_frac', 'hpi_<-1.5', 'hpi_<-2.0', 'hpi_<-2.5', 'fraction_A',
       ...
       'T_lcfraction', 'V_lcscore', 'V_lcfraction', 'W_lcscore',
       'W_lcfraction', 'Y_lcscore', 'Y_lcfraction', 'lcs_fractions',
       'lcs_scores', 'lcs_lowest_complexity'],
      dtype='object', length=105)

In [22]:
import pickle

In [None]:
# Persist the full feature table, including the object-valued 'HydroPhobicIndex' column.
# NOTE(review): pickle stores HydroPhobicIndex instances by class reference, so the
# class has to be defined under the same name wherever this file is loaded.
with open("clean_CDHIT_new_50_v2_features.pkl", "wb") as f:
    pickle.dump(df, f)

In [23]:
# Export the features as CSV without the object-valued 'HydroPhobicIndex' column
# and without the index.
# NOTE(review): this writes a derived file into data/raw — confirm that is intended.
df.drop(columns=['HydroPhobicIndex']).to_csv("data/raw/105_feature_ha_avp_ic50.csv", index=False)

In [None]:
from Bio import SeqIO

In [None]:
# Parse the FASTA file into one [identifier, sequence] row per record.
fasta_file = "hrushi/human-predict.fasta"
rows = list()
for record in SeqIO.parse(fasta_file, 'fasta'):
    seq = str(record.seq)
    # Split the header on '|'. When the first field is 'sp' or 'tr' the
    # identifier is taken from the second field; otherwise the first field is
    # used. (Named header_fields rather than `id` so the builtin id() is not
    # shadowed for the rest of the session.)
    header_fields = record.description.split('|')
    if header_fields[0] in ('sp', 'tr') and len(header_fields) > 1:
        uniprot_id = header_fields[1]
    else:
        # Also covers a bare 'sp'/'tr' header with no '|', which previously
        # raised IndexError.
        uniprot_id = header_fields[0]
    rows.append([uniprot_id, seq])
db1 = pd.DataFrame(rows, columns=['uniprot_id', 'sequence'])

In [None]:
# Rename to the column names used by the merges below, then keep the first row
# of each distinct sequence. Reassigning instead of using inplace=True leaves no
# half-mutated frame behind if the cell is interrupted or re-run.
db1.columns = ['Entry', 'Sequence']
db1 = db1.drop_duplicates(['Sequence'])
db1

In [None]:
# Load the per-sequence prediction table; the next cell names its two columns
# 'Sequence' and 'Probability'.
db2 = pd.read_csv("hrushi/human-predict_prob_wo_properties_best_model.csv")

In [None]:
# Positional rename, then keep the first row of each distinct sequence.
# Reassigning instead of using inplace=True avoids mutating the loaded frame.
db2.columns = ['Sequence', 'Probability']
db2 = db2.drop_duplicates(['Sequence'])
db2

In [None]:
# Load the PSAP probability table; the next cell names its three columns
# 'Entry', 'UniprotKB_ID' and 'PSAP_score'.
db3 = pd.read_csv("hrushi/CellReports_Prediction.xlsx - PSAP probabilities for human.csv")

In [None]:
# Positional rename: requires the CSV to have exactly three columns.
# NOTE(review): assumes their order is entry, UniProtKB id, PSAP score — confirm
# against the file header.
db3.columns = ['Entry', 'UniprotKB_ID', 'PSAP_score']
db3

In [None]:
# Keep only sequences present in both tables: FASTA entries (db1) joined with
# their predicted probability (db2).
db4 = pd.merge(db1, db2, how='inner', on='Sequence')

In [None]:
# Inspect the joined Entry / Sequence / Probability table.
db4

In [None]:
# Number of distinct entries in the PSAP table, for comparison with the row
# count after the merge below.
db3['Entry'].nunique()

In [None]:
# Attach Sequence and Probability to every PSAP entry (left join keeps all db3 rows).
# Selecting the three original db3 columns first makes the cell idempotent: a second
# run would otherwise merge into the already-merged frame and produce suffixed
# duplicate columns (Sequence_x / Sequence_y, ...).
db3 = db3[['Entry', 'UniprotKB_ID', 'PSAP_score']].merge(db4, how='left', on='Entry')

In [None]:
# Row/column count after the left join; more rows than before the merge would mean
# some Entry values occur more than once in db4.
db3.shape

In [None]:
# Inspect the merged table.
db3

In [None]:
# Save the merged table; index=False leaves the DataFrame index out of the CSV.
db3.to_csv("hrushi/human-predict_properties_updated.csv", index=False)