In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os,sys
import scipy.optimize as op
import pandas as pd 
from tqdm import tqdm

In [3]:
aa_param='''#AA     Mass    Charge  Sigma   Lambda    Lambda_CAL
ALA     71.08   0.00    5.040   0.730   0.2743297969040348
ARG     156.20  1.00    6.560   0.000   0.7307624767517166
ASN     114.10  0.00    5.680   0.432   0.4255859009787713
ASP     115.10  -1.00   5.580   0.378   0.0416040480605567
CYS     103.10  0.00    5.480   0.595   0.5615435099141777
GLN     128.10  0.00    6.020   0.514   0.3934318551056041
GLU     129.10  -1.00   5.920   0.459   0.0006935460962935
GLY     57.05   0.00    4.500   0.649   0.7058843733666401
HIS     137.10  0.50    6.080   0.514   0.4663667290557992
ILE     113.20  0.00    6.180   0.973   0.5423623610671892
LEU     113.20  0.00    6.180   0.973   0.6440005007782226
LYS     128.20  1.00    6.360   0.514   0.1790211738990582
MET     131.20  0.00    6.180   0.838   0.5308481134337497
PHE     147.20  0.00    6.360   1.000   0.8672358982062975
PRO     97.12   0.00    5.560   1.000   0.3593126576364644
SER     87.08   0.00    5.180   0.595   0.4625416811611541
THR     101.10  0.00    5.620   0.676   0.3713162976273964
TRP     186.20  0.00    6.780   0.946   0.9893764740371644
TYR     163.20  0.00    6.460   0.865   0.9774611449343455
VAL     99.07   0.00    5.860   0.892   0.2083769608174481'''

In [4]:
aa={}
for i in aa_param.split('\n'):
	if i[0]!='#':
		name=i.rsplit()[0]
		other=np.array(i.rsplit()[1:],dtype=float)
		aa[name]=other

In [5]:
seq1to3={'R':'ARG','H':'HIS','K':'LYS','D':'ASP','E':'GLU',
     'S':'SER','T':'THR','N':'ASN','Q':'GLN','C':'CYS',
     'U':'SEC','G':'GLY','P':'PRO','A':'ALA','V':'VAL',
     'I':'ILE','L':'LEU','M':'MET','F':'PHE','Y':'TYR',
     'W':'TRP'}

In [6]:
def seq2para(seq):
	mass=np.zeros(len(seq))
	charge=np.zeros(len(seq))
	sigma=np.zeros(len(seq))
	l=np.zeros(len(seq))
	for idx,i in enumerate(list(seq)):
		i3=seq1to3[i]
		if i3 in aa.keys():
			mass[idx]=aa[i3][0]
			charge[idx]=aa[i3][1]
			sigma[idx]=aa[i3][2]
			l[idx]=aa[i3][3]
			
		else:
			print(i3)
	return mass,charge,sigma,l

def fcr(charge):
	res=0.
	for i in charge:
		if i != 0:
			res += 1
	return res/len(charge)

def shd(l):
	res=0.
	for idx,i in enumerate(l):
			for jdx,j in enumerate(l):
					if idx<jdx:
							res+=(i+j)*(jdx-idx)**(-1)
	return res/len(l)

In [7]:
# Calculation for IDR dataset
IDR_seqs = np.loadtxt('./seq_data/IDR_seq.txt', dtype='str')

data_IDR = []
for seq in tqdm(IDR_seqs):
    mass, charge, sigma, l = seq2para(seq)
    data_IDR.append([fcr(charge), shd(l)/10.23464191566815]) # the number is the pre-calculated largest shd in all 3 datasets

# Save the result
np.savetxt("./feature_data/IDR_data.txt", data_IDR)
df = pd.DataFrame(np.array(data_IDR))
df.to_csv("./feature_data/IDR_data.csv", header=['Fraction of Charged Amino Acids', 'Sequence Hydrophobic Patterning'], index=False)

100%|██████████| 28058/28058 [01:24<00:00, 330.70it/s]


In [8]:
# Calculation for folded protien dataset
folded_seqs = np.loadtxt('./seq_data/folded_seq.txt', dtype='str')

data_folded = []
for seq in tqdm(folded_seqs):
    mass, charge, sigma, l = seq2para(seq)
    data_folded.append([fcr(charge), shd(l)/10.23464191566815]) # the number is the pre-calculated largest shd in all 3 datasets

# Save the result
np.savetxt("./feature_data/folded_data.txt", data_folded)
df = pd.DataFrame(np.array(data_folded))
df.to_csv("./feature_data/folded_data.csv", header=['Fraction of Charged Amino Acids', 'Sequence Hydrophobic Patterning'], index=False)

100%|██████████| 2360/2360 [00:13<00:00, 175.65it/s]


In [9]:
# Calculation for NTD dataset
NTD_seqs = np.loadtxt('./seq_data/NTD_seq.txt', dtype='str')

data_NTD = []
for seq in tqdm(NTD_seqs):
    mass, charge, sigma, l = seq2para(seq)
    data_NTD.append([fcr(charge), shd(l)/10.23464191566815]) # the number is the pre-calculated largest shd in all 3 datasets

# Save the result
np.savetxt("./feature_data/NTD_data.txt", data_NTD)
df = pd.DataFrame(np.array(data_NTD))
df.to_csv("./feature_data/NTD_data.csv", header=['Fraction of Charged Amino Acids', 'Sequence Hydrophobic Patterning'], index=False)

100%|██████████| 48/48 [00:00<00:00, 413.59it/s]
