In [24]:
import re
import warnings
from collections import Counter
from os import listdir, makedirs
from os.path import basename, isfile, join

# Installed before the third-party imports so import-time warnings are silenced too.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import scipy as scp
import tqdm
from Bio import SeqIO
from sklearn import preprocessing

* The expected value (E) and the variance of k-mer counts are calculated with the formulas from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2423438/
* This notebook contains only a few comments. I am sorry for that.

#### old func

In [5]:
# For example we will try to find all needed vectors for 4-mers

import itertools


# Amino acid data

# Three-letter -> one-letter codes of the 20 amino acids used in this notebook.
# Insertion order matters: it fixes the letter order of `amino_string`.
amino_dict = dict([
    ('ALA', 'A'), ('ARG', 'R'),
    ('ASN', 'N'), ('ASP', 'D'),
    ('CYS', 'C'), ('GLN', 'Q'),
    ('GLU', 'E'), ('GLY', 'G'),
    ('HIS', 'H'), ('ILE', 'I'),
    ('LEU', 'L'), ('LYS', 'K'),
    ('MET', 'M'), ('PHE', 'F'),
    ('PRO', 'P'), ('SER', 'S'),
    ('THR', 'T'), ('TRP', 'W'),
    ('TYR', 'Y'), ('VAL', 'V'),
])

# One-letter alphabet as a single string, in the dictionary's insertion order.
amino_string = ''.join(amino_dict.values())

amino_string

'ARNDCQEGHILKMFPSTWYV'

In [6]:
# Make all possible k-mer combinations
# Example: k = 4 gives every 4-mer (the runs further down use k = 2)

def making_subseq(k):
    '''Build every k-mer that can be spelled with the letters of `amino_string`.

    The result is the full Cartesian product of the alphabet with itself
    k times, joined into strings, in itertools.product order (the last
    position varies fastest). It therefore holds len(amino_string) ** k items.
    '''
    return [''.join(letters) for letters in itertools.product(amino_string, repeat=k)]

In [21]:
def seqio_data(seq_record):
    '''
    Extract the two fields this notebook needs from a record.

    parameters:
        seq_record: SeqRecord class from Biopython

    return (protein name, sequence converted to a plain str)
    '''
    return seq_record.name, str(seq_record.seq)


In [7]:
def read_fasta_file(path):
    '''
    Read every record of a FASTA file.

    parameters:
        path: path to the FASTA file

    return:
        record_list: list with one parsed record per FASTA entry
        org_name: the file name without its directory and without a trailing
            '.fasta', e.g. '../data/fasta/YFV_17D.fasta' -> 'YFV_17D'
    '''
    record_list = list(SeqIO.parse(path, 'fasta'))

    # Take the name from the file name itself instead of stripping one
    # hard-coded directory prefix, so files in any directory give a clean name.
    org_name = basename(path)
    if org_name.endswith('.fasta'):
        org_name = org_name[:-len('.fasta')]

    return record_list, org_name

In [13]:
def occurrences_list(string, sub):
    '''
    Locate every occurrence of a substring (overlapping matches included)
    with repeated str.find() calls.
    variables:
        string - string where to find
        sub - string to find
    return:
        list of 1-based start positions, in increasing order
    '''
    positions = []
    hit = string.find(sub)
    while hit != -1:
        # stored positions are 1-based: index 0 is reported as 1
        positions.append(hit + 1)
        # resume one character after this match so overlapping hits are found
        hit = string.find(sub, hit + 1)
    return positions

In [18]:
def occurrences_count(string, sub):
    '''
    Count every occurrence of a substring (overlapping matches included)
    with repeated str.find() calls.
    variables:
        string - string where to find
        sub - string to find
    return:
        number of occurrences:int
    '''
    total = 0
    cursor = string.find(sub)
    while cursor >= 0:
        total += 1
        # advance by a single character so overlapping matches are counted
        cursor = string.find(sub, cursor + 1)
    return total

#### new func

In [2]:
def expected_value(n, k):
    '''
    Expected number of occurrences of one fixed k-mer in a sequence of
    length n: the number of length-k windows, n-k+1, divided by the 20**k
    possible k-mers.
    return expected_value:float
    '''
    windows = n - k + 1
    possible_kmers = 20 ** k
    return windows / possible_kmers

In [53]:
def jey_t(subseq):
    '''
    Self-overlap indicators of a k-mer.

    For every shift t = 1 .. k-1 the indicator is 1 when the k-mer can
    overlap a copy of itself moved t positions to the right, i.e. when its
    first k-t letters equal its last k-t letters, and 0 otherwise.
    Example: 'ABAB' -> [0, 1, 0] (only the shift of 2 lines up 'AB' with 'AB').

    return:
        list(len=k-1), element t-1 is the indicator for shift t
    '''
    k = len(subseq)
    # Both compared slices have length k-t: the prefix subseq[:k-t] and the
    # suffix subseq[t:]. Comparing against subseq[k-t:] (length t) instead
    # could only match when k == 2*t.
    return [int(subseq[:k - t] == subseq[t:]) for t in range(1, k)]

In [51]:
def std_variance_string(n, subseq, k):
    '''
    Standard deviation of the number of occurrences of a k-mer (subsequence)
    in a sequence of length n, for the same 20-letter model that
    expected_value() uses.

    With m = n-k+1 windows and p = 1/20**k the variance is
        m*p*(1-p)
        - 2*p**2 * (k-1)*(n - 3k/2 + 1)
        + 2*p * sum_{t=1..k-1} (m-t) * J_t / 20**t
    where J_t are the self-overlap indicators returned by jey_t().

    parameters:
        n: sequence length
        subseq: the k-mer
        k: length of the k-mer
    return:
        standard deviation (square root of the variance) as a numpy float;
        np.sqrt gives nan when the expression under the root is negative
    '''
    J_t = jey_t(subseq)

    # Sum of the per-window variances.
    a1 = ((n-k+1)/(20**k))*(1-1/(20**k))

    # -p**2 covariance part of every pair of overlapping windows. There are
    # sum_{t=1..k-1} (m-t) = (k-1)*(n - 3k/2 + 1) such pairs, so the (k-1)
    # factor is required; it equals 1 for k=2, where results are unchanged.
    a2 = (2/(20**(2*k)))*(k-1)*(n-(3/2)*k+1)

    # Pairs at shift t can both match only if the k-mer overlaps itself there.
    a3_0 = 0
    for t in range(1, k):
        a3_0 += (n-k+1-t)*J_t[t-1]/(20**t)
    a3 = a3_0*2/(20**k)

    return (np.sqrt(a1-a2+a3))

In [46]:
def normalize_freq(freq, subseq, n, k):
    '''
    Standardise an observed k-mer count: subtract the value given by
    expected_value(n, k) and divide by the value given by
    std_variance_string(n, subseq, k), i.e. (freq - e) / std.
    return:
        normalization_freq:float
    '''
    deviation = freq - expected_value(n, k)
    return deviation / std_variance_string(n, subseq, k)

In [47]:
def finding_freq_single_protein(seq, subseq_list):
    '''
    Normalised k-mer frequencies of a single protein.

    Every k-mer of subseq_list is counted in seq (overlapping occurrences
    included) and the count is passed through normalize_freq().

    parameters:
        seq: protein sequence as a str
        subseq_list: list of k-mers; all are assumed to have the length of
            the first one, which is taken as k
    return:
        list of normalised frequencies, in the order of subseq_list
        (an empty list when subseq_list is empty)
    '''
    if not subseq_list:
        return []

    n = len(seq)
    k = len(subseq_list[0])

    # One pass over all length-k windows replaces a full rescan of the
    # sequence for each k-mer; a Counter returns 0 for absent k-mers.
    window_counts = Counter(seq[i:i + k] for i in range(n - k + 1))

    return [normalize_freq(window_counts[subseq], subseq, n, k)
            for subseq in subseq_list]

In [54]:
def main_analyzes(path, k_mer_num, part_begin=0, part_end = 1):
    '''
    Lego construction with all functions above: read a FASTA file, compute
    the normalised k-mer frequency vector of each selected protein and write
    the table to a CSV file.

    param:
        path: path to the FASTA file
        k_mer_num: k-mer length
        part_begin: fraction (0..1, default=0) of the records where the
            slice starts
        part_end: fraction (0..1, default=1) of the records where the
            slice ends (exclusive)
    output:
        'csv_data_math/<organism>.csv' for the full file, otherwise
        'csv_data_math/<organism>_part_begin_<b>_part_end_<e>.csv', with the
        columns 'Organism', 'Protein' and one column per k-mer
    return:
        0
    '''
    # Create the output directory first, so a missing directory cannot
    # fail the run only after all the frequencies have been computed.
    makedirs('csv_data_math', exist_ok=True)

    # initializing subseqs
    subseq_list = making_subseq(k_mer_num)
    table_columns = ['Organism', 'Protein'] + subseq_list

    #reading
    prot_records, organism_name = read_fasta_file(path)

    first = int(np.floor(len(prot_records)*part_begin))
    last = int(np.floor(len(prot_records)*part_end))

    # Rows are collected in a list and the frame is built once; enlarging a
    # DataFrame row by row with .loc gets slower as the table grows.
    rows = []
    for i in tqdm.tqdm_notebook(range(first, last)):
        prot_name, seq = seqio_data(prot_records[i])
        freq_vector = finding_freq_single_protein(seq, subseq_list)
        rows.append([organism_name, prot_name] + freq_vector)

    proteins_data = pd.DataFrame(rows, columns=table_columns)

    if part_end == 1 and part_begin == 0:
        writing_path = 'csv_data_math/' + organism_name + '.csv'
    else:
        writing_path = 'csv_data_math/' + organism_name + '_part_begin_' + str(part_begin) + '_part_end_' + str(part_end) + '.csv'

    proteins_data.to_csv(writing_path)

    return 0

In [55]:
# all paths

# os.listdir() gives no ordering guarantee, so the listing is sorted
# (case-insensitively) to make the cell reproducible.
data_files = sorted(
    (f for f in listdir('../data/fasta') if isfile(join('../data/fasta', f))),
    key=str.lower,
)

# The human proteome is run separately in parts, so it is left out of the
# batch list by name instead of by its position in the listing.
# Paths are built by plain concatenation so they keep the '../data/fasta/'
# prefix whatever separator os.path.join would use on this platform.
human_file = 'human_proteome.fasta'
files_path = ['../data/fasta/' + f for f in data_files if f != human_file]
if human_file in data_files:
    print('../data/fasta/' + human_file)

../data/fasta/human_proteome.fasta


In [56]:
# Show every file name found in ../data/fasta (the human proteome is still part of this list).
data_files

['CLMD_trachomatis.fasta',
 'CLOS_difficile.fasta',
 'CMV_AD169.fasta',
 'CPBT_jejuni.fasta',
 'EBV_AG876.fasta',
 'HCV_IsoH.fasta',
 'HIV1_HXB2.fasta',
 'human_proteome.fasta',
 'KLEB_pneumoniae.fasta',
 'MYBT_smegmatis.fasta',
 'MYPL_pneumoniae.fasta',
 'MYPL_synoviae.fasta',
 'SHGL_flexneri.fasta',
 'SLML_typhimurium.fasta',
 'YERS_pestis.fasta',
 'YFV_17D.fasta']

In [57]:
# Run the k = 2 analysis for every path in files_path; each call writes its own CSV file.
for x in files_path:
    main_analyzes(x, 2)

HBox(children=(IntProgress(value=0, max=895), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3762), HTML(value='')))




HBox(children=(IntProgress(value=0, max=190), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1623), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5126), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6602), HTML(value='')))




HBox(children=(IntProgress(value=0, max=687), HTML(value='')))




HBox(children=(IntProgress(value=0, max=679), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4103), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4533), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3909), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [58]:
# Human proteome, part 1: the records from fraction 0 up to (not including) fraction 0.3 of the file, k = 2.
main_analyzes('../data/fasta/human_proteome.fasta', 2, part_begin=0, part_end=0.3)

HBox(children=(IntProgress(value=0, max=22304), HTML(value='')))




0

In [59]:
# Human proteome, part 2: the records from fraction 0.3 up to (not including) fraction 0.6 of the file, k = 2.
main_analyzes('../data/fasta/human_proteome.fasta', 2, part_begin=0.3, part_end=0.6)

HBox(children=(IntProgress(value=0, max=22305), HTML(value='')))




0

In [60]:
# Human proteome, part 3: the records from fraction 0.6 to the end of the file, k = 2.
main_analyzes('../data/fasta/human_proteome.fasta', 2, part_begin=0.6, part_end=1)

HBox(children=(IntProgress(value=0, max=29740), HTML(value='')))




0