In [1]:
import numpy as np
import scipy as scp
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
from Bio import SeqIO
import pandas as pd
import tqdm
import re
from os import listdir
from os.path import isfile, join

### Method for finding entry indices:

In [2]:
def occurrences_count(string, sub):
    '''
    Counting all ocurrances of substring in string using find() method
    variables:
        string - string where to find
        sub - string to find
    '''
    count = start = 0
    while True:
        start = string.find(sub, start) + 1
        if start > 0:
            count+=1
        else:
            return count

In [3]:
def occurrences_re(text, sub):
    '''
    Counting all ocurrances of substring in string using re expressions method
    variables:
        string - string where to find
        sub - string to find
    '''
    return len(re.findall('(?={0})'.format(sub), text))

In [4]:
# Showing overlapping problem

string = 'klasklaskl'
string.count('klaskl')

1

### Data:

In [5]:
# For example we will try to find all needed vectors for 4-mers

import itertools


# Amino acid data

global amino_dict
amino_dict = { 'ALA': 'A', 'ARG': 'R',
                  'ASN': 'N', 'ASP': 'D',
                  'CYS': 'C', 'GLN': 'Q',
                  'GLU': 'E', 'GLY': 'G',
                  'HIS': 'H', 'ILE': 'I',
                  'LEU': 'L', 'LYS': 'K',
                  'MET': 'M', 'PHE': 'F',
                  'PRO': 'P', 'SER': 'S',
                  'THR': 'T', 'TRP': 'W',
                  'TYR': 'Y', 'VAL': 'V' }

list_amino = list(amino_dict.values())

global amino_string

amino_string = ''.join(x for x in list_amino)

del list_amino

amino_string

'ARNDCQEGHILKMFPSTWYV'

In [6]:
# Make all posible combinations
# K-mer = 4

def making_subseq(k):
    '''Making all subsequences using aminoacids'''
    subseq_iter = itertools.product(amino_string, repeat = k)
    subseq_list = list(subseq_iter)
    del subseq_iter
    for i in range(len(subseq_list)):
        tup = subseq_list[i]
        subseq_list[i] = ''.join(tup)
    return subseq_list

In [18]:
def read_fasta_file(path):
    '''
    Reading fasta file
    return list (whole organism) of lists (for each protein) of SeqRecords
    '''
    fasta_test_file = SeqIO.parse(path, 'fasta')
    record_list = list(fasta_test_file)
    org_name = path.replace('../data/proteomes/', '').replace('.fasta', '')
    
    return record_list, org_name

### Sequence analyzing functions:

In [8]:
def seqio_data(seq_record):
    '''
    working with SeqRecord class
    
    parametrs:
        seq_record: SeqRecord class from Biopython
    
    return protein name and sequence
    '''
    protein = seq_record.name
    seq = str(seq_record.seq)
    
    return protein, seq

#seqio_data(SeqRecord)

In [9]:
# Functions to make index vectors
# May be helpful in future

def vectorizing_single_subseq(string, subseq, n, k_mer_num,
                              knutt_morris:callable):
    '''
    finding subseq's entry index vector for subseq for some string
    return vector
    '''
    
    #n = len(string)
    #m = len(subseq)
    
    vector = [0 for k in range(n-k_mer_num+1)]
    
    list_entry = knutt_morris(string, subseq)
    
    for x in list_entry:
        vector[x] = 1
        
    return np.array(vector)

def vectorizing_all_subseq(string, subseq_list,
                           vectorizing_single_subseq:callable):
    '''
    finding subseq's entry index vector for all subseqs from subseq_list
    !!! return dict: subseq -> vector
    '''
    n = len(string)
    k_mer_num = len(subseq_list[0])
    
    vector_list = []
    #matrix = []
    
    for subseq in subseq_list:
        vector = vectorizing_single_subseq(string, subseq, n, k_mer_num, knutt_morris)
        vector = vector.reshape((-1, 1))
        del vector
        vector_list.append(vector)
    
    return dict(zip(subseq_list, vector_list))

In [10]:
# Freq

def finding_freq_single_protein(seq, subseq_list):
    
    '''Finding frequnces for subsequences in single protein
    and scaling it with SKlearn StandardScaler()
    return vector_freq'''
    n = len(seq)
    k = len(subseq_list[0])
    
    vector_freq = []
    
    for x in subseq_list:
        vector_freq.append(float(occurrences_count(seq, x))/n)
    
    vector_freq = np.array(vector_freq)
    vector_freq = vector_freq.reshape((-1, 1))
    scaler = preprocessing.StandardScaler()
    vector_freq_scaled = scaler.fit_transform(vector_freq)
    del vector_freq
    
    return list(vector_freq_scaled)

### Main:

In [19]:
def main_analyzes(path, k_mer_num):
    '''
    Lego construction with all functions above
    return:
        organism name
        list of proteins
        list of frequency vectors scaled
        '''
    
    # initializing subseqs
    subseq_list = making_subseq(k_mer_num)
    table_columns = ['Organism', 'Protein'] + subseq_list
    proteins_data = pd.DataFrame(columns=table_columns)
    
    # Variables
    organism_name = ''
      
    #reading
    prot_records, organism_name = read_fasta_file(path)
    
    index = 0
    
    #prot_records stuff
    for i in tqdm.tqdm_notebook(range(len(prot_records))):
        seq_record = prot_records[i]
        prot_name, seq = seqio_data(seq_record)
        freq_vector = finding_freq_single_protein(seq,subseq_list)
        adding_row = []
        adding_row.append(organism_name)
        adding_row.append(prot_name)
        adding_row += freq_vector
        proteins_data.loc[index] = adding_row
        index += 1
    writing_path = 'csv_data/' + organism_name + '.csv'
    
    proteins_data.to_csv(writing_path)      
    
    del prot_records
    
    return 0

In [27]:
# all paths

data_files = [f for f in listdir('../data/proteomes') if isfile(join('../data/proteomes', f))]
files_path = []
for i in range(len(data_files)):
    files_path.append('../data/proteomes/'+ data_files[i])
files_path.pop(13)

'../data/proteomes/human_proteome.fasta'

In [28]:
for x in files_path:
    main_analyzes(x, 2)

HBox(children=(IntProgress(value=0, max=3909), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4391), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5126), HTML(value='')))




HBox(children=(IntProgress(value=0, max=679), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6602), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=79), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4103), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9647), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4533), HTML(value='')))




HBox(children=(IntProgress(value=0, max=73), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5563), HTML(value='')))




HBox(children=(IntProgress(value=0, max=74), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=190), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3676), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3762), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=686), HTML(value='')))




HBox(children=(IntProgress(value=0, max=190), HTML(value='')))




HBox(children=(IntProgress(value=0, max=895), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7154), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1623), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6449), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3993), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8038), HTML(value='')))


