# 3. Alignment threshold sims, Li et al.
___
Dr. Raffael lab <br>
2024

In [1]:
import random
from copy import deepcopy
import multiprocessing as mp

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import norm, normaltest
from Bio import pairwise2
from Bio import Align
from Bio.Align import substitution_matrices as matlist
# from Bio.SubsMat import MatrixInfo as matlist

import matplotlib.pyplot as pl



## Helper functions

In [2]:
def performSim(rec, numSims, gap_open = -10, gap_extend = -0.5):
    """Score a reference sequence against randomly shuffled copies of itself.

    The reference record (i.e., the selected representative sequence for a
    given enzyme) is aligned against pseudo sequences built by randomly
    shuffling the reference sequence's own residues. The resulting scores
    form a distribution of "alignment to a random sequence", which is used to
    establish a cutoff for "significant" alignment scores when the reference
    is aligned to each other sequence in the fetched set of records.

    Arguments:
        (pd.Series) rec: the selected sequence representative of a given enzyme
            and associated metadata; must provide 'name' and 'rep_seq'
        (int) numSims: number of simulations to perform
        (int|float) gap_open: score applied when a gap is opened
            (assigned to the aligner's open_gap_score)
        (int|float) gap_extend: score applied when a gap is extended
            (assigned to the aligner's extend_gap_score)

    Returns:
        (pd.DataFrame) one row per simulation with columns 'sim_index'
            (1-based), 'name' (the reference enzyme name) and 'aln_score'
            (the pairwise alignment score)

    """
    refname = rec['name']
    refSeq = rec.rep_seq

    # The aligner does not depend on the shuffled sequence, so it is built
    # once instead of once per simulation. The gap scores come from the
    # function arguments (they were previously hard-coded, which silently
    # ignored any non-default gap_open / gap_extend).
    aligner = Align.PairwiseAligner()
    aligner.open_gap_score = gap_open
    aligner.extend_gap_score = gap_extend
    aligner.substitution_matrix = matlist.load('BLOSUM62')

    alnRec = []
    for simIndex in range(1, numSims + 1):
        # Shuffle a fresh copy of the residues; shuffles happen in simulation
        # order, so a seeded `random` yields the same sequences as before.
        shuffledResidues = list(refSeq)
        random.shuffle(shuffledResidues)
        shuffledSeq = ''.join(shuffledResidues)

        alnRec.append({'sim_index': simIndex,
                       'name': refname,
                       'aln_score': aligner.score(refSeq, shuffledSeq)})

    # Explicit columns keep the schema stable even when numSims is 0
    return pd.DataFrame(alnRec, columns = ['sim_index', 'name', 'aln_score'])

def performSimChunked(refRecords, numSims, gap_open = -10, gap_extend = -0.5):
    """Run performSim for every record of a chunk and stack the results.

    A wrapper to perform simulations in batches, which enables a progress bar
    while parallelizing.

    Arguments:
        (pd.DataFrame) refRecords: a dataframe with each row corresponding to
            a selected sequence representative for a given enzyme and associated
            metadata
        (int) numSims: number of simulations to perform per record
        (int|float) gap_open: gap-opening score forwarded to performSim
        (int|float) gap_extend: gap-extension score forwarded to performSim

    Returns:
        (pd.DataFrame) a dataframe containing rows (records) of simulation results
            including an index, the reference sequence enzyme name, and a pairwise
            alignment score; empty (with the same columns) when refRecords has
            no rows

    """
    allresults = [
        performSim(rec, numSims, gap_open = gap_open, gap_extend = gap_extend)
        for _, rec in refRecords.iterrows()
    ]

    if not allresults:
        # pd.concat raises ValueError on an empty list. An empty chunk (e.g.
        # from splitting few records across many workers) is not an error, so
        # return an empty frame that concatenates cleanly with other chunks.
        return pd.DataFrame(columns = ['sim_index', 'name', 'aln_score'])

    return pd.concat(allresults)

### 1. Read the representatives summary

In [3]:
# Columns of the representatives summary that the simulations need
colsToKeep = ['name','fullpath', 'rep_desc', 'rep_seq']

# Load the summary, keep only those columns, and order the records by enzyme name
aln_references = (
    pd.read_csv('Lietal_BlastReps/200128_Lietal_BlastReps_Summary.csv')[colsToKeep]
    .sort_values('name')
    .reset_index(drop=True)
)

aln_references.head(5)

Unnamed: 0,name,fullpath,rep_desc,rep_seq
0,steroid_DELTA-isomerase,Lietal_FetchedSeqs/parsed_fastas_cleaned_tempe...,OBI39333.1|1038276474|steroid DELTA-isomerase|...,MPNAAKTEAIKSTVNRYIELVAKGSADDLVELYADDATVEDPVGGE...


### 2. Perform the simulations

In [None]:
# # use an initial state for the RNG
# random.seed(a = 1)

# recordRepo = []

# numchunks = 80
# for recordSet in tqdm(np.array_split(aln_references, numchunks), total = numchunks):

#     numthreads = 24
#     pool = mp.Pool(numthreads)
#     results = []
    
#     numSims = 500
#     result_objects = [pool.apply_async(performSimChunked, args=(chunk,numSims)) 
#                       for chunk in np.array_split(recordSet.reset_index(), numthreads)]

#     try:
#     # result_objects is a list of pool.ApplyResult objects
#         cleaned_DataSets = pd.concat([r.get() for r in result_objects])
#         recordRepo.append(cleaned_DataSets)
#         recordRepo.append(cleaned_DataSets)  # NOTE(review): duplicate of the previous line; would double-count every record if re-enabled
#     except:
#         pass  
        
#     pool.close()
#     pool.join()

# recordRepoDF = pd.concat(recordRepo)

In [4]:
# Seed the RNG so the shuffled sequences (and therefore the score
# distributions and thresholds) are reproducible across kernel restarts.
random.seed(a = 1)

numSims = 500

# Errors are deliberately not caught here: swallowing them would leave
# recordRepo empty and make pd.concat fail with an unrelated
# "No objects to concatenate" message that hides the real cause.
cleaned_DataSets = performSimChunked(aln_references, numSims)
recordRepo = [cleaned_DataSets]

recordRepoDF = pd.concat(recordRepo)

### 3. Describe the simulations, calculate metrics

In [5]:
# Show the raw per-simulation records (sim_index, name, aln_score)
recordRepoDF

Unnamed: 0,sim_index,name,aln_score
0,1,steroid_DELTA-isomerase,-2.5
1,2,steroid_DELTA-isomerase,-19.5
2,3,steroid_DELTA-isomerase,-8.0
3,4,steroid_DELTA-isomerase,-29.0
4,5,steroid_DELTA-isomerase,-17.5
...,...,...,...
495,496,steroid_DELTA-isomerase,-2.5
496,497,steroid_DELTA-isomerase,-11.5
497,498,steroid_DELTA-isomerase,-12.0
498,499,steroid_DELTA-isomerase,2.0


In [6]:
# Per-enzyme summary statistics of the simulated alignment scores
recordRepoDFAggStats = (
    recordRepoDF[['name', 'aln_score']]
    .groupby('name')
    .agg(['mean', 'median', 'std', normaltest])
)
# Flatten the two-level column index down to the statistic names
recordRepoDFAggStats.columns = recordRepoDFAggStats.columns.get_level_values(1)
recordRepoDFAggStats_reset = recordRepoDFAggStats.reset_index()

# Each 'normaltest' entry is indexable as (statistic, p-value); split it into two columns
recordRepoDFAggStats_reset['normalityTest_pval'] = recordRepoDFAggStats_reset['normaltest'].apply(lambda v: v[1])
recordRepoDFAggStats_reset['normalityTest_statistic'] = recordRepoDFAggStats_reset['normaltest'].apply(lambda v: v[0])

# Candidate significance cutoffs: three standard deviations above the centre
recordRepoDFAggStats_reset['mean+3sd'] = recordRepoDFAggStats_reset['mean'] + 3*recordRepoDFAggStats_reset['std']
recordRepoDFAggStats_reset['median+3sd'] = recordRepoDFAggStats_reset['median'] + 3*recordRepoDFAggStats_reset['std']

# NOTE: a rename of 'mean'/'median'/'std' to '*_aln_score' used to be called
# here, but its result was discarded, so it never took effect. The no-op is
# removed and the unrenamed column names are kept, since they are what the
# saved summary contains.
# Assign instead of dropping in place so that re-running the cell is safe.
recordRepoDFAggStats_reset = recordRepoDFAggStats_reset.drop(columns = ['normaltest'])

recordRepoDFAggStats_indexed = recordRepoDFAggStats_reset.set_index('name')
recordRepoDFAggStats_indexed.head()

Unnamed: 0_level_0,mean,median,std,normalityTest_pval,normalityTest_statistic,mean+3sd,median+3sd
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
steroid_DELTA-isomerase,-7.297,-8.25,11.460409,6.200654e-08,33.192052,27.084227,27.084227


In [7]:
# Attach the per-enzyme simulation statistics to the reference records,
# matching rows on the enzyme name
referencesByName = aln_references.set_index('name')
alnSimResults = referencesByName.join(recordRepoDFAggStats_indexed)

alnSimResults.head()

Unnamed: 0_level_0,fullpath,rep_desc,rep_seq,mean,median,std,normalityTest_pval,normalityTest_statistic,mean+3sd,median+3sd
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
steroid_DELTA-isomerase,Lietal_FetchedSeqs/parsed_fastas_cleaned_tempe...,OBI39333.1|1038276474|steroid DELTA-isomerase|...,MPNAAKTEAIKSTVNRYIELVAKGSADDLVELYADDATVEDPVGGE...,-7.297,-8.25,11.460409,6.200654e-08,33.192052,27.084227,27.084227


### 4. Save the results

In [13]:
# Write the joined summary as a plain (uncompressed) CSV.
# When running for every enzyme, add compression = 'bz2' to the to_csv call
# to compress the file, as the original code did.
summaryCsvPath = 'Lietal_BlastReps/Lietal_200128_RepSummary_AlnSims_Summary.csv'
alnSimResults.to_csv(summaryCsvPath)


<br>

### 5. For comparison's sake, calculate the mean and standard deviation of the observed alignment scores for each enzyme using an empirically fitted normal distribution (scipy.stats.norm.fit)

In [9]:
def fitNorm(df):
    """Fit a normal distribution to the alignment scores of one group.

    Arguments:
        (pd.DataFrame) df: rows for a single enzyme; must contain 'aln_score'

    Returns:
        (pd.DataFrame) a single-row dataframe with columns 'mean' and 'std'
            holding the location and scale returned by scipy.stats.norm.fit

    """
    scores = df['aln_score'].to_numpy()
    fittedMean, fittedStd = norm.fit(scores)
    return pd.DataFrame([{'mean': fittedMean, 'std': fittedStd}])

# Fit a normal distribution per enzyme; groupby.apply adds an inner index
# level ('level_1' after reset_index) that carries no information, so drop it
recordRepoDFAggStats_fitDist = (
    recordRepoDF[['name', 'aln_score']]
    .groupby('name')
    .apply(fitNorm)
    .reset_index()
    .drop(columns = ['level_1'])
)

# Same cutoff definition as for the sample statistics: mean plus three standard deviations
fitMean = recordRepoDFAggStats_fitDist['mean']
fitStd = recordRepoDFAggStats_fitDist['std']
recordRepoDFAggStats_fitDist['mean+3sd'] = fitMean + 3*fitStd

recordRepoDFAggStats_fitDist_indexed = recordRepoDFAggStats_fitDist.set_index('name')
recordRepoDFAggStats_fitDist_indexed.head()

Unnamed: 0_level_0,mean,std,mean+3sd
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
steroid_DELTA-isomerase,-7.297,11.448943,27.049828
