# 7. Logistic regression post-processing
___
Dr. Raffael lab <br>
2024

In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

## Import the raw results

In [2]:
log_regress_root = Path('Lietal_UncorrectedLogisticRegression')

regression_recs_list = []
for path in tqdm(log_regress_root.iterdir(), desc = 'Parsing logistic regression records'):
    if path.stem.startswith('.'):
        pass
    else:
        try:
            regression_recs_list.append(pd.read_csv(path, index_col = 0))
        except:
            print('Path could not be parsed: {}'.format(path))
            raise RuntimeError
    
regression_recs = pd.concat(regression_recs_list).reset_index(drop = True).sort_values('pval')
regression_recs.head()

Parsing logistic regression records: 1it [00:00, 35.04it/s]


Unnamed: 0,Position,var,estimate,stderr,statistic,pval,type,alpha,convergence,Enzyme,position,residue
2682,68,(Intercept),-14.213309,1.731576,-8.208309,2.243255e-16,uncorrected,,,steroid_DELTA-isomerase,68,C
2683,68,temps,0.415145,0.053114,7.816162,5.445821e-15,uncorrected,,,steroid_DELTA-isomerase,68,C
3126,79,(Intercept),-11.377629,1.500899,-7.580541,3.441159e-14,uncorrected,,,steroid_DELTA-isomerase,79,E
3127,79,temps,0.333903,0.046789,7.13629,9.58835e-13,uncorrected,,,steroid_DELTA-isomerase,79,E
1694,43,(Intercept),-9.337702,1.396343,-6.687257,2.273927e-11,uncorrected,,,steroid_DELTA-isomerase,43,I


In [3]:
print('There are {} regressed positions/residues over {} enzymes'.format(len(regression_recs), len(regression_recs.Enzyme.unique())))

There are 5080 regressed positions/residues over 1 enzymes


In [4]:
# regression_recs.to_csv('Lietal_LogisticRegressionSummaries/2001_UncorrectedRecords.csv.bz2')
#For multiple enzymes use above
#for single enzyme use below
regression_recs.to_csv('Lietal_LogisticRegressionSummaries/2001_UncorrectedRecords.csv')

### Which positions are significant for the uncorrected regression?

In [5]:
# regression_recs = pd.read_csv('Lietal_LogisticRegressionSummaries/2001_UncorrectedRecords.csv.bz2', index_col = 0)
#For multiple enzymes use above
#for single enzyme use below
regression_recs = pd.read_csv('Lietal_LogisticRegressionSummaries/2001_UncorrectedRecords.csv', index_col = 0)

In [6]:
regression_recs.head()

Unnamed: 0,Position,var,estimate,stderr,statistic,pval,type,alpha,convergence,Enzyme,position,residue
2682,68,(Intercept),-14.213309,1.731576,-8.208309,2.243255e-16,uncorrected,,,steroid_DELTA-isomerase,68,C
2683,68,temps,0.415145,0.053114,7.816162,5.445821e-15,uncorrected,,,steroid_DELTA-isomerase,68,C
3126,79,(Intercept),-11.377629,1.500899,-7.580541,3.441159e-14,uncorrected,,,steroid_DELTA-isomerase,79,E
3127,79,temps,0.333903,0.046789,7.13629,9.58835e-13,uncorrected,,,steroid_DELTA-isomerase,79,E
1694,43,(Intercept),-9.337702,1.396343,-6.687257,2.273927e-11,uncorrected,,,steroid_DELTA-isomerase,43,I


In [7]:
num_tests = len(regression_recs)/2
significance = 0.05

significance_corrected = significance/num_tests

print('Num tests: {}\nCorrected significance threshold: {}'.format(num_tests, significance_corrected))

Num tests: 2540.0
Corrected significance threshold: 1.9685039370078743e-05


In [8]:
regression_recs['unique_index'] = regression_recs.Enzyme+'|'+regression_recs.position.astype(str)+'|'+regression_recs.residue

In [9]:
regression_recs_subset = regression_recs.loc[(regression_recs['var']=='temps')&(regression_recs['pval']<significance_corrected)]

In [10]:
regression_recs_subset['pval_alpha0.5_corrected'] = significance_corrected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_recs_subset['pval_alpha0.5_corrected'] = significance_corrected


In [11]:
regression_recs_subset.rename(columns = {'position': 'sig_pos', 'residue':'sig_res'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_recs_subset.rename(columns = {'position': 'sig_pos', 'residue':'sig_res'}, inplace = True)


In [12]:
regression_recs_subset.index.name = 'Index'
regression_recs_subset.head()

Unnamed: 0_level_0,Position,var,estimate,stderr,statistic,pval,type,alpha,convergence,Enzyme,sig_pos,sig_res,unique_index,pval_alpha0.5_corrected
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2683,68,temps,0.415145,0.053114,7.816162,5.445821e-15,uncorrected,,,steroid_DELTA-isomerase,68,C,steroid_DELTA-isomerase|68|C,2e-05
3127,79,temps,0.333903,0.046789,7.13629,9.58835e-13,uncorrected,,,steroid_DELTA-isomerase,79,E,steroid_DELTA-isomerase|79|E,2e-05
1695,43,temps,0.264811,0.043309,6.114515,9.685092e-10,uncorrected,,,steroid_DELTA-isomerase,43,I,steroid_DELTA-isomerase|43|I,2e-05
1535,39,temps,0.275036,0.046227,5.949673,2.686781e-09,uncorrected,,,steroid_DELTA-isomerase,39,I,steroid_DELTA-isomerase|39|I,2e-05
2361,60,temps,0.235434,0.040637,5.793529,6.89224e-09,uncorrected,,,steroid_DELTA-isomerase,60,A,steroid_DELTA-isomerase|60|A,2e-05


In [13]:
# regression_recs_subset.to_csv('Lietal_LogisticRegressionSummaries/2001_UncorrectedRecords_SignificantOnly.csv.bz2')
#For multiple enzymes use above
#for single enzyme use below
regression_recs_subset.to_csv('Lietal_LogisticRegressionSummaries/2001_UncorrectedRecords_SignificantOnly.csv')

In [14]:
print('Sites meeting corrected significance threshold: {}'.format(len(regression_recs_subset)))

Sites meeting corrected significance threshold: 34


## Generate a clean representation for the significant sites

In [15]:
#regression_recs_subset = pd.read_csv('Lietal_LogisticRegressionSummaries/2001_UncorrectedRecords_SignificantOnly.csv.bz2')
#For multiple enzymes use above
#for single enzyme use below
regression_recs_subset = pd.read_csv('Lietal_LogisticRegressionSummaries/2001_UncorrectedRecords_SignificantOnly.csv')

In [16]:
regression_recs_subset_simple = regression_recs_subset[['Enzyme', 'Position', 'sig_res']].rename(columns = {'sig_res': 'Residue'})
regression_recs_subset_simple.head()

Unnamed: 0,Enzyme,Position,Residue
0,steroid_DELTA-isomerase,68,C
1,steroid_DELTA-isomerase,79,E
2,steroid_DELTA-isomerase,43,I
3,steroid_DELTA-isomerase,39,I
4,steroid_DELTA-isomerase,60,A


In [17]:
regression_recs_subset_simple.to_csv('Lietal_LogisticRegressionSummaries/2001_VanillaSignificant_sites.csv')