# Macro pKa calculation

This notebook calculates macro pKa values for the SM25–SM46 states from several data sources.  
First, **SM25** is used as an example to walk through the calculation, using the energy information from the file **pKa-ECRISM-1.csv**.

In [85]:
import pandas as pd
import numpy as np
import os

In [11]:
# Load the ECRISM predictions and print the raw column names for reference.
data = pd.read_csv('data/pKa-ECRISM-1.csv')
print(data.columns)
# Keep only the columns used below. Note that the 'ID tag ' column name
# carries a trailing space in this file.
data = data[['Molecule Id', 'ID tag ', 'total charge', 'pKa mean']]
# Rows whose 'Molecule Id' is SM25_micro000, used as the worked example.
data_samp = data[data['Molecule Id'] == 'SM25_micro000']
data_samp

Index(['Molecule Id', 'ID tag ', 'total charge', 'pKa mean', 'pKa SEM',
       'pKa model uncertainty', 'SMILES of extra microstate', 'Unnamed: 7',
       'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'],
      dtype='object')


Unnamed: 0,Molecule Id,ID tag,total charge,pKa mean
0,SM25_micro000,SM25_micro003,-1,7.91
1,SM25_micro000,SM25_micro001,-1,-6.66
2,SM25_micro000,SM25_micro002,0,-7.52
3,SM25_micro000,SM25_micro004,0,-12.08
4,SM25_micro000,SM25_micro005,1,-2.33


In [74]:
# List the distinct 'Molecule Id' values present in the file.
list(data['Molecule Id'].unique())

['SM25_micro000',
 'SM26_micro000',
 'SM27_micro000',
 'SM28_micro000',
 'SM29_micro000',
 'SM30_micro000',
 'SM31_micro000',
 'SM32_micro000',
 'SM33_micro000',
 'SM34_micro000',
 'SM35_micro000',
 'SM36_micro000',
 'SM37_micro000',
 'SM38_micro000',
 'SM39_micro000',
 'SM40_micro000',
 'SM41_micro000',
 'SM42_micro000',
 'SM43_micro000',
 'SM44_micro000',
 'SM45_micro000',
 'SM46_micro000']

From this table we can see three categories of microstates, with charges -1, 0 and 1. We first need to collect the energy and charge of each microstate.

In [13]:
# Microstate tags listed for the SM25 example, in file order.
list(data_samp['ID tag '])

['SM25_micro003',
 'SM25_micro001',
 'SM25_micro002',
 'SM25_micro004',
 'SM25_micro005']

In [29]:
# Collect name, charge and energy of every SM25 microstate. The reference
# microstate SM25_micro000 is added by hand with charge 0 and energy 0.
state = ['SM25_micro000']
charge = [0]
energy = [0]
# Walk the rows in tag order and read charge/energy from the same row.
# (Filtering data_samp with a mask built from the full `data` frame relies on
# pandas re-aligning the mask, and int()/float() on a Series is deprecated.)
# mergesort is stable, so rows with equal tags keep their file order.
ordered = data_samp.sort_values('ID tag ', kind='mergesort')
for tag, q, e in zip(ordered['ID tag '],
                     ordered['total charge'],
                     ordered['pKa mean']):
    state.append(tag)
    charge.append(int(q))
    energy.append(float(e))
print(state, charge, energy)

state_info = pd.DataFrame({'state_name':state, 'charge':charge, 'energy':energy})
state_info

['SM25_micro000', 'SM25_micro001', 'SM25_micro002', 'SM25_micro003', 'SM25_micro004', 'SM25_micro005'] [0, -1, 0, -1, 0, 1] [0, -6.66, -7.52, 7.91, -12.08, -2.33]




Unnamed: 0,charge,energy,state_name
0,0,0.0,SM25_micro000
1,-1,-6.66,SM25_micro001
2,0,-7.52,SM25_micro002
3,-1,7.91,SM25_micro003
4,0,-12.08,SM25_micro004
5,1,-2.33,SM25_micro005


In [30]:
# NOTE: the value of this first expression is not shown; a notebook cell
# only displays its last expression.
state_info.charge.unique()
# Select the microstates whose charge is 0.
state_info[state_info['charge'] == 0]

Unnamed: 0,charge,energy,state_name
0,0,0.0,SM25_micro000
2,0,-7.52,SM25_micro002
4,0,-12.08,SM25_micro004


We have 3 different states with 0 charge, so we need to calculate the probability of each state using the [Boltzmann distribution](https://en.wikipedia.org/wiki/Boltzmann_distribution).

In [31]:
# Rescale the energies by dividing by 1.688; the original note describes this
# as a conversion from kcal/mol to kT.
# NOTE(review): the values come from the 'pKa mean' column -- confirm the
# input units and the direction of this factor.
state_info['energy'] = state_info['energy']/1.688
state_info

Unnamed: 0,charge,energy,state_name
0,0,0.0,SM25_micro000
1,-1,-3.945498,SM25_micro001
2,0,-4.454976,SM25_micro002
3,-1,4.686019,SM25_micro003
4,0,-7.156398,SM25_micro004
5,1,-1.380332,SM25_micro005


Based on the Boltzmann distribution, the (unnormalized) weight of state $i$ is $P_i = \exp(-\Delta E_i / kT)$.

In [36]:
# Boltzmann weight exp(-energy) of each state, using the rescaled energies.
# These weights are not normalised yet.
state_info['Pi'] = np.exp(-state_info['energy'])
state_info

Unnamed: 0,charge,energy,state_name,Pi
0,0,0.0,SM25_micro000,1.0
1,-1,-3.945498,SM25_micro001,51.70206
2,0,-4.454976,SM25_micro002,86.054112
3,-1,4.686019,SM25_micro003,0.009223
4,0,-7.156398,SM25_micro004,1282.283952
5,1,-1.380332,SM25_micro005,3.976221


In [60]:
# Sum the weights of all states that share the same charge; transform('sum')
# puts each group's total back on every row of that group so it can be used
# as the normalisation constant.
state_info['Pi_total'] = state_info.groupby('charge').Pi.transform('sum')
state_info

Unnamed: 0,charge,energy,state_name,Pi,Pi_total
0,0,0.0,SM25_micro000,1.0,1369.338063
1,-1,-3.945498,SM25_micro001,51.70206,51.711284
2,0,-4.454976,SM25_micro002,86.054112,1369.338063
3,-1,4.686019,SM25_micro003,0.009223,51.711284
4,0,-7.156398,SM25_micro004,1282.283952,1369.338063
5,1,-1.380332,SM25_micro005,3.976221,3.976221


Normalize the probabilities within each charge group.

In [61]:
# Probability of each state within its own charge group.
state_info['Pi_norm'] = state_info['Pi']/state_info['Pi_total']

In [62]:
# Display the current state table.
state_info

Unnamed: 0,charge,energy,state_name,Pi,Pi_total,Pi_norm
0,0,0.0,SM25_micro000,1.0,1369.338063,0.00073
1,-1,-3.945498,SM25_micro001,51.70206,51.711284,0.999822
2,0,-4.454976,SM25_micro002,86.054112,1369.338063,0.062844
3,-1,4.686019,SM25_micro003,0.009223,51.711284,0.000178
4,0,-7.156398,SM25_micro004,1282.283952,1369.338063,0.936426
5,1,-1.380332,SM25_micro005,3.976221,3.976221,1.0


In [63]:
# Probability-weighted energy of each state (Pi_norm * energy).
state_info['energy_norm'] = state_info['Pi_norm']*state_info['energy']
state_info

Unnamed: 0,charge,energy,state_name,Pi,Pi_total,Pi_norm,energy_norm
0,0,0.0,SM25_micro000,1.0,1369.338063,0.00073,0.0
1,-1,-3.945498,SM25_micro001,51.70206,51.711284,0.999822,-3.944794
2,0,-4.454976,SM25_micro002,86.054112,1369.338063,0.062844,-0.279967
3,-1,4.686019,SM25_micro003,0.009223,51.711284,0.000178,0.000836
4,0,-7.156398,SM25_micro004,1282.283952,1369.338063,0.936426,-6.701438
5,1,-1.380332,SM25_micro005,3.976221,3.976221,1.0,-1.380332


G = U - TS. We already have U (the probability-weighted energy), so now we calculate the entropy term using $TS = -\sum_i P_i \ln P_i$.

In [65]:
# Per-state entropy contribution -p * ln(p), with p the normalised
# probability of the state within its charge group.
state_info['TS'] = -state_info['Pi_norm']*np.log(state_info['Pi_norm'])
state_info

Unnamed: 0,charge,energy,state_name,Pi,Pi_total,Pi_norm,energy_norm,TS
0,0,0.0,SM25_micro000,1.0,1369.338063,0.00073,0.0,0.005274
1,-1,-3.945498,SM25_micro001,51.70206,51.711284,0.999822,-3.944794,0.000178
2,0,-4.454976,SM25_micro002,86.054112,1369.338063,0.062844,-0.279967,0.173895
3,-1,4.686019,SM25_micro003,0.009223,51.711284,0.000178,0.000836,0.00154
4,0,-7.156398,SM25_micro004,1282.283952,1369.338063,0.936426,-6.701438,0.061509
5,1,-1.380332,SM25_micro005,3.976221,3.976221,1.0,-1.380332,-0.0


In [70]:
# Add up the weighted energies and the entropy contributions of all states
# that share a charge; reset_index() turns 'charge' back into a column.
sum_energy = state_info.groupby('charge')[['energy_norm', 'TS']].sum().reset_index()
sum_energy

Unnamed: 0,charge,energy_norm,TS
0,-1,-3.943958,0.001718
1,0,-6.981405,0.240678
2,1,-1.380332,0.0


In [71]:
# Free energy of each charge group: G = U - TS, with U taken from the
# summed 'energy_norm' column and TS from the summed 'TS' column.
sum_energy['G'] = sum_energy['energy_norm'] - sum_energy['TS']
sum_energy

Unnamed: 0,charge,energy_norm,TS,G
0,-1,-3.943958,0.001718,-3.945676
1,0,-6.981405,0.240678,-7.222083
2,1,-1.380332,0.0,-1.380332


## Model for calculating all states in one file

In [79]:
def _macrostate_free_energies(data_samp, micro):
    """Return the per-charge free-energy table for a single molecule.

    Parameters
    ----------
    data_samp : pandas.DataFrame
        Rows belonging to one molecule. Must provide the columns
        'ID tag ', 'total charge' and 'pKa mean'.
    micro : str
        The molecule's 'Molecule Id' value. It is added as an extra state
        with charge 0 and energy 0, and the text before its first
        underscore is used as the molecule label.

    Returns
    -------
    pandas.DataFrame
        One row per charge with the columns 'charge', 'energy_norm',
        'TS', 'G' and 'state'.
    """
    # Read name, charge and energy from the same row, in tag order. This
    # avoids masking data_samp with a boolean Series built from a different
    # frame and avoids int()/float() on a one-element Series.
    # mergesort is stable, so rows with equal tags keep their file order.
    ordered = data_samp.sort_values('ID tag ', kind='mergesort')
    state_info = pd.DataFrame({
        'state_name': [micro] + list(ordered['ID tag ']),
        'charge': [0] + [int(q) for q in ordered['total charge']],
        'energy': [0.0] + [float(e) for e in ordered['pKa mean']],
    })

    # NOTE(review): the factor 1.688 is kept from the original code, where it
    # is described as a kcal/mol -> kT conversion; confirm against the units
    # of the 'pKa mean' column.
    state_info['energy'] = state_info['energy'] / 1.688

    # Boltzmann weights, normalised within each charge group.
    state_info['Pi'] = np.exp(-state_info['energy'])
    state_info['Pi_total'] = state_info.groupby('charge')['Pi'].transform('sum')
    state_info['Pi_norm'] = state_info['Pi'] / state_info['Pi_total']

    # Per-state contributions to the mean energy and to the entropy term.
    state_info['energy_norm'] = state_info['Pi_norm'] * state_info['energy']
    state_info['TS'] = -state_info['Pi_norm'] * np.log(state_info['Pi_norm'])

    sum_energy = (
        state_info.groupby('charge')[['energy_norm', 'TS']].sum().reset_index()
    )
    sum_energy['G'] = sum_energy['energy_norm'] - sum_energy['TS']
    # Label with the part before the first underscore ('SM25_micro000' ->
    # 'SM25') instead of a fixed four-character slice, so names of other
    # lengths are not truncated.
    sum_energy['state'] = micro.split('_')[0]
    return sum_energy


def energy_calculation(data):
    """Print the per-charge free-energy table of every molecule in ``data``.

    For each distinct 'Molecule Id', the microstates listed for it are
    combined with a zero-energy, zero-charge reference state, Boltzmann
    weights are normalised within each charge, and G = U - TS is printed
    per charge.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Molecule Id', 'ID tag ' (note the trailing
        space), 'total charge' and 'pKa mean'. Other columns are ignored.

    Returns
    -------
    None
        The tables are written to standard output.
    """
    data = data[['Molecule Id', 'ID tag ', 'total charge', 'pKa mean']]
    # Rows without a molecule id cannot be assigned to any molecule; skip them
    # instead of failing on the missing name.
    for micro in data['Molecule Id'].dropna().unique():
        data_samp = data[data['Molecule Id'] == micro]
        print(_macrostate_free_energies(data_samp, micro))

In [84]:
# Run the per-molecule calculation on a different prediction file; the
# resulting tables are printed by energy_calculation.
data = pd.read_csv('data/pKa_prediction_Iorga_Beckstein_1.csv')
energy_calculation(data)



   charge  energy_norm        TS         G state
0      -1    -1.541228  0.072003 -1.613231  SM25
1       0    -3.096427  0.442401 -3.538829  SM25
2       1     0.385071  0.000000  0.385071  SM25
   charge  energy_norm        TS         G state
0      -1     1.828324  0.047432  1.780892  SM26
1       0     0.272189  0.388031 -0.115841  SM26
2       1     3.755924  0.000000  3.755924  SM26
   charge  energy_norm   TS         G state
0      -1     3.619668  0.0  3.619668  SM27
1       0     0.000000  0.0  0.000000  SM27
   charge  energy_norm        TS         G state
0      -1     5.217853  0.019231  5.198622  SM28
1       0     0.035902  0.043239 -0.007337  SM28
2       1     2.316351  0.000000  2.316351  SM28
   charge  energy_norm   TS     G state
0      -1         3.75  0.0  3.75  SM29
1       0         0.00  0.0  0.00  SM29
   charge  energy_norm   TS         G state
0      -1     3.471564  0.0  3.471564  SM30
1       0     0.000000  0.0  0.000000  SM30
   charge  energy_norm   TS 

## Calculate for all files

In [88]:
# Run the calculation for every CSV file in the data directory.
path = './data'

# os.listdir returns every directory entry in arbitrary order; keep only the
# CSV files (anything else would make pd.read_csv fail) and sort them so the
# run is reproducible.
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))

for f in files:
    print('Calculating for file:', f)
    print('------------------------------------------------------------------')
    data = pd.read_csv(os.path.join(path, f))
    energy_calculation(data)
    print('------------------------------------------------------------------')
    

Calculating for file: pKa_RodriguezPaluch_SMD_3.csv
------------------------------------------------------------------
   charge  energy_norm        TS         G state
0      -1    -5.426438  0.000111 -5.426549  SM25
1       0    -7.579080  0.004367 -7.583447  SM25
   charge  energy_norm        TS          G state
0      -1   -13.097679  0.000731 -13.098410  SM26
1       0     0.000055  0.000059  -0.000004  SM26
   charge  energy_norm   TS       G state
0      -1      -8.2109  0.0 -8.2109  SM27
1       0       0.0000  0.0  0.0000  SM27
   charge  energy_norm        TS             G state
0      -1   -11.303318  0.000000 -1.130332e+01  SM28
1       0     0.000004  0.000004 -2.413622e-07  SM28
2       1     4.312796  0.000000  4.312796e+00  SM28
   charge  energy_norm   TS         G state
0      -1    -8.056872  0.0 -8.056872  SM29
1       0     0.000000  0.0  0.000000  SM29
   charge  energy_norm   TS        G state
0      -1     -7.64218  0.0 -7.64218  SM30
1       0      0.00000  0.0 



   charge  energy_norm   TS         G state
0      -1    -7.037915  0.0 -7.037915  SM33
1       0     0.000000  0.0  0.000000  SM33
   charge  energy_norm   TS         G state
0      -1    -8.501185  0.0 -8.501185  SM34
1       0     0.000000  0.0  0.000000  SM34
   charge  energy_norm        TS        G state
0      -1    -7.560965  0.603395 -8.16436  SM35
1       0    -0.437946  0.641004 -1.07895  SM35
   charge  energy_norm        TS         G state
0      -1    -5.639846  0.693130 -6.332975  SM36
1       0    -1.661023  0.384964 -2.045986  SM36
   charge  energy_norm        TS         G state
0      -1    -5.373223  0.693147 -6.066370  SM37
1       0    -1.539308  0.409400 -1.948709  SM37
2       1     4.755301  0.665840  4.089462  SM37
   charge  energy_norm   TS        G state
0      -1     -7.35782  0.0 -7.35782  SM38
1       0      0.00000  0.0  0.00000  SM38
   charge  energy_norm   TS         G state
0      -1    -6.018957  0.0 -6.018957  SM39
1       0     0.000000  0.0  0.0