In [None]:
! conda install -c rdkit rdkit -y
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
import pandas as pd
import os

In [None]:
# Load the curated bioactivity table from a CSV in the working directory.
df = pd.read_csv('PARP1_03_bioactivity_IC50_data_curated.csv')

In [None]:
# Show the loaded table.
df

In [None]:
# Number of compounds per bioactivity class.
class_ac = df['class'].value_counts()

# A Series has no `columns`, so assigning to `class_ac.columns` never renamed
# anything. Name the index and the count column explicitly so the table (and the
# CSV written from it) has the same 'class' / 'count' header on every pandas version.
class_table = class_ac.rename_axis('class').to_frame('count')
class_table

In [None]:
# Save the per-class counts next to the notebook.
class_table.to_csv('PARP1_bioactivity_profile_IC50.csv')

In [None]:
# Copy of the table without the SMILES column; a cleaned SMILES column is re-attached later.
df_no_smiles = df.drop(columns='canonical_smiles')

In [None]:
# For each entry keep only the longest '.'-separated component of its SMILES
# string. Entries are passed through str() first, so non-string values are
# handled as their text form.
smiles = pd.Series(
    [max(str(entry).split('.'), key=len) for entry in df.canonical_smiles.tolist()],
    name='canonical_smiles',
)

In [None]:
# Re-attach the cleaned SMILES as the last column; concat with axis=1 matches rows on index labels.
df_clean_smiles = pd.concat([df_no_smiles,smiles], axis=1)
df_clean_smiles

In [None]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [None]:
# Inspired by: https://codeocean.com/explore/capsules?query=tag:data-curation

def lipinski(smiles, verbose=False):
    """Compute the four Lipinski descriptors for each SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per compound.
    verbose : bool, default False
        When True, report how many entries could not be converted to a molecule.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with float columns
        ``MW``, ``LogP``, ``NumHDonors`` and ``NumHAcceptors``. Entries that are
        not strings, or that RDKit cannot parse, produce a row of NaN so the
        result stays positionally aligned with the input (it is concatenated
        column-wise with the source table afterwards).
    """
    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    rows = []
    n_invalid = 0
    for elem in smiles:
        # MolFromSmiles returns None for unparsable SMILES and rejects
        # non-string input, so guard both cases before computing descriptors.
        mol = Chem.MolFromSmiles(elem) if isinstance(elem, str) else None
        if mol is None:
            n_invalid += 1
            rows.append([np.nan] * len(columnNames))
            continue

        rows.append([Descriptors.MolWt(mol),
                     Descriptors.MolLogP(mol),
                     Lipinski.NumHDonors(mol),
                     Lipinski.NumHAcceptors(mol)])

    if verbose and n_invalid:
        print("lipinski: %d of %d entries could not be parsed" % (n_invalid, len(rows)))

    # Build the frame once from the collected rows. This also handles zero or
    # one input entries, which the previous vstack-based accumulation did not.
    return pd.DataFrame(rows, columns=columnNames, dtype=float)

In [None]:
# Compute the descriptors from the cleaned SMILES (longest '.'-separated
# component per entry) prepared in df_clean_smiles. The raw column can contain
# multi-component strings, whose extra fragments would distort MW and the
# H-bond counts.
df_lipinski = lipinski(df_clean_smiles.canonical_smiles)
df_lipinski

In [None]:
# Put the descriptor columns side by side with the bioactivity table (axis=1 matches rows on index labels).
df_combined = pd.concat([df,df_lipinski], axis=1)
df_combined

In [None]:
import numpy as np

def pIC50(input):
    """Convert the ``standard_value_norm`` column (nM) into a ``pIC50`` column.

    pIC50 is ``-log10`` of the molar concentration, where molar = nM * 1e-9.

    Parameters
    ----------
    input : pandas.DataFrame
        Table containing a numeric ``standard_value_norm`` column in nM.

    Returns
    -------
    pandas.DataFrame
        A new table with ``pIC50`` added and ``standard_value_norm`` removed.
        The frame passed in is left unmodified, so the function can be re-run
        on the same input. Zero or negative concentrations produce inf / NaN.
    """
    molar = input['standard_value_norm'] * (10**-9)  # Converts nM to M

    # drop(columns=...) replaces the positional-axis form drop(name, 1), which
    # pandas 2.x no longer accepts.
    return input.assign(pIC50=-np.log10(molar)).drop(columns='standard_value_norm')

In [None]:
# Summary statistics of the raw standard_value column, inspected before capping.
df_combined.standard_value.describe()

In [None]:
# Sanity check: a value of 100,000,000 nM is 0.1 M, which gives a pIC50 of 1.
-np.log10( (10**-9)* 100000000 )

In [None]:
# Sanity check: a value of 10,000,000,000 nM is 10 M, which gives a negative pIC50 (-1).
-np.log10( (10**-9)* 10000000000 )

In [None]:
def norm_value(input, cap=100000000):
    """Cap the ``standard_value`` column and return it as ``standard_value_norm``.

    Parameters
    ----------
    input : pandas.DataFrame
        Table containing a numeric ``standard_value`` column.
    cap : number, default 100000000
        Upper limit; larger values are replaced by ``cap``. With values in nM
        the default keeps the resulting pIC50 from dropping below 1.

    Returns
    -------
    pandas.DataFrame
        A new table with ``standard_value_norm`` added and ``standard_value``
        removed. The frame passed in is left unmodified. Missing values stay
        missing.
    """
    capped = input['standard_value'].clip(upper=cap)

    # drop(columns=...) replaces the positional-axis form drop(name, 1), which
    # pandas 2.x no longer accepts.
    return input.assign(standard_value_norm=capped).drop(columns='standard_value')

In [None]:
# Cap extreme values so the later log transform stays in a sensible range.
df_norm = norm_value(df_combined)
df_norm

In [None]:
# Summary statistics after capping; the maximum should not exceed the cap used by norm_value.
df_norm.standard_value_norm.describe()

In [None]:
# Replace the capped nM values with pIC50.
df_final = pIC50(df_norm)
df_final

In [None]:
# NOTE(review): this cell repeats the previous one verbatim and looks redundant;
# confirm and delete it.
df_final = pIC50(df_norm)
df_final

In [None]:
# Summary statistics of the computed pIC50 column.
df_final.pIC50.describe()

In [None]:
# Save the full table (all classes) with pIC50; the index is written as the first CSV column.
df_final.to_csv('PARP1_04_bioactivity_data_3class_pIC50.csv')

In [None]:
#Removing the 'intermediate' bioactivity class
df_2class = df_final[df_final['class']  != 'intermediate']
df_2class

In [None]:
# Save the table without the 'intermediate' class; the index is written as the first CSV column.
df_2class.to_csv('PARP1_05_bioactivity_data_2class_pIC50.csv')

In [None]:
#df_2class = pd.read_csv('PARP1_05_bioactivity_data_2class_pIC50.csv', index_col=0)

In [None]:
# Per-class mean and standard deviation of each Lipinski descriptor.
# A single groupby object is reused for all eight aggregations.
by_class = df_2class.groupby('class')

mean_MW, SD_MW = by_class["MW"].mean(), by_class["MW"].std()
mean_LogP, SD_LogP = by_class["LogP"].mean(), by_class["LogP"].std()
mean_NumHDonors, SD_NumHDonors = by_class["NumHDonors"].mean(), by_class["NumHDonors"].std()
mean_NumHAcceptors, SD_NumHAcceptors = by_class["NumHAcceptors"].mean(), by_class["NumHAcceptors"].std()

In [None]:
# Series to stack, ordered descriptor by descriptor: mean first, then SD.
total = [
    mean_MW, SD_MW,
    mean_LogP, SD_LogP,
    mean_NumHDonors, SD_NumHDonors,
    mean_NumHAcceptors, SD_NumHAcceptors,
]

# One label per stacked value, in the same order as `total`:
# <stat><descriptor><class suffix>, with suffix 'ac' before 'in'.
label = [
    stat + descriptor + suffix
    for descriptor in ('MW', 'LogP', 'NumHDonors', 'NumHAcceptors')
    for stat in ('mean', 'SD')
    for suffix in ('ac', 'in')
]

In [None]:
# Stack all per-class statistics into one long column and attach the labels.
statistic_Ro5 = pd.concat(total, axis=0)
Ro5 = statistic_Ro5.to_frame(name="stat").assign(label=label)
Ro5

In [None]:
# Save the statistics without the index; each row is identified by its 'label' column.
Ro5.to_csv("Ro5_stat.csv",index=False)

In [None]:
# `Ro5_statistic` was never defined and raised NameError; the statistics table is `Ro5`.
Ro5

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns
sns.set(style='ticks')
import matplotlib.pyplot as plt

In [None]:
# Bar chart of the number of compounds in each bioactivity class.
fig, ax = plt.subplots(figsize=(7, 7))

sns.countplot(x='class', data=df_2class, edgecolor='black', ax=ax)

ax.set_xlabel('bioactivity class', fontsize=14, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=14, fontweight='bold')

fig.savefig('plot_bioactivity_class.tiff', bbox_inches='tight', pad_inches=0.1)

In [None]:
# Scatter plot of MW against LogP, coloured by class and sized by pIC50.
plt.figure(figsize=(7, 7))

sns.scatterplot(x='MW', y='LogP', data=df_2class, hue='class', size='pIC50', edgecolor='black', alpha=0.7)
# NOTE(review): set_style is called after the scatter plot has been drawn, so it
# presumably only affects figures created later in the notebook (including the
# very large "ytick.major.size" of 100) — confirm whether it was meant to come
# before plt.figure() or to be scoped to this plot only.
sns.set_style("whitegrid", {"ytick.major.size": 100,"xtick.major.size": 2, 'grid.linestyle': 'solid'})

plt.xlabel('MW', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

plt.savefig('plot_MW_vs_LogP.tiff', bbox_inches='tight', pad_inches=0.1)

In [None]:
# Box plot of pIC50 for each bioactivity class.
fig, ax = plt.subplots(figsize=(5.5, 5.5))

median_line = dict(color="black", alpha=1, linewidth=2)
sns.boxplot(x='class', y='pIC50', data=df_2class, medianprops=median_line, ax=ax)

ax.set_xlabel('Bioactivity class', fontsize=14, fontweight='bold')
ax.set_ylabel('pIC50 value', fontsize=14, fontweight='bold')

fig.savefig('plot_IC50.tiff')

Statistical analysis | Mann-Whitney U Test

In [None]:
def mannwhitney(descriptor, verbose=False, data=None):
  """Mann-Whitney U test of one descriptor between active and inactive compounds.

  Parameters
  ----------
  descriptor : str
      Name of the numeric column to compare.
  verbose : bool, default False
      When True, print the result row.
  data : pandas.DataFrame, optional
      Table with a 'class' column holding 'active' / 'inactive' labels and the
      descriptor column. Defaults to the notebook-level ``df_2class``.

  Returns
  -------
  pandas.DataFrame
      One row with columns Descriptor, Statistics, p, alpha, Interpretation.
      The same row is also written to ``mannwhitneyu_<descriptor>.csv``.

  Raises
  ------
  ValueError
      If either class has no non-missing value for the descriptor.
  """
  # https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
  from scipy.stats import mannwhitneyu

  if data is None:
    data = df_2class

  # Missing values would make the p-value NaN, and NaN fails the `p > alpha`
  # comparison, so it used to be reported as "reject H0". Drop them up front.
  values = data[[descriptor, 'class']].dropna(subset=[descriptor])

  # actives and inactives
  active = values.loc[values['class'] == 'active', descriptor]
  inactive = values.loc[values['class'] == 'inactive', descriptor]
  if active.empty or inactive.empty:
    raise ValueError("mannwhitney: need at least one 'active' and one 'inactive' "
                     "value for descriptor %r" % descriptor)

  # compare samples; the alternative is stated explicitly because its default
  # differs between SciPy versions. The test uses no random numbers, so the
  # global NumPy RNG is no longer reseeded here.
  stat, p = mannwhitneyu(active, inactive, alternative='two-sided')

  # interpret
  alpha = 0.05
  if p > alpha:
    interpretation = 'Same distribution (fail to reject H0)'
  else:
    interpretation = 'Different distribution (reject H0)'

  results = pd.DataFrame({'Descriptor':descriptor,
                          'Statistics':stat,
                          'p':p,
                          'alpha':alpha,
                          'Interpretation':interpretation}, index=[0])
  filename = 'mannwhitneyu_' + descriptor + '.csv'
  results.to_csv(filename)

  if verbose:
    print(results.to_string(index=False))

  return results

In [None]:
# Compare pIC50 between the 'active' and 'inactive' rows; also writes mannwhitneyu_pIC50.csv.
mannwhitney('pIC50')

In [None]:
pip install statannotations

In [None]:
pip install statsmodels

In [None]:
from statannotations.Annotator import Annotator

In [None]:
# Box plot of MW per bioactivity class with a dashed reference line at 500.
fig, ax = plt.subplots(figsize=(5.5, 5.5))

mean_marker = {"marker":"o","markerfacecolor":"white",
               "markeredgecolor":"black","markersize":"8"}
median_line = dict(color="black", alpha=1, linewidth=3)

sns.boxplot(x='class', y='MW', data=df_2class, showmeans=True,
            meanprops=mean_marker, medianprops=median_line, ax=ax)
ax.axhline(500, ls='--', c='black')
ax.set(ylim=(100, 850))
ax.set(xlabel=None)
ax.set_ylabel('MW', fontsize=14, fontweight='bold')

fig.savefig('plot_MW.tiff')

In [None]:
# Compare MW between the 'active' and 'inactive' rows; also writes mannwhitneyu_MW.csv.
mannwhitney('MW')

LogP

In [None]:
# Box plot of LogP per bioactivity class with a dashed reference line at 5.
fig, ax = plt.subplots(figsize=(5.5, 5.5))

mean_marker = {"marker":"o","markerfacecolor":"white",
               "markeredgecolor":"black","markersize":"8"}
median_line = dict(color="black", alpha=1, linewidth=3)

sns.boxplot(x='class', y='LogP', data=df_2class, showmeans=True,
            meanprops=mean_marker, medianprops=median_line, ax=ax)
ax.axhline(5, ls='--', c='black')
ax.set(ylim=(-6, 12))
ax.set(xlabel=None)
ax.set_ylabel('LogP', fontsize=14, fontweight='bold')

fig.savefig('plot_LogP.tiff')

In [None]:
# Compare LogP between the 'active' and 'inactive' rows; also writes mannwhitneyu_LogP.csv.
mannwhitney('LogP')

NumHDonors

In [None]:
# Box plot of NumHDonors per bioactivity class with a dashed reference line at 5.
fig, ax = plt.subplots(figsize=(5.5, 5.5))

mean_marker = {"marker":"o","markerfacecolor":"white",
               "markeredgecolor":"black","markersize":"8"}
median_line = dict(color="black", alpha=1, linewidth=3)

sns.boxplot(x='class', y='NumHDonors', data=df_2class, showmeans=True,
            meanprops=mean_marker, medianprops=median_line, ax=ax)
ax.axhline(5, ls='--', c='black')
ax.set(ylim=(-0.5,10.5))
ax.set(xlabel=None)
ax.set_ylabel('NumHDonors', fontsize=14, fontweight='bold')

fig.savefig('plot_NumHDonors.tiff')

In [None]:
# Compare NumHDonors between the 'active' and 'inactive' rows; also writes mannwhitneyu_NumHDonors.csv.
mannwhitney('NumHDonors')

NumHAcceptors

In [None]:
# Box plot of NumHAcceptors per bioactivity class with a dashed reference line at 10.
fig, ax = plt.subplots(figsize=(5.5, 5.5))

mean_marker = {"marker":"o","markerfacecolor":"white",
               "markeredgecolor":"black","markersize":"8"}
median_line = dict(color="black", alpha=1, linewidth=3)

sns.boxplot(x='class', y='NumHAcceptors', data=df_2class, showmeans=True,
            meanprops=mean_marker, medianprops=median_line, ax=ax)
ax.axhline(10, ls='--', c='black')
ax.set(ylim=(0, 20))
ax.set(xlabel=None)
ax.set_ylabel('NumHAcceptors', fontsize=14, fontweight='bold')

fig.savefig('plot_NumHAcceptors.tiff')

In [None]:
# Compare NumHAcceptors between the 'active' and 'inactive' rows; also writes mannwhitneyu_NumHAcceptors.csv.
mannwhitney('NumHAcceptors')

In [None]:
!ls

In [None]:
# Names of the entries in the current working directory.
filesname=os.listdir()
filesname

In [None]:
from zipfile import ZipFile

# Collect the result files from the working directory, which is where the cells
# above saved them and where ZipFile.write() resolves the bare file names.
# The previous hard-coded 'H:\\Shared drives' listing did not match that.
filesname = sorted(os.listdir())

with ZipFile('output.zip', 'w') as myzip:
    for file in filesname:
        # A name cannot end with both extensions, so the former `and` test never
        # matched and the archive stayed empty. A tuple matches either one.
        if file.endswith((".csv", ".tiff")):
            myzip.write(file)

In [None]:
!zip -r results.zip . -i *.csv *.tiff

In [None]:
import glob
from zipfile import ZipFile

# ZipFile.write() takes one literal path and does not expand wildcards, so
# '*.csv' raised FileNotFoundError. Expand the pattern with glob and add each
# match. The with-block closes the archive even if adding a file fails.
with ZipFile('data_exploratory.zip', 'w') as zipObj:
    for csv_path in sorted(glob.glob('*.csv')):
        zipObj.write(csv_path)