# PubChemPy is a library for automating PubChem searches

We will install and import it, and refer to it by its conventional short alias, `pcp`.

In [2]:
# First, install the pubchempy package
!pip install pubchempy

# Then import it
import pubchempy as pcp

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pubchempy
[33m  DEPRECATION: Building 'pubchempy' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'pubchempy'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
  Building wheel for pubchempy (setup.py) ... [?25ldone
[?25h  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13818 sha256=8eafcf15a701ece3f1357e173ca7573d50f435d7091b91e2aa65d151f1cf0853
  Stored in directory: /Users/tosapornsattasathuchana/Library/Caches/pip/wheels/de/2c/0f/eacc36a5f674b71ef0d58227c40759f56a0b8332b0c9880

First, take a look at https://pubchem.ncbi.nlm.nih.gov/.
Search for penicillin and look at its 2D/3D structure images and the other information on the page.

# Let's do the same thing in Python. This is far faster than searching by hand if you are examining a million molecules
Here we search by name. Other supported search types include SMILES, formula, InChI, InChIKey, and SDF.

In [3]:
# Look up compounds whose name matches 'penicillin'; the call returns a list.
results = pcp.get_compounds('penicillin', namespace='name')

In [4]:
# Show the list returned by the name search.
print(results)

[Compound(52946032)]


What is this? It is a list of results, and there is only one compound in the list.
Let's assign that one compound to a variable.

In [5]:
# Take the first (here, the only) compound out of the result list.
penicillin = results[0]

What can we get out of this result? Type `penicillin.` and press the Tab key to list the available attributes.

In [6]:
# Evaluating the bare name displays the object's repr: its class and PubChem CID.
penicillin

Compound(52946032)

In [7]:
# The PubChem compound identifier (CID) of this record.
penicillin.cid

52946032

In [8]:
# SMILES string for the compound; the value shown for this record carries no
# stereochemistry markers.
penicillin.canonical_smiles

'CC1(C(N2CC(C2S1)NC(=O)CC3=CC=CC=C3)C(=O)O)C'

In [9]:
# SMILES string that also encodes stereocentres (the @ / @@ annotations).
penicillin.isomeric_smiles

'CC1([C@@H](N2C[C@H]([C@H]2S1)NC(=O)CC3=CC=CC=C3)C(=O)O)C'

In [10]:
# Systematic IUPAC name stored on the record.
penicillin.iupac_name

'(2S,5R,6R)-3,3-dimethyl-6-[(2-phenylacetyl)amino]-4-thia-1-azabicyclo[3.2.0]heptane-2-carboxylic acid'

In [11]:
# List of Atom objects, one per atom in the molecule (hydrogens included).
penicillin.atoms

[Atom(1, S),
 Atom(2, O),
 Atom(3, O),
 Atom(4, O),
 Atom(5, N),
 Atom(6, N),
 Atom(7, C),
 Atom(8, C),
 Atom(9, C),
 Atom(10, C),
 Atom(11, C),
 Atom(12, C),
 Atom(13, C),
 Atom(14, C),
 Atom(15, C),
 Atom(16, C),
 Atom(17, C),
 Atom(18, C),
 Atom(19, C),
 Atom(20, C),
 Atom(21, C),
 Atom(22, C),
 Atom(23, H),
 Atom(24, H),
 Atom(25, H),
 Atom(26, H),
 Atom(27, H),
 Atom(28, H),
 Atom(29, H),
 Atom(30, H),
 Atom(31, H),
 Atom(32, H),
 Atom(33, H),
 Atom(34, H),
 Atom(35, H),
 Atom(36, H),
 Atom(37, H),
 Atom(38, H),
 Atom(39, H),
 Atom(40, H),
 Atom(41, H),
 Atom(42, H)]

In [12]:
# Total number of atoms in the record.
len(penicillin.atoms)

42

In [13]:
# List of Bond objects.
# NOTE(review): each Bond appears to display as (first atom id, second atom id,
# bond order) — confirm against the PubChemPy documentation.
penicillin.bonds

[Bond(1, 7, 1),
 Bond(1, 11, 1),
 Bond(2, 14, 1),
 Bond(2, 35, 1),
 Bond(3, 14, 2),
 Bond(4, 15, 2),
 Bond(5, 7, 1),
 Bond(5, 9, 1),
 Bond(5, 10, 1),
 Bond(6, 8, 1),
 Bond(6, 15, 1),
 Bond(6, 34, 1),
 Bond(7, 8, 1),
 Bond(7, 23, 1),
 Bond(8, 10, 1),
 Bond(8, 24, 1),
 Bond(9, 11, 1),
 Bond(9, 14, 1),
 Bond(9, 25, 1),
 Bond(10, 26, 1),
 Bond(10, 27, 1),
 Bond(11, 12, 1),
 Bond(11, 13, 1),
 Bond(12, 28, 1),
 Bond(12, 29, 1),
 Bond(12, 30, 1),
 Bond(13, 31, 1),
 Bond(13, 32, 1),
 Bond(13, 33, 1),
 Bond(15, 16, 1),
 Bond(16, 17, 1),
 Bond(16, 36, 1),
 Bond(16, 37, 1),
 Bond(17, 18, 2),
 Bond(17, 19, 1),
 Bond(18, 20, 1),
 Bond(18, 38, 1),
 Bond(19, 21, 2),
 Bond(19, 39, 1),
 Bond(20, 22, 2),
 Bond(20, 40, 1),
 Bond(21, 22, 1),
 Bond(21, 41, 1),
 Bond(22, 42, 1)]

In [14]:
# Molecular formula as a string.
penicillin.molecular_formula

'C16H20N2O3S'

In [15]:
# Molecular weight. With this PubChemPy version the value comes back as a
# string (note the quotes in the output); convert with float() before doing
# arithmetic on it.
penicillin.molecular_weight

'320.4'

In [16]:
# The raw nested dictionary behind the Compound object. The output is very
# long; inspect individual keys (e.g. penicillin.record['atoms']) when you only
# need one part.
penicillin.record


{'id': {'id': {'cid': 52946032}},
 'atoms': {'aid': [1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42],
  'element': [16,
   8,
   8,
   8,
   7,
   7,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1]},
 'bonds': {'aid1': [1,
   1,
   2,
   2,
   3,
   4,
   5,
   5,
   5,
   6,
   6,
   6,
   7,
   7,
   8,
   8,
   9,
   9,
   9,
   10,
   10,
   11,
   11,
   12,
   12,
   12,
   13,
   13,
   13,
   15,
   16,
   16,
   16,
   17,
   17,
   18,
   18,
   19,
   19,
   20,
   20,
   21,
   21,
   22],
  'aid2': [7,
   11,
   14,
   35,
   14,
   15,
   7,
   9,
   10,
   8,
   15,
   34,
   

In [17]:
# Fetch penicillin again, this time requesting the 3D record so that atoms
# carry x/y/z coordinates.
# A dedicated name is used instead of rebinding `results`, so the earlier 2D
# search results stay intact if earlier cells are re-run out of order.
penicillin3d_matches = pcp.get_compounds('penicillin', 'name', record_type='3d')
if not penicillin3d_matches:
    # Fail with a clear message rather than an IndexError on [0].
    raise ValueError("PubChem returned no 3D record for 'penicillin'")
penicillin3d = penicillin3d_matches[0]

In [18]:
# Atom objects of the 3D record.
penicillin3d.atoms

[Atom(1, S),
 Atom(2, O),
 Atom(3, O),
 Atom(4, O),
 Atom(5, N),
 Atom(6, N),
 Atom(7, C),
 Atom(8, C),
 Atom(9, C),
 Atom(10, C),
 Atom(11, C),
 Atom(12, C),
 Atom(13, C),
 Atom(14, C),
 Atom(15, C),
 Atom(16, C),
 Atom(17, C),
 Atom(18, C),
 Atom(19, C),
 Atom(20, C),
 Atom(21, C),
 Atom(22, C),
 Atom(23, H),
 Atom(24, H),
 Atom(25, H),
 Atom(26, H),
 Atom(27, H),
 Atom(28, H),
 Atom(29, H),
 Atom(30, H),
 Atom(31, H),
 Atom(32, H),
 Atom(33, H),
 Atom(34, H),
 Atom(35, H),
 Atom(36, H),
 Atom(37, H),
 Atom(38, H),
 Atom(39, H),
 Atom(40, H),
 Atom(41, H),
 Atom(42, H)]

In [19]:
# Pick out the first atom in the list (index 0).
penicillin3d.atoms[0]

Atom(1, S)

This is an `Atom` object (the capitalized name in the output is its class). Objects expose extra attributes through dot notation. Let's see what it has:

# Print the element and coordinates of each atom

In [20]:
# One line per atom: element symbol followed by its x, y, z coordinates.
for a in penicillin3d.atoms:
    print(a.element, a.x, a.y, a.z, sep=", ")

S, 2.6826, -1.1919, -1.7048
O, 1.4653, 2.7152, 0.7974
O, 3.2411, 1.8058, 1.8997
O, -1.6554, -2.3145, 1.1331
N, 2.5149, -0.6582, 0.9748
N, 0.0054, -1.7908, -0.4004
C, 2.5739, -1.796, 0.0221
C, 1.1441, -2.1947, 0.4294
C, 2.0809, 0.5244, 0.2071
C, 1.3763, -1.3089, 1.6646
C, 2.8566, 0.5437, -1.1261
C, 2.2148, 1.4794, -2.1532
C, 4.3557, 0.8506, -0.9895
C, 2.3503, 1.7246, 1.0671
C, -1.3162, -1.888, 0.0322
C, -2.2948, -1.4076, -1.0215
C, -3.1451, -0.2524, -0.5631
C, -4.3672, -0.4917, 0.0652
C, -2.7104, 1.0571, -0.7669
C, -5.1547, 0.5786, 0.4897
C, -3.4978, 2.1274, -0.3425
C, -4.7201, 1.8881, 0.2857
H, 3.3364, -2.5394, 0.2809
H, 1.0597, -3.2618, 0.6726
H, 0.9989, 0.5124, 0.0135
H, 0.5584, -0.6488, 1.9745
H, 1.6879, -1.8765, 2.5508
H, 1.1475, 1.2679, -2.2876
H, 2.303, 2.5249, -1.8378
H, 2.6999, 1.3965, -3.133
H, 4.5141, 1.886, -0.667
H, 4.8547, 0.2024, -0.2605
H, 4.8759, 0.7359, -1.9483
H, 0.1815, -1.4274, -1.3338
H, 1.6249, 3.5097, 1.3501
H, -2.9322, -2.2608, -1.2873
H, -1.7805, -1.1326, -1.95

# Search for similar structures
Penicillin has a β-lactam (4-membered ring) fused to a thiazolidine (5-membered ring).

In [21]:
# Structure-similarity search seeded with penicillin's CID; returns a list of
# Compound objects.
results = pcp.get_compounds(
    penicillin.cid,
    namespace='cid',
    searchtype='similarity',
)

In [22]:
# Show the compounds returned by the similarity search.
print(results)

[Compound(5904), Compound(6249), Compound(23664709), Compound(23663979), Compound(23668834), Compound(23565), Compound(52946032), Compound(640429), Compound(2349), Compound(6713928), Compound(10250769), Compound(15574941), Compound(2174), Compound(7048611), Compound(39031), Compound(445702), Compound(65054), Compound(443984), Compound(25195407), Compound(255293), Compound(98994), Compound(517573), Compound(20055036), Compound(2825766), Compound(24746778), Compound(71797), Compound(448911), Compound(23682214), Compound(71796), Compound(6914605), Compound(5244176), Compound(3246457), Compound(3085040), Compound(197739), Compound(197868), Compound(23699504), Compound(44134885), Compound(3034014), Compound(23701796), Compound(16051961), Compound(23684994), Compound(23716482), Compound(4084), Compound(107496), Compound(46174099), Compound(101718), Compound(121494077), Compound(199209), Compound(6604512), Compound(139593604), Compound(23689391), Compound(10363242), Compound(44377540), Compou

# Print the common name of the first 20 compounds

In [23]:
# For each of the first 20 hits, print its first synonym; when a compound has
# no synonyms, fall back to its IUPAC name.
for hit in results[:20]:
    names = hit.synonyms
    label = names[0] if names else hit.iupac_name
    print(label)
    


penicillin g
ampicillin
Penicillin G potassium
AMPICILLIN SODIUM
Penicillin G sodium
AMPICILLIN TRIHYDRATE
1406-05-9
PICIBANIL
Cillin
metampicillin
PENAMECILLIN
Azidocillin
Alpen
(2S,5R,6R)-6-[[(2R)-2-azaniumyl-2-phenylacetyl]amino]-3,3-dimethyl-7-oxo-4-thia-1-azabicyclo[3.2.0]heptane-2-carboxylate
Sulbenicilina
D-benzylpenicilloic acid
Benethamine penicillin
sulbenicillin sodium
METAMPICILLIN SODIUM
Penilloic acid


# Make an XYZ file to feed into a program like GAMESS

The format of an xyz file is:

1. the number of atoms
2. a blank line (the format's comment line, left empty here)
3. one line per atom: `element x y z`

In [24]:
# Assemble the XYZ text: atom count, an empty line, then one
# "element x y z" row per atom.
xyz_header = f"{len(penicillin3d.atoms)}\n\n"
xyz_rows = "".join(
    f"{atom.element}     {atom.x}     {atom.y}     {atom.z}\n"
    for atom in penicillin3d.atoms
)
penicillinStr = xyz_header + xyz_rows
print(penicillinStr)

42

S     2.6826     -1.1919     -1.7048
O     1.4653     2.7152     0.7974
O     3.2411     1.8058     1.8997
O     -1.6554     -2.3145     1.1331
N     2.5149     -0.6582     0.9748
N     0.0054     -1.7908     -0.4004
C     2.5739     -1.796     0.0221
C     1.1441     -2.1947     0.4294
C     2.0809     0.5244     0.2071
C     1.3763     -1.3089     1.6646
C     2.8566     0.5437     -1.1261
C     2.2148     1.4794     -2.1532
C     4.3557     0.8506     -0.9895
C     2.3503     1.7246     1.0671
C     -1.3162     -1.888     0.0322
C     -2.2948     -1.4076     -1.0215
C     -3.1451     -0.2524     -0.5631
C     -4.3672     -0.4917     0.0652
C     -2.7104     1.0571     -0.7669
C     -5.1547     0.5786     0.4897
C     -3.4978     2.1274     -0.3425
C     -4.7201     1.8881     0.2857
H     3.3364     -2.5394     0.2809
H     1.0597     -3.2618     0.6726
H     0.9989     0.5124     0.0135
H     0.5584     -0.6488     1.9745
H     1.6879     -1.8765     2.5508
H     1.1475     1.2

# View the XYZ structure with py3Dmol

In [28]:
# First, install the missing package
!pip install py3Dmol

# Then import the required libraries
import py3Dmol
import ipywidgets as widgets
from IPython.display import display



In [29]:
# Build a 400x400 viewer
view = py3Dmol.view(width=400, height=400)

# Load the XYZ text assembled earlier as the model to display
view.addModel(penicillinStr, 'xyz')

# Draw every atom/bond in stick representation
stick_style = {'stick': {}}
view.setStyle(stick_style)

# Fit the molecule in the viewport and render it
view.zoomTo()
view.show()

# View one of the similar compounds we found earlier

In [30]:
# Fetch the 3D record for sulbenicillin and turn it into XYZ text.
# A dedicated name is used instead of rebinding `results`, so the
# similarity-search list from the earlier cell is not overwritten (re-running
# the "first 20 compounds" cell after this one would otherwise iterate over
# the wrong list).
sulbenicillin_matches = pcp.get_compounds('sulbenicillin', 'name', record_type='3d')
if not sulbenicillin_matches:
    # Fail with a clear message rather than an IndexError on [0].
    raise ValueError("PubChem returned no 3D record for 'sulbenicillin'")
sulbenicillin3d = sulbenicillin_matches[0]

# XYZ layout: atom count, an empty line, then one "element x y z" row per atom.
sulbenicillinStr = f"{len(sulbenicillin3d.atoms)}\n\n" + "".join(
    f"{atom.element}     {atom.x}     {atom.y}     {atom.z}\n"
    for atom in sulbenicillin3d.atoms
)
print(sulbenicillinStr)

45

S     -1.526     1.3227     0.6352
S     4.0938     -1.6318     -0.2337
O     -2.3604     -2.5948     -1.2003
O     -5.7857     0.0791     -1.4836
O     -5.6281     -0.7216     0.6447
O     1.7316     -1.5796     1.6338
O     3.4259     -3.1191     -0.2589
O     4.613     -1.374     1.0959
O     4.9165     -1.4688     -1.4174
N     -2.9136     -0.8394     0.2772
N     0.2367     -1.1021     -0.0742
C     -1.8725     -0.3823     1.2025
C     -3.2899     1.5365     0.0763
C     -3.67     0.1439     -0.4713
C     -0.9757     -1.4936     0.6401
C     -2.1691     -1.8434     -0.2711
C     -3.3382     2.5999     -1.0222
C     -4.1167     1.9924     1.2863
C     -5.1149     -0.2205     -0.3446
C     1.5108     -1.1794     0.4928
C     2.6009     -0.7093     -0.4511
C     2.9498     0.7493     -0.2821
C     3.4065     1.4882     -1.3736
C     2.8151     1.3588     0.9655
C     3.7288     2.8363     -1.2175
C     3.1373     2.7069     1.1214
C     3.5941     3.4456     0.0298
H     -2.162  

In [32]:
# Build a viewer with py3Dmol's default size and load the sulbenicillin XYZ text
view2 = py3Dmol.view()
view2.addModel(sulbenicillinStr, 'xyz')

# Draw every atom/bond in stick representation
stick_style = {'stick': {}}
view2.setStyle(stick_style)

# Fit the molecule in the viewport and render it
view2.zoomTo()
view2.show()