diff --git a/licences/placeholder b/licences/.gitkeep similarity index 100% rename from licences/placeholder rename to licences/.gitkeep diff --git a/requirements.txt b/requirements.txt index cd20fb0d..587c0fea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ tqdm seaborn plotly requests +mordred ###### Requirements with Version Specifiers ###### numpy == 1.16.* diff --git a/tests/extend_descriptors/descriptor/test_mordred.py b/tests/extend_descriptors/descriptor/test_mordred.py new file mode 100644 index 00000000..5d6cde2a --- /dev/null +++ b/tests/extend_descriptors/descriptor/test_mordred.py @@ -0,0 +1,60 @@ +# Copyright (c) 2019. TsumiNa. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import pandas as pd +import pytest +from mordred._base.pandas_module import MordredDataFrame +from rdkit import Chem + +from xenonpy.contrib.extend_descriptors.descriptor import Mordred2DDescriptor + + +@pytest.fixture(scope='module') +def data(): + # ignore numpy warning + import warnings + print('ignore NumPy RuntimeWarning\n') + warnings.filterwarnings("ignore", message="numpy.dtype size changed") + warnings.filterwarnings("ignore", message="numpy.ndarray size changed") + + smis = ['C(C(O)C1(O))C(CO)OC1O', + 'CC(C1=CC=CC=C1)CC(C2=CC=CC=C2)CC(C3=CC=CC=C3)CC(C4=CC=CC=C4)', + ' CC(C)CC(C)CC(C)', + 'C(F)C(F)(F)'] + + mols = [Chem.MolFromSmiles(s) for s in smis] + + err_smis = ['C(C(O)C1(O))C(CO)OC1O', + 'CC(C1=CC=CC=C1)CC(C2=CC=CC=C2)CC(C3=CC=', + 'Ccccccc', + 'C(F)C(F)(F)'] + yield dict(smis=smis, mols=mols, err_smis=err_smis) + + print('test over') + + +def test_mordred_1(data): + mordred = Mordred2DDescriptor() + desc = mordred.transform(data['smis']) + assert isinstance(desc, MordredDataFrame) + + mordred = Mordred2DDescriptor(return_type='df') + desc = mordred.transform(data['smis']) + assert isinstance(desc, pd.DataFrame) + + +def test_mordred_2(data): + mordred = 
Mordred2DDescriptor() + desc = mordred.transform(data['mols']) + assert isinstance(desc, MordredDataFrame) + + +def test_mordred_3(data): + mordred = Mordred2DDescriptor() + with pytest.raises(ValueError): + mordred.transform(data['err_smis']) + + +if __name__ == "__main__": + pytest.main() diff --git a/tests/foo/descriptor/test_foo.py b/tests/foo/descriptor/test_foo.py index 3a388599..693d6838 100644 --- a/tests/foo/descriptor/test_foo.py +++ b/tests/foo/descriptor/test_foo.py @@ -2,8 +2,14 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -from xenonpy.contrib.foo import hello_contrib +import pytest + +from xenonpy.contrib.foo.descriptor import hello_contrib def test_foo_1(): assert hello_contrib() == 'Hello contribution!' + + +if __name__ == "__main__": + pytest.main() diff --git a/travis/linux-win/py36.yml b/travis/linux-win/py36.yml index e4b3af0d..8d630c7f 100644 --- a/travis/linux-win/py36.yml +++ b/travis/linux-win/py36.yml @@ -18,5 +18,6 @@ dependencies: - pip - pip: - ruamel.yaml + - mordred - pymatgen==2019.5.8 - tqdm diff --git a/travis/linux-win/py37.yml b/travis/linux-win/py37.yml index 48707e52..33d29396 100644 --- a/travis/linux-win/py37.yml +++ b/travis/linux-win/py37.yml @@ -18,5 +18,6 @@ dependencies: - pip - pip: - ruamel.yaml + - mordred - pymatgen==2019.5.8 - tqdm diff --git a/travis/osx/py36.yml b/travis/osx/py36.yml index 55efae91..0ee2139e 100644 --- a/travis/osx/py36.yml +++ b/travis/osx/py36.yml @@ -18,5 +18,6 @@ dependencies: - pip - pip: - ruamel.yaml + - mordred - pymatgen==2019.5.8 - tqdm diff --git a/travis/osx/py37.yml b/travis/osx/py37.yml index 21d512df..4031b00b 100644 --- a/travis/osx/py37.yml +++ b/travis/osx/py37.yml @@ -18,5 +18,6 @@ dependencies: - pip - pip: - ruamel.yaml + - mordred - pymatgen==2019.5.8 - tqdm diff --git a/xenonpy/contrib/.DS_Store b/xenonpy/contrib/.DS_Store new file mode 100644 index 00000000..36e75921 Binary files /dev/null and 
b/xenonpy/contrib/.DS_Store differ diff --git a/xenonpy/contrib/README.md b/xenonpy/contrib/README.md index 8e18d3fc..beb9db29 100644 --- a/xenonpy/contrib/README.md +++ b/xenonpy/contrib/README.md @@ -9,10 +9,11 @@ get merged into XenonPy, but whose interfaces may still change, or which require some testing to see whether they can find broader acceptance. When adding a project, please stick to the following directory structure: -Create a project directory in `contrib/`, and mirror the portions of the -TensorFlow tree that your project requires underneath `contrib/my_project/`. +1. Create a project directory in `contrib/`, and mirror the portions of the XenonPy tree that your project requires underneath `contrib/my_project/`. +2. Provide a `README.md` under the root of the project directory, e.g., `contrib/my_project/README.md`. -For example, let's say you create foo in `foo.py` and the testing codes + +For example, let's say you create a project named `foo` with the source file `foo.py` and the testing file `foo_test.py`. If you were to merge those files directly into XenonPy, they would live in `$ROOT/xenonpy/descriptor/foo.py` and `$ROOT/tests/descriptor/foo_test.py`. In `contrib/`, they are part diff --git a/xenonpy/contrib/extend_descriptors/README.md b/xenonpy/contrib/extend_descriptors/README.md new file mode 100644 index 00000000..37ca835f --- /dev/null +++ b/xenonpy/contrib/extend_descriptors/README.md @@ -0,0 +1,20 @@ +# Extend Descriptors + +## FrozenFeaturizerDescriptor + +This is sample code for creating artificial descriptors based on a trained neural network. +This code creates a BaseFeaturizer object in XenonPy that can be used as input for training models. +The input is in the same format as the input of the descriptor used in the neural network. 
+ +By passing both the XenonPy descriptor object and the XenonPy frozen featurizer object into this class when creating the BaseFeaturizer, the output will be a dataframe in the same format as other typical XenonPy descriptors, with the number of columns equal to the number of neurons in the chosen hidden layers. + + +## Mordred2DDescriptor + +This is sample code for calculating 2D Mordred descriptors: +https://github.com/mordred-descriptor/mordred + +This code creates a BaseFeaturizer object in XenonPy that can be used as input for training models. + +----------- +written by Stephen Wu, 2019.05.31 diff --git a/xenonpy/contrib/extend_descriptors/__init__.py b/xenonpy/contrib/extend_descriptors/__init__.py new file mode 100644 index 00000000..4a5d2d48 --- /dev/null +++ b/xenonpy/contrib/extend_descriptors/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2019. TsumiNa. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. diff --git a/xenonpy/contrib/extend_descriptors/descriptor/__init__.py b/xenonpy/contrib/extend_descriptors/descriptor/__init__.py new file mode 100644 index 00000000..b183f8d1 --- /dev/null +++ b/xenonpy/contrib/extend_descriptors/descriptor/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2019. TsumiNa. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +from .frozen_featurizer_descriptor import FrozenFeaturizerDescriptor +from .mordred_descriptor import Mordred2DDescriptor diff --git a/xenonpy/contrib/extend_descriptors/descriptor/frozen_featurizer_descriptor.py b/xenonpy/contrib/extend_descriptors/descriptor/frozen_featurizer_descriptor.py new file mode 100644 index 00000000..1300d7a8 --- /dev/null +++ b/xenonpy/contrib/extend_descriptors/descriptor/frozen_featurizer_descriptor.py @@ -0,0 +1,45 @@ +# Copyright (c) 2019. TsumiNa. All rights reserved. 
+# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +from typing import Union + +from xenonpy.descriptor import FrozenFeaturizer +from xenonpy.descriptor.base import BaseFeaturizer, BaseDescriptor + + +class FrozenFeaturizerDescriptor(BaseFeaturizer): + + def __init__(self, descriptor_calculator: Union[BaseDescriptor, BaseFeaturizer], + frozen_featurizer: FrozenFeaturizer, *, + on_errors='raise', + return_type='any'): + """ + A featurizer for extracting artificial descriptors from neural networks + + Parameters + ---------- + descriptor_calculator : BaseFeaturizer or BaseDescriptor + Convert input data into descriptors to keep consistency with the pre-trained model. + frozen_featurizer : FrozenFeaturizer + Extracting artificial descriptors from neural networks + """ + + # fix n_jobs to be 0 to skip automatic wrapper in XenonPy BaseFeaturizer class + super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type) + self.FP = descriptor_calculator + self.ff = frozen_featurizer + self.output = None + self.__authors__ = ['Stephen Wu', 'TsumiNa'] + + def featurize(self, x, *, depth=1): + # transform input to descriptor dataframe + tmp_df = self.FP.transform(x) + # convert descriptor dataframe to hidden layer dataframe + self.output = self.ff.transform(tmp_df, depth=depth, return_type='df') + return self.output + + @property + def feature_labels(self): + # column names based on xenonpy frozen featurizer setting + return self.output.columns diff --git a/xenonpy/contrib/extend_descriptors/descriptor/mordred_descriptor.py b/xenonpy/contrib/extend_descriptors/descriptor/mordred_descriptor.py new file mode 100644 index 00000000..210c1bdb --- /dev/null +++ b/xenonpy/contrib/extend_descriptors/descriptor/mordred_descriptor.py @@ -0,0 +1,39 @@ +# Copyright (c) 2019. TsumiNa. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. 
+ +from mordred import Calculator, descriptors +from rdkit import Chem + +from xenonpy.descriptor.base import BaseFeaturizer + + +class Mordred2DDescriptor(BaseFeaturizer): + + def __init__(self, *, on_errors='raise', return_type='any'): + # fix n_jobs to be 0 to skip automatic wrapper in XenonPy BaseFeaturizer class + super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type) + self.output = None + self.__authors__ = ['Stephen Wu', 'TsumiNa'] + + def featurize(self, x): + # check if type(x) = list + if not isinstance(x, (list,)): + x = [x] + # check input format, assume SMILES if not RDKit-MOL + if not isinstance(x[0], Chem.rdchem.Mol): + x_mol = [] + for z in x: + x_mol.append(Chem.MolFromSmiles(z)) + if x_mol[-1] is None: + raise ValueError('can not convert Mol from SMILES %s' % z) + else: + x_mol = x + + calc = Calculator(descriptors, ignore_3D=True) + self.output = calc.pandas(x_mol) + return self.output + + @property + def feature_labels(self): + return self.output.columns diff --git a/xenonpy/contrib/foo/__init__.py b/xenonpy/contrib/foo/__init__.py index d801c4f8..4a5d2d48 100644 --- a/xenonpy/contrib/foo/__init__.py +++ b/xenonpy/contrib/foo/__init__.py @@ -1,5 +1,3 @@ # Copyright (c) 2019. TsumiNa. All rights reserved. # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. - -from .foo import hello_contrib diff --git a/xenonpy/contrib/foo/descriptor/__init__.py b/xenonpy/contrib/foo/descriptor/__init__.py new file mode 100644 index 00000000..609acf4b --- /dev/null +++ b/xenonpy/contrib/foo/descriptor/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2019. yoshida-lab. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. 
+ +from .foo import hello_contrib diff --git a/xenonpy/contrib/foo/foo.py b/xenonpy/contrib/foo/descriptor/foo.py similarity index 73% rename from xenonpy/contrib/foo/foo.py rename to xenonpy/contrib/foo/descriptor/foo.py index b6ca1ca5..bfe05663 100644 --- a/xenonpy/contrib/foo/foo.py +++ b/xenonpy/contrib/foo/descriptor/foo.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019. TsumiNa. All rights reserved. +# Copyright (c) 2019. yoshida-lab. All rights reserved. # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index 1e141887..24d3c631 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -36,6 +36,7 @@ def __init__(self, *, one_hot_vec=False, n_jobs=-1, on_errors='raise', return_ty super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type) self.one_hot_vec = one_hot_vec self._elems = self._elements.index.tolist() + self.__authors__ = ['TsumiNa'] def mix_function(self, elems, nums): vec = np.zeros(len(self._elems), dtype=np.int) diff --git a/xenonpy/descriptor/fingerprint.py b/xenonpy/descriptor/fingerprint.py index d0699712..25065e80 100644 --- a/xenonpy/descriptor/fingerprint.py +++ b/xenonpy/descriptor/fingerprint.py @@ -21,7 +21,7 @@ def __init__(self, n_jobs=-1, *, fp_size=2048, input_type='mol', on_errors='rais ---------- n_jobs: int The number of jobs to run in parallel for both fit and predict. - Set -1 to use all cpu cores (default). + Can be -1 or # of CPUs. Set -1 to use all cpu cores (default). fp_size: int Fingerprint size. 
input_type: string @@ -41,6 +41,7 @@ def __init__(self, n_jobs=-1, *, fp_size=2048, input_type='mol', on_errors='rais super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type) self.input_type = input_type self.fp_size = fp_size + self.__authors__ = ['Stephen Wu', 'TsumiNa'] def featurize(self, x): if self.input_type == 'smiles': @@ -77,7 +78,7 @@ def __init__(self, n_jobs=-1, *, n_bits=2048, input_type='mol', on_errors='raise ---------- n_jobs: int The number of jobs to run in parallel for both fit and predict. - Set -1 to use all cpu cores (default). + Can be -1 or # of CPUs. Set -1 to use all cpu cores (default). n_bits: int Fixed bit length based on folding. input_type: string @@ -97,6 +98,7 @@ def __init__(self, n_jobs=-1, *, n_bits=2048, input_type='mol', on_errors='raise super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type) self.input_type = input_type self.n_bits = n_bits + self.__authors__ = ['Stephen Wu', 'TsumiNa'] def featurize(self, x): if self.input_type == 'smiles': @@ -128,7 +130,8 @@ def __init__(self, n_jobs=-1, *, n_bits=2048, input_type='mol', on_errors='raise Parameters ---------- n_jobs: int - The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). + The number of jobs to run in parallel for both fit and predict. + Can be -1 or # of CPUs. Set -1 to use all cpu cores (default). n_bits: int Fixed bit length based on folding. 
input_type: string @@ -148,6 +151,7 @@ def __init__(self, n_jobs=-1, *, n_bits=2048, input_type='mol', on_errors='raise super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type) self.input_type = input_type self.n_bits = n_bits + self.__authors__ = ['Stephen Wu', 'TsumiNa'] def featurize(self, x): if self.input_type == 'smiles': @@ -178,7 +182,8 @@ def __init__(self, n_jobs=-1, *, input_type='mol', on_errors='raise', return_typ Parameters ---------- n_jobs: int - The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). + The number of jobs to run in parallel for both fit and predict. + Can be -1 or # of CPUs. Set -1 to use all cpu cores (default). input_type: string Set the specific type of transform input. Set to ``mol`` (default) to ``rdkit.Chem.rdchem.Mol`` objects as input. @@ -195,6 +200,7 @@ def __init__(self, n_jobs=-1, *, input_type='mol', on_errors='raise', return_typ """ super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type) self.input_type = input_type + self.__authors__ = ['Stephen Wu', 'TsumiNa'] def featurize(self, x): if self.input_type == 'smiles': @@ -226,7 +232,8 @@ def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, input_type='mol', on_err Parameters ---------- n_jobs: int - The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). + The number of jobs to run in parallel for both fit and predict. + Can be -1 or # of CPUs. Set -1 to use all cpu cores (default). radius: int The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in FCFP, i.e., radius=2 is roughly equivalent to FCFP4. 
@@ -250,6 +257,7 @@ def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, input_type='mol', on_err self.input_type = input_type self.radius = radius self.n_bits = n_bits + self.__authors__ = ['Stephen Wu', 'TsumiNa'] # self.arg = arg # arg[0] = radius, arg[1] = bit length def featurize(self, x): @@ -284,7 +292,8 @@ def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, input_type='mol', on_err Parameters ---------- n_jobs: int - The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). + The number of jobs to run in parallel for both fit and predict. + Can be -1 or # of CPUs. Set -1 to use all cpu cores (default). radius: int The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in ECFP, i.e., radius=2 is roughly equivalent to ECFP4. @@ -308,6 +317,7 @@ def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, input_type='mol', on_err self.input_type = input_type self.radius = radius self.n_bits = n_bits + self.__authors__ = ['Stephen Wu', 'TsumiNa'] # self.arg = arg # arg[0] = radius, arg[1] = bit length def featurize(self, x): @@ -339,7 +349,8 @@ def __init__(self, n_jobs=-1, *, input_type='mol', on_errors='raise', return_typ Parameters ---------- n_jobs: int - The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). + The number of jobs to run in parallel for both fit and predict. + Can be -1 or # of CPUs. Set -1 to use all cpu cores (default). input_type: string Set the specific type of transform input. Set to ``mol`` (default) to ``rdkit.Chem.rdchem.Mol`` objects as input. 
@@ -359,6 +370,7 @@ def __init__(self, n_jobs=-1, *, input_type='mol', on_errors='raise', return_typ self.input_type = input_type nms = [x[0] for x in Descriptors._descList] self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms) + self.__authors__ = ['Stephen Wu', 'TsumiNa'] def featurize(self, x): if self.input_type == 'smiles': @@ -391,7 +403,8 @@ def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, fp_size=2048, input_type Parameters ---------- n_jobs: int - The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default). + The number of jobs to run in parallel for both fit and predict. + Can be -1 or # of CPUs. Set -1 to use all cpu cores (default). radius: int The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in ECFP/FCFP, diff --git a/xenonpy/descriptor/frozen_featurizer.py b/xenonpy/descriptor/frozen_featurizer.py index f7e711ba..36e3d83d 100644 --- a/xenonpy/descriptor/frozen_featurizer.py +++ b/xenonpy/descriptor/frozen_featurizer.py @@ -45,6 +45,7 @@ def __init__(self, model=None, *, cuda=False, depth=None, on_errors='raise', ret self.model = model self.cuda = cuda self._ret = [] + self.__authors__ = ['TsumiNa'] def featurize(self, descriptor, *, depth=None): if not isinstance(self.model, tc.nn.Module): diff --git a/xenonpy/descriptor/structure.py b/xenonpy/descriptor/structure.py index e8a4b2ef..b249f13c 100644 --- a/xenonpy/descriptor/structure.py +++ b/xenonpy/descriptor/structure.py @@ -119,6 +119,29 @@ def __init__(self, including_d=True, *, n_jobs=-1, on_errors='raise', return_typ """ super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type) self._including_d = including_d + self.__authors__ = ['TsumiNa'] + self.__citations__ = [ + ''' + @article{LamPham2017, + archivePrefix = {arXiv}, + arxivId = {1705.01043}, + author = {{Lam Pham}, Tien and Kino, Hiori and Terakura, Kiyoyuki and Miyake, Takashi and Tsuda, Koji and Takigawa, 
Ichigaku and {Chi Dam}, Hieu}, + doi = {10.1080/14686996.2017.1378060}, + eprint = {1705.01043}, + issn = {18785514}, + journal = {Science and Technology of Advanced Materials}, + keywords = {Material descriptor,data mining,machine learning,magnetic materials,material informatics}, + number = {1}, + pages = {756--765}, + pmid = {29152012}, + publisher = {Taylor {\&} Francis}, + title = {{Machine learning reveals orbital interaction in materials}}, + url = {https://doi.org/10.1080/14686996.2017.1378060}, + volume = {18}, + year = {2017} + } + ''' + ] @staticmethod def get_element_representation(name):