Add files via upload

yoshida-lab · May 20, 2022 · 597be59 · 597be59
1 parent 4bc16f2
commit 597be59
Showing 1 changed file with 149 additions and 0 deletions.
diff --git a/samples/process_dataset_usgs.ipynb b/samples/process_dataset_usgs.ipynb
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4ef1599e",
+   "metadata": {},
+   "source": [
+    "## Obtain the compact csv file of USGS dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e316660c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# python packages used in the sample codes\n",
+    "\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "import numpy as np\n",
+    "import csv\n",
+    "import glob\n",
+    "import codecs\n",
+    "import cirpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5f32ca10",
+   "metadata": {},
+   "source": [
+    "#### Download spectrum data and CAS registry number and transform to SMILES"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "026675e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_file = sorted(glob.glob(\"usgs_splib07/ASCIIdata/ASCIIdata_splib07a/ChapterO_OrganicCompounds/*AREF.txt\"))\n",
+    "smiles_list = []\n",
+    "spectrum_list = []\n",
+    "smiles_error_list = []\n",
+    "\n",
+    "for i, file_name in enumerate(input_file):\n",
+    "    file_html = file_name.replace('txt', 'html')\\\n",
+    "                         .replace('ASCIIdata_splib07a/ChapterO_OrganicCompounds', '')\\\n",
+    "                         .replace('splib07a_', '')\\\n",
+    "                         .replace('ASCIIdata/', 'HTMLmetadata')\n",
+    "    with codecs.open(file_html, 'r', 'utf-8', 'ignore') as fileobj:\n",
+    "        lines = fileobj.readlines()\n",
+    "        line_cas = [line for line in lines if 'CAS' in line]\n",
+    "        cas = ''.join(line_cas)                   \n",
+    "        cas = cas[7:]\n",
+    "        if line_cas!=[]:\n",
+    "            to_smiles = cirpy.resolve(cas, \"smiles\")\n",
+    "            if to_smiles == None: \n",
+    "                smiles_error_list.append(file_html)\n",
+    "            elif to_smiles != None:\n",
+    "                smiles_list = np.append(smiles_list,to_smiles)\n",
+    "                spectrum_data = pd.read_table(file_name)\n",
+    "                spectrum_data = spectrum_data.astype('float32')\n",
+    "                spectrum_list = np.append(spectrum_list,spectrum_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2fac7a4",
+   "metadata": {},
+   "source": [
+    "#### Remove duplicate data and converting reflectance to absorbance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "44fd683c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spectrum_list = spectrum_list.reshape(len(smiles_list),-1)\n",
+    "spectrum_list = np.array(spectrum_list, dtype='float32')\n",
+    "\n",
+    "duplicate_list = [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 21, 26, 28, 31, 32,\\\n",
+    "                  33, 36, 37, 40, 42, 43, 48, 53, 55, 57, 59, 61, 63, 66,\\\n",
+    "                  68, 70, 73, 75, 77, 79, 80, 82, 83, 84, 86, 87, 93, 96,\\\n",
+    "                  97, 98, 104, 105, 106, 110, 111, 113, 117]\n",
+    "smiles_list_nodupli = np.delete(smiles_list,[duplicate_list])\n",
+    "spectrum_list_nodupli = np.delete(spectrum_list,[duplicate_list],0)\n",
+    "spectrum_list_nodupli = 2 - np.log10(spectrum_list_nodupli)\n",
+    "spectrum_list_nodupli = np.array(spectrum_list_nodupli, dtype='float32')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "60e3e847",
+   "metadata": {},
+   "source": [
+    "#### Make Dataset_USGS.csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1b0c37c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(spectrum_list_nodupli)\n",
+    "df.insert(0, 'SMILES', smiles_list_nodupli)\n",
+    "df = df.sample(frac=1,random_state=1)\n",
+    "df.to_csv('Dataset_USGS.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "edbbbbb4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}