Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
TsumiNa committed May 20, 2022
1 parent 4bc16f2 commit 597be59
Showing 1 changed file with 149 additions and 0 deletions.
149 changes: 149 additions & 0 deletions samples/process_dataset_usgs.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4ef1599e",
"metadata": {},
"source": [
"## Obtain the compact csv file of USGS dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e316660c",
"metadata": {},
"outputs": [],
"source": [
"# python packages used in the sample codes\n",
"\n",
"import pandas as pd\n",
"import os\n",
"import numpy as np\n",
"import csv\n",
"import glob\n",
"import codecs\n",
"import cirpy"
]
},
{
"cell_type": "markdown",
"id": "5f32ca10",
"metadata": {},
"source": [
"#### Download spectrum data and CAS registry number and transform to SMILES"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "026675e0",
"metadata": {},
"outputs": [],
"source": [
"input_file = sorted(glob.glob(\"usgs_splib07/ASCIIdata/ASCIIdata_splib07a/ChapterO_OrganicCompounds/*AREF.txt\"))\n",
"smiles_list = []\n",
"spectrum_list = []\n",
"smiles_error_list = []\n",
"\n",
"for i, file_name in enumerate(input_file):\n",
" file_html = file_name.replace('txt', 'html')\\\n",
" .replace('ASCIIdata_splib07a/ChapterO_OrganicCompounds', '')\\\n",
" .replace('splib07a_', '')\\\n",
" .replace('ASCIIdata/', 'HTMLmetadata')\n",
" with codecs.open(file_html, 'r', 'utf-8', 'ignore') as fileobj:\n",
" lines = fileobj.readlines()\n",
" line_cas = [line for line in lines if 'CAS' in line]\n",
" cas = ''.join(line_cas) \n",
" cas = cas[7:]\n",
" if line_cas!=[]:\n",
" to_smiles = cirpy.resolve(cas, \"smiles\")\n",
" if to_smiles == None: \n",
" smiles_error_list.append(file_html)\n",
" elif to_smiles != None:\n",
" smiles_list = np.append(smiles_list,to_smiles)\n",
" spectrum_data = pd.read_table(file_name)\n",
" spectrum_data = spectrum_data.astype('float32')\n",
" spectrum_list = np.append(spectrum_list,spectrum_data)"
]
},
{
"cell_type": "markdown",
"id": "a2fac7a4",
"metadata": {},
"source": [
"#### Remove duplicate data and converting reflectance to absorbance"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "44fd683c",
"metadata": {},
"outputs": [],
"source": [
"spectrum_list = spectrum_list.reshape(len(smiles_list),-1)\n",
"spectrum_list = np.array(spectrum_list, dtype='float32')\n",
"\n",
"duplicate_list = [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 21, 26, 28, 31, 32,\\\n",
" 33, 36, 37, 40, 42, 43, 48, 53, 55, 57, 59, 61, 63, 66,\\\n",
" 68, 70, 73, 75, 77, 79, 80, 82, 83, 84, 86, 87, 93, 96,\\\n",
" 97, 98, 104, 105, 106, 110, 111, 113, 117]\n",
"smiles_list_nodupli = np.delete(smiles_list,[duplicate_list])\n",
"spectrum_list_nodupli = np.delete(spectrum_list,[duplicate_list],0)\n",
"spectrum_list_nodupli = 2 - np.log10(spectrum_list_nodupli)\n",
"spectrum_list_nodupli = np.array(spectrum_list_nodupli, dtype='float32')"
]
},
{
"cell_type": "markdown",
"id": "60e3e847",
"metadata": {},
"source": [
"#### Make Dataset_USGS.csv"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1b0c37c4",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(spectrum_list_nodupli)\n",
"df.insert(0, 'SMILES', smiles_list_nodupli)\n",
"df = df.sample(frac=1,random_state=1)\n",
"df.to_csv('Dataset_USGS.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "edbbbbb4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 597be59

Please sign in to comment.