update

vangj · vangj · commit 09cafebe789d · 2022-04-01T18:38:17.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 .ipynb_checkpoints/
 .DS_Store
+mimic-iv-1.0
+mimic*
diff --git a/README.md b/README.md
@@ -1,3 +1,6 @@
 # csv2sqlite
 
 Jupyter notebook to show how to import CSV files into SQLite
+
+- [csv2sqlite.ipynb](csv2sqlite.ipynb): This notebook shows how to load small datasets into a SQLite database.
+- [csv2sqlite-streaming.ipynb](): This notebook shows how to load huge datasets into SQLite.
diff --git a/csv2sqlite-streaming.ipynb b/csv2sqlite-streaming.ipynb
@@ -0,0 +1,208 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a7175173-2fb2-4833-815a-15945392d3b3",
+   "metadata": {},
+   "source": [
+    "# CSV to SQLite, Streaming\n",
+    "\n",
+    "This notebook shows how to `crawl` a directory of `*.csv` files and import them into SQLite.\n",
+    "\n",
+    "In this example, we store `*.csv` files in a folder relative to the notebook `./data`. We then create a SQLite database from these CSV files.\n",
+    "\n",
+    "To customize this notebook, change the values of the 2 variables below.\n",
+    "\n",
+    "- `dir_to_csv`\n",
+    "- `sqlite_db`\n",
+    "\n",
+    "When you have a huge dataset, you should use this code."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e771f70a-4535-4763-87e5-7fe161865fb4",
+   "metadata": {},
+   "source": [
+    "## Create SQLite database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5287cd41-7680-4ffc-be4e-cbf1098b7a86",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2022-04-01T22:30:15.776608Z",
+     "iopub.status.busy": "2022-04-01T22:30:15.776608Z"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "processing mimic-iv-1.0\\admissions.csv\n",
+      "processing mimic-iv-1.0\\chartevents.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from io import StringIO\n",
+    "import pathlib\n",
+    "import sqlite3\n",
+    "import os\n",
+    "\n",
+    "def get_sample(file_path, max_lines=100):\n",
+    "    with open(file_path, 'r') as fp:\n",
+    "        lines = []\n",
+    "        \n",
+    "        for i, line in enumerate(fp):\n",
+    "            if i >= max_lines:\n",
+    "                break\n",
+    "            \n",
+    "            lines.append(line)\n",
+    "            \n",
+    "        return pd.read_csv(StringIO(''.join(lines)))\n",
+    "    \n",
+    "def get_create_sql(table, df):\n",
+    "    def get_type(t):\n",
+    "        if t.startswith('int') or t.startswith('bool'):\n",
+    "            return 'INTEGER'\n",
+    "        if t.startswith('float'):\n",
+    "            return 'REAL'\n",
+    "        return 'TEXT'\n",
+    "    \n",
+    "    fields = [(n, get_type(str(t))) for n, t in zip(df.dtypes.index, df.dtypes.values)]\n",
+    "    fields = [f'{n} {t}' for n, t in fields]\n",
+    "    fields = ', '.join(fields)\n",
+    "    \n",
+    "    sql = f'CREATE TABLE {table} ({fields})'\n",
+    "    return sql\n",
+    "\n",
+    "def get_headers(file_path):\n",
+    "    df = get_sample(file_path)\n",
+    "    columns = ','.join(df.columns)\n",
+    "    columns = f'{columns}\\n'\n",
+    "    return columns\n",
+    "\n",
+    "def get_csv_files(dir_path):\n",
+    "    csv_files = list(pathlib.Path(dir_path).glob('*.csv'))\n",
+    "    return csv_files\n",
+    "\n",
+    "def delete_file_if_exists(file_path):\n",
+    "    if os.path.exists(file_path):\n",
+    "        os.remove(file_path)\n",
+    "\n",
+    "def create_db(input_dir, output_db, flush_size=1_000):\n",
+    "    delete_file_if_exists(output_db)\n",
+    "    pathlib.Path(output_db).touch()\n",
+    "    \n",
+    "    files = get_csv_files(input_dir)\n",
+    "    \n",
+    "    with sqlite3.connect(output_db) as conn:\n",
+    "        cur = conn.cursor()\n",
+    "        \n",
+    "        for file_path in files:\n",
+    "            print(f'processing {file_path}')\n",
+    "            sql = get_create_sql(file_path.stem, get_sample(file_path))\n",
+    "            headers = get_headers(file_path)\n",
+    "            \n",
+    "            cur.execute(sql)\n",
+    "            \n",
+    "            with open(file_path, 'r') as fp:\n",
+    "                lines = []\n",
+    "                lines.append(headers)\n",
+    "                \n",
+    "                for i, line in enumerate(fp):\n",
+    "                    if i == 0:\n",
+    "                        continue\n",
+    "                    \n",
+    "                    lines.append(line)\n",
+    "                    \n",
+    "                    if len(lines) == flush_size:\n",
+    "                        df = pd.read_csv(StringIO(''.join(lines)))\n",
+    "                        df.to_sql(file_path.stem, conn, if_exists='append', index=False)\n",
+    "                        \n",
+    "                        lines = []\n",
+    "                        lines.append(headers)\n",
+    "                \n",
+    "                df = pd.read_csv(StringIO(''.join(lines)))\n",
+    "                df.to_sql(file_path.stem, conn, if_exists='append', index=False)\n",
+    "    \n",
+    "                \n",
+    "create_db('./mimic-iv-1.0', 'mimic.db')\n",
+    "print('done')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4717d6a4-619b-4644-85de-62e501bd79be",
+   "metadata": {},
+   "source": [
+    "## Verify that it works with the SQLite driver"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55291a90-d156-4ae5-8e3c-ea3e516c1019",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "with sqlite3.connect('mimic.db') as conn:\n",
+    "    sql = '''\n",
+    "    SELECT count(*) as total\n",
+    "    FROM admissions\n",
+    "    '''\n",
+    "    \n",
+    "    cur = conn.cursor()\n",
+    "    cur.execute(sql)\n",
+    "    \n",
+    "    items = cur.fetchall()\n",
+    "    for i in items:\n",
+    "        print(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eabd857e-503b-463b-a756-701372558103",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/csv2sqlite.ipynb b/csv2sqlite.ipynb
@@ -98,7 +98,7 @@
    "id": "201ead59-0bd0-40ab-a6eb-dad9cf565ce6",
    "metadata": {},
    "source": [
-    "## Verify that it works with SQLite drive"
+    "## Verify that it works with the SQLite driver"
    ]
   },
   {

Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@`
`98`	`98`	`"id": "201ead59-0bd0-40ab-a6eb-dad9cf565ce6",`
`99`	`99`	`"metadata": {},`
`100`	`100`	`"source": [`
`101`		`- "## Verify that it works with SQLite drive"`
	`101`	`+ "## Verify that it works with the SQLite driver"`
`102`	`102`	`]`
`103`	`103`	`},`
`104`	`104`	`{`