
Commit 09cafeb

update
1 parent 98bcf5a commit 09cafeb

4 files changed: +214 -1 lines changed


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -1,2 +1,4 @@
 .ipynb_checkpoints/
 .DS_Store
+mimic-iv-1.0
+mimic*

README.md

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
 # csv2sqlite
 
 Jupyter notebook to show how to import CSV files into SQLite
+
+- [csv2sqlite.ipynb](csv2sqlite.ipynb): This notebook shows how to load small datasets into a SQLite database.
+- [csv2sqlite-streaming.ipynb](csv2sqlite-streaming.ipynb): This notebook shows how to load huge datasets into SQLite by streaming them in chunks.

csv2sqlite-streaming.ipynb

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a7175173-2fb2-4833-815a-15945392d3b3",
+   "metadata": {},
+   "source": [
"# CSV to SQLite, Streaming\n",
9+
"\n",
10+
"This notebook shows how to `crawl` a directory of `*.csv` files and import them into SQLite.\n",
11+
"\n",
12+
"In this example, we store `*.csv` files in a folder relative to the notebook `./data`. We then create a SQLite database from these CSV files.\n",
13+
"\n",
14+
"To customize this notebook, change the values of the 2 variables below.\n",
15+
"\n",
16+
"- `dir_to_csv`\n",
17+
"- `sqlite_db`\n",
18+
"\n",
19+
"When you have a huge dataset, you should use this code."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e771f70a-4535-4763-87e5-7fe161865fb4",
+   "metadata": {},
+   "source": [
+    "## Create SQLite database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5287cd41-7680-4ffc-be4e-cbf1098b7a86",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2022-04-01T22:30:15.776608Z",
+     "iopub.status.busy": "2022-04-01T22:30:15.776608Z"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "processing mimic-iv-1.0\\admissions.csv\n",
+      "processing mimic-iv-1.0\\chartevents.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from io import StringIO\n",
+    "import pathlib\n",
+    "import sqlite3\n",
+    "import os\n",
+    "\n",
+    "def get_sample(file_path, max_lines=100):\n",
+    "    with open(file_path, 'r') as fp:\n",
+    "        lines = []\n",
+    "        \n",
+    "        for i, line in enumerate(fp):\n",
+    "            if i >= max_lines:\n",
+    "                break\n",
+    "            \n",
+    "            lines.append(line)\n",
+    "        \n",
+    "        return pd.read_csv(StringIO(''.join(lines)))\n",
+    "    \n",
+    "def get_create_sql(table, df):\n",
+    "    def get_type(t):\n",
+    "        if t.startswith('int') or t.startswith('bool'):\n",
+    "            return 'INTEGER'\n",
+    "        if t.startswith('float'):\n",
+    "            return 'REAL'\n",
+    "        return 'TEXT'\n",
+    "    \n",
+    "    fields = [(n, get_type(str(t))) for n, t in zip(df.dtypes.index, df.dtypes.values)]\n",
+    "    fields = [f'{n} {t}' for n, t in fields]\n",
+    "    fields = ', '.join(fields)\n",
+    "    \n",
+    "    sql = f'CREATE TABLE {table} ({fields})'\n",
+    "    return sql\n",
+    "\n",
+    "def get_headers(file_path):\n",
+    "    df = get_sample(file_path)\n",
+    "    columns = ','.join(df.columns)\n",
+    "    columns = f'{columns}\\n'\n",
+    "    return columns\n",
+    "\n",
+    "def get_csv_files(dir_path):\n",
+    "    csv_files = list(pathlib.Path(dir_path).glob('*.csv'))\n",
+    "    return csv_files\n",
+    "\n",
+    "def delete_file_if_exists(file_path):\n",
+    "    if os.path.exists(file_path):\n",
+    "        os.remove(file_path)\n",
+    "\n",
+    "def create_db(input_dir, output_db, flush_size=1_000):\n",
+    "    delete_file_if_exists(output_db)\n",
+    "    pathlib.Path(output_db).touch()\n",
+    "    \n",
+    "    files = get_csv_files(input_dir)\n",
+    "    \n",
+    "    with sqlite3.connect(output_db) as conn:\n",
+    "        cur = conn.cursor()\n",
+    "        \n",
+    "        for file_path in files:\n",
+    "            print(f'processing {file_path}')\n",
+    "            sql = get_create_sql(file_path.stem, get_sample(file_path))\n",
+    "            headers = get_headers(file_path)\n",
+    "            \n",
+    "            cur.execute(sql)\n",
+    "            \n",
+    "            with open(file_path, 'r') as fp:\n",
+    "                lines = []\n",
+    "                lines.append(headers)\n",
+    "                \n",
+    "                for i, line in enumerate(fp):\n",
+    "                    if i == 0:\n",
+    "                        continue\n",
+    "                    \n",
+    "                    lines.append(line)\n",
+    "                    \n",
+    "                    if len(lines) == flush_size:\n",
+    "                        df = pd.read_csv(StringIO(''.join(lines)))\n",
+    "                        df.to_sql(file_path.stem, conn, if_exists='append', index=False)\n",
+    "                        \n",
+    "                        lines = []\n",
+    "                        lines.append(headers)\n",
+    "                \n",
+    "                df = pd.read_csv(StringIO(''.join(lines)))\n",
+    "                df.to_sql(file_path.stem, conn, if_exists='append', index=False)\n",
+    "        \n",
+    "        \n",
+    "create_db('./mimic-iv-1.0', 'mimic.db')\n",
+    "print('done')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4717d6a4-619b-4644-85de-62e501bd79be",
+   "metadata": {},
+   "source": [
+    "## Verify that it works with the SQLite driver"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55291a90-d156-4ae5-8e3c-ea3e516c1019",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "with sqlite3.connect('mimic.db') as conn:\n",
+    "    sql = '''\n",
+    "    SELECT count(*) as total\n",
+    "    FROM admissions\n",
+    "    '''\n",
+    "    \n",
+    "    cur = conn.cursor()\n",
+    "    cur.execute(sql)\n",
+    "    \n",
+    "    items = cur.fetchall()\n",
+    "    for i in items:\n",
+    "        print(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eabd857e-503b-463b-a756-701372558103",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
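The streaming cell above batches raw CSV lines, re-attaches the header, and appends each batch to the table so no file is ever loaded whole. A minimal sketch of the same chunked-append pattern is shown below, using pandas' built-in `chunksize` reader instead of manual line batching; the directory, database name, and chunk size are illustrative (not part of this commit), and unlike the notebook it lets `to_sql` infer column types rather than issuing an explicit `CREATE TABLE`.

```python
# Sketch: chunked CSV-to-SQLite import with pandas' chunksize reader.
# Paths and chunk size are illustrative, not part of the commit.
import pathlib
import sqlite3

import pandas as pd

def import_csv_dir(input_dir, output_db, chunk_size=10_000):
    with sqlite3.connect(output_db) as conn:
        for csv_path in pathlib.Path(input_dir).glob('*.csv'):
            print(f'processing {csv_path}')
            # read_csv with chunksize yields DataFrames of at most chunk_size rows,
            # so the whole CSV is never held in memory; each chunk is appended to
            # a table named after the file, with column types inferred by to_sql.
            for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
                chunk.to_sql(csv_path.stem, conn, if_exists='append', index=False)

import_csv_dir('./mimic-iv-1.0', 'mimic.db')
```

Either approach keeps memory bounded; the notebook's explicit `CREATE TABLE` just gives more control over the column types SQLite ends up with.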

csv2sqlite.ipynb

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@
    "id": "201ead59-0bd0-40ab-a6eb-dad9cf565ce6",
    "metadata": {},
    "source": [
-    "## Verify that it works with SQLite drive"
+    "## Verify that it works with the SQLite driver"
    ]
   },
   {
