-
Notifications
You must be signed in to change notification settings - Fork 22
/
data_io.py
370 lines (306 loc) · 9.74 KB
/
data_io.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#!/usr/bin/env python
# -*- coding: utf8 -*-
"""
The MetadataWizard(pymdwizard) software was developed by the
U.S. Geological Survey Fort Collins Science Center.
See: https://github.com/usgs/fort-pymdwizard for current project source code
See: https://usgs.github.io/fort-pymdwizard/ for current user documentation
See: https://github.com/usgs/fort-pymdwizard/tree/master/examples
for examples of use in other scripts
License: Creative Commons Attribution 4.0 International (CC BY 4.0)
http://creativecommons.org/licenses/by/4.0/
PURPOSE
------------------------------------------------------------------------------
Module for reading data from various formats into a Pandas dataframe
SCRIPT DEPENDENCIES
------------------------------------------------------------------------------
This script is part of the pymdwizard package and is not intented to be
used independently. All pymdwizard package requirements are needed.
See imports section for external packages used in this script as well as
inter-package dependencies
U.S. GEOLOGICAL SURVEY DISCLAIMER
------------------------------------------------------------------------------
This software has been approved for release by the U.S. Geological Survey
(USGS). Although the software has been subjected to rigorous review,
the USGS reserves the right to update the software as needed pursuant to
further analysis and review. No warranty, expressed or implied, is made by
the USGS or the U.S. Government as to the functionality of the software and
related material nor shall the fact of release constitute any such warranty.
Furthermore, the software is released on condition that neither the USGS nor
the U.S. Government shall be held liable for any damages resulting from
its authorized or unauthorized use.
Any use of trade, product or firm names is for descriptive purposes only and
does not imply endorsement by the U.S. Geological Survey.
Although this information product, for the most part, is in the public domain,
it also contains copyrighted material as noted in the text. Permission to
reproduce copyrighted items for other than personal use must be secured from
the copyright owner.
------------------------------------------------------------------------------
"""
import struct
import datetime
import decimal

try:
    # Python 2
    from itertools import izip
except ImportError:
    # Python 3
    izip = zip
try:
    xrange
except NameError:
    xrange = range

import pandas as pd

try:
    import geopandas as gpd
    import fiona
except ImportError:
    # Geospatial support is optional: without geopandas/fiona installed,
    # shapefile reading is unavailable.  Narrowed from a bare ``except:``,
    # which also swallowed SystemExit/KeyboardInterrupt.
    gpd = None
    fiona = None

from pymdwizard.core import utils
def read_csv(fname, delimiter=","):
    """
    Read a csv (or otherwise delimited text) file into a pandas dataframe.

    Reading is capped at the application's "maxrows" setting.  The file is
    first read with pandas' default encoding (UTF-8); if that raises a
    UnicodeDecodeError the read is retried with ISO-8859-1 (latin-1),
    which accepts any byte sequence.

    Parameters
    ----------
    fname : string
        Full fname to the csv to return
    delimiter : str, optional, defaults to comma
        the character used to delimit the data in a txt file

    Returns
    -------
    pandas dataframe
    """
    max_rows = int(utils.get_setting("maxrows", 1000000))
    # Keyword arguments shared by both read attempts.
    kwargs = dict(
        parse_dates=True,
        delimiter=delimiter,
        nrows=max_rows,
        na_filter=False,
        comment="#",
    )
    try:
        return pd.read_csv(fname, **kwargs)
    except UnicodeDecodeError:
        # The original retried with encoding="utf8" first, but pandas
        # already defaults to UTF-8, so that attempt could never succeed;
        # fall back straight to latin-1, which decodes any byte sequence.
        return pd.read_csv(fname, encoding="ISO-8859-1", **kwargs)
def read_shp(fname):
    """
    Returns a pandas dataframe of the attributes in a shapefile's dbf,
    specified as a file path/name.

    The geometry column is dropped; two columns are prepended instead:
    'Shape' (the shapefile's geometry type from its schema) and 'FID'
    (a 0-based sequential feature id).

    Parameters
    ----------
    fname : str
        file path/name to the shapefile being returned

    Returns
    -------
    pandas dataframe
    """
    df = gpd.read_file(fname)
    # fiona is used only to look up the geometry type from the schema.
    # A context manager ensures the collection is closed (the original
    # leaked the handle and also computed an unused list of field names).
    with fiona.open(fname) as collection:
        geometry_type = collection.schema["geometry"]
    df = df[[col for col in df.columns if col != "geometry"]]
    df.insert(0, "Shape", geometry_type)
    df.insert(0, "FID", range(df.shape[0]))
    return df
def dbfreader(f):
    """Returns an iterator over records in a Xbase DBF file.

    The first row returned contains the field names (as bytes).
    The second row contains field specs: (type, size, decimal places).
    Subsequent rows contain the data records.
    If a record is marked as deleted, it is skipped.
    File should be opened for binary reads.

    originally taken from:
    http://code.activestate.com/recipes/362715-dbf-reader-and-writer/
    See DBF format spec at:
    http://www.pgts.com.au/download/public/xbase.htm#DBF_STRUCT
    """
    # Header: record count and header length locate the field descriptors.
    numrec, lenheader = struct.unpack("<xxxxLH22x", f.read(32))
    numfields = (lenheader - 33) // 32
    fields = []
    for fieldno in range(numfields):
        name, typ, size, deci = struct.unpack("<11sc4xBB14x", f.read(32))
        name = bytes(name).replace(b"\0", b"")  # eliminate NULs from string
        fields.append((name, typ, size, deci))
    yield [field[0] for field in fields]
    yield [tuple(field[1:]) for field in fields]

    terminator = f.read(1)
    assert terminator == b"\r"

    # A 1-byte deletion flag precedes every record.
    fields.insert(0, ("DeletionFlag", "C", 1, 0))
    fmt = "".join("%ds" % fieldinfo[2] for fieldinfo in fields)
    fmtsiz = struct.calcsize(fmt)
    for i in range(numrec):
        record = struct.unpack(fmt, f.read(fmtsiz))
        if record[0] != b" ":
            continue  # deleted record
        result = []
        for (name, typ, size, deci), value in zip(fields, record):
            value = bytes(value)
            if name == "DeletionFlag":
                continue
            if typ == b"N":
                value = value.replace(b"\0", b"").lstrip()
                # BUG FIX: the original compared bytes to str
                # (``value == ""``), which is never true in Python 3, so
                # blank numeric cells fell through to ``int(b"")`` and
                # raised ValueError.
                if value == b"":
                    value = 0
                elif deci:
                    # Decimal() does not accept bytes; decode first.
                    value = decimal.Decimal(value.decode("ascii"))
                else:
                    value = int(value)
            if typ == b"C":
                value = value.decode("utf-8")
            elif typ == b"D":
                y, m, d = int(value[:4]), int(value[4:6]), int(value[6:8])
                value = datetime.date(y, m, d)
            elif typ == b"L":
                value = (
                    (value in b"YyTt" and b"T") or (value in b"NnFf" and b"F") or b"?"
                )
            elif typ == b"F":
                value = float(value)
            result.append(value)
        yield result
def read_dbf(fname):
    """
    Returns a pandas dataframe of the dbf
    specified as a file path/name

    Parameters
    ----------
    fname : str
        file path/name to the dbf being returned

    Returns
    -------
    pandas dataframe
    """
    # Context manager guarantees the file handle is closed (the original
    # opened the file and never closed it).
    with open(fname, "rb") as f:
        vat = list(dbfreader(f))
    # Row 0 holds the (bytes) field names, row 1 the field specs;
    # data records start at row 2.
    return pd.DataFrame(vat[2:], columns=[c.decode("utf-8") for c in vat[0]])
def get_sheet_names(fname):
    """
    Returns list of sheets in an Excel file

    Parameters
    ----------
    fname : str
        file path/name to the Excel file being returned

    Returns
    -------
    list of strings
    """
    # Imported locally so the dependency is only needed when Excel
    # files are actually inspected.
    import xlrd as xl

    return xl.open_workbook(fname).sheet_names()
def read_excel(fname, sheet_name):
    """
    Returns a pandas dataframe of an Excel file and sheet

    Parameters
    ----------
    fname : str
        file path/name to the Excel file
    sheet_name : str
        name of the worksheet to read

    Returns
    -------
    pandas dataframe
    """
    return pd.read_excel(fname, sheet_name)
def read_data(fname, sheet_name="", delimiter=","):
    """
    Returns pandas dataframe from a file (csv, txt, Excel, or shp)

    Parameters
    ----------
    fname : str
        file path/name to the Excel file
    sheet_name : str, optional
        sheet name (Excel files only)
    delimiter : str, optional
        the character used to delimit the data in a txt file

    Returns
    -------
    pandas dataframe
    """
    # Dispatch on the (case-insensitive) file extension; Excel is the
    # fallback when a sheet name was supplied.
    lowered = fname.lower()
    if lowered.endswith(".csv"):
        return read_csv(fname)
    if lowered.endswith(".txt"):
        return read_csv(fname, delimiter)
    if lowered.endswith(".shp"):
        return read_shp(fname)
    if sheet_name:
        return read_excel(fname, sheet_name)
def sniff_nodata(series):
    """
    Attempt to guess the nodata value associated with a series

    Candidate placeholders are checked in a fixed priority order against
    the distinct values in the series; the first match is returned.

    Parameters
    ----------
    series : pandas series

    Returns
    -------
    str or int : the nodata placeholder in the series, or None if no
    known placeholder is present
    """
    # BUG FIX: pandas Series has no ``uniques()`` method, so the original
    # raised AttributeError on every call; ``unique()`` is the correct API.
    # Materialize the list once instead of per candidate.
    uniques = list(series.unique())
    for nd in [
        "#N/A",
        "#N/A N/A",
        "#NA",
        "-1.#IND",
        "-1.#QNAN",
        "-NaN",
        "-nan",
        "1.#IND",
        "1.#QNAN",
        "N/A",
        "NA",
        "NULL",
        "NaN",
        "n/a",
        "nan",
        "null",
        -9999,
        "-9999",
        "",
        "Nan",
    ]:
        if nd in uniques:
            return nd
    return None
def clean_nodata(series, nodata=None):
    """
    Given a series remove the values that match the specified nodata value
    and convert it to an int or float if possible

    Parameters
    ----------
    series : pandas series
    nodata : string, int, or float Nodata placeholder; when None the
        series is returned unchanged

    Returns
    -------
    pandas series
    """
    if nodata is None:
        return series

    filtered = series[series != nodata]
    # Try the strictest numeric coercion first, then fall back; if
    # neither applies, the filtered series is returned untouched.
    for dtype in ("int64", "float64"):
        try:
            return filtered.astype(dtype)
        except ValueError:
            continue
    return filtered