-
Notifications
You must be signed in to change notification settings - Fork 706
/
Copy pathtyping.py
273 lines (231 loc) · 10.9 KB
/
typing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
"""Module with parameter types."""
from __future__ import annotations
from typing import List, Literal, NamedTuple, Tuple, TypedDict
import pyarrow
from typing_extensions import NotRequired, Required
# Bucketing specification: the names of the columns used for bucketing as the
# first element and the number of buckets as the second element.
BucketingInfoTuple = Tuple[List[str], int]
class GlueTableSettings(TypedDict):
"""Typed dictionary defining the settings for the Glue table."""
table_type: NotRequired[Literal["EXTERNAL_TABLE"]]
"""The type of the Glue Table. Set to EXTERNAL_TABLE if None."""
description: NotRequired[str]
"""Glue/Athena catalog: Table description"""
parameters: NotRequired[dict[str, str]]
"""Glue/Athena catalog: Key/value pairs to tag the table."""
columns_comments: NotRequired[dict[str, str]]
"""
Columns names and the related comments
(e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
"""
columns_parameters: NotRequired[dict[str, dict[str, str]]]
"""
Columns names and the related parameters
(e.g. {'col0': {'par0': 'Param 0', 'par1': 'Param 1'}}).
"""
regular_partitions: NotRequired[bool]
"""
Create regular partitions (Non projected partitions) on Glue Catalog.
Disable when you will work only with Partition Projection.
Keep enabled even when working with projections is useful to keep
Redshift Spectrum working with the regular partitions.
"""
class AthenaCTASSettings(TypedDict):
"""Typed dictionary defining the settings for using CTAS (Create Table As Statement)."""
database: NotRequired[str]
"""
The name of the alternative database where the CTAS temporary table is stored.
If None, the default `database` is used.
"""
temp_table_name: NotRequired[str]
"""
The name of the temporary table and also the directory name on S3 where the CTAS result is stored.
If None, it will use the follow random pattern: `f"temp_table_{uuid.uuid4().hex()}"`.
On S3 this directory will be under under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`.
"""
bucketing_info: NotRequired[BucketingInfoTuple]
"""
Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
second element.
Only `str`, `int` and `bool` are supported as column data types for bucketing.
"""
compression: NotRequired[str]
"""
Write compression for the temporary table where the CTAS result is stored.
Corresponds to the `write_compression` parameters for CREATE TABLE AS statement in Athena.
"""
class AthenaUNLOADSettings(TypedDict):
"""Typed dictionary defining the settings for using UNLOAD."""
file_format: NotRequired[str]
"""
Specifies the file format of the output. Only `PARQUET` is currently supported.
"""
compression: NotRequired[str]
"""
This option is specific to the ORC and Parquet formats.
For ORC, possible values are lz4, snappy, zlib, or zstd.
For Parquet, possible values are gzip or snappy. For ORC, the default is zlib, and for Parquet, the default is gzip.
"""
field_delimiter: NotRequired[str]
"""
Specifies a single-character field delimiter for files in CSV, TSV, and other text formats.
"""
partitioned_by: NotRequired[list[str]]
"""
A list of columns by which the output is partitioned.
"""
class AthenaCacheSettings(TypedDict):
"""Typed dictionary defining the settings for using cached Athena results."""
max_cache_seconds: NotRequired[int]
"""
awswrangler can look up in Athena's history if this table has been read before.
If so, and its completion time is less than `max_cache_seconds` before now, awswrangler
skips query execution and just returns the same results as last time.
"""
max_cache_query_inspections: NotRequired[int]
"""
Max number of queries that will be inspected from the history to try to find some result to reuse.
The bigger the number of inspection, the bigger will be the latency for not cached queries.
Only takes effect if max_cache_seconds > 0.
"""
max_remote_cache_entries: NotRequired[int]
"""
Max number of queries that will be retrieved from AWS for cache inspection.
The bigger the number of inspection, the bigger will be the latency for not cached queries.
Only takes effect if max_cache_seconds > 0 and default value is 50.
"""
max_local_cache_entries: NotRequired[int]
"""
Max number of queries for which metadata will be cached locally. This will reduce the latency and also
enables keeping more than `max_remote_cache_entries` available for the cache. This value should not be
smaller than max_remote_cache_entries.
Only takes effect if max_cache_seconds > 0 and default value is 100.
"""
class AthenaPartitionProjectionSettings(TypedDict):
"""
Typed dictionary defining the settings for Athena Partition Projection.
https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html
"""
projection_types: NotRequired[dict[str, Literal["enum", "integer", "date", "injected"]]]
"""
Dictionary of partitions names and Athena projections types.
Valid types: "enum", "integer", "date", "injected"
https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
(e.g. {'col_name': 'enum', 'col2_name': 'integer'})
"""
projection_ranges: NotRequired[dict[str, str]]
"""
Dictionary of partitions names and Athena projections ranges.
https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
(e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'})
"""
projection_values: NotRequired[dict[str, str]]
"""
Dictionary of partitions names and Athena projections values.
https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
(e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'})
"""
projection_intervals: NotRequired[dict[str, str]]
"""
Dictionary of partitions names and Athena projections intervals.
https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
(e.g. {'col_name': '1', 'col2_name': '5'})
"""
projection_digits: NotRequired[dict[str, str]]
"""
Dictionary of partitions names and Athena projections digits.
https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
(e.g. {'col_name': '1', 'col2_name': '2'})
"""
projection_formats: NotRequired[dict[str, str]]
"""
Dictionary of partitions names and Athena projections formats.
https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
(e.g. {'col_date': 'yyyy-MM-dd', 'col2_timestamp': 'yyyy-MM-dd HH:mm:ss'})
"""
projection_storage_location_template: NotRequired[str]
"""
Value which is allows Athena to properly map partition values if the S3 file locations do not follow
a typical `.../column=value/...` pattern.
https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html
(e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/)
"""
class TimestreamBatchLoadReportS3Configuration(TypedDict):
"""
Report configuration for a batch load task. This contains details about where error reports are stored.
https://docs.aws.amazon.com/timestream/latest/developerguide/API_ReportS3Configuration.html
"""
BucketName: Required[str]
"""
The name of the bucket where the error reports are stored.
"""
ObjectKeyPrefix: NotRequired[str]
"""
Optional S3 prefix for the error reports.
"""
Encryption: NotRequired[Literal["SSE_S3", "SSE_KMS"]]
"""
Optional encryption type for the error reports. SSE_S3 by default.
"""
KmsKeyId: NotRequired[str]
"""
Optional KMS key ID for the error reports.
"""
class ArrowDecryptionConfiguration(TypedDict):
    """Configuration for Arrow file decrypting."""

    crypto_factory: pyarrow.parquet.encryption.CryptoFactory
    """Crypto factory for encrypting and decrypting columns.
    see: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.encryption.CryptoFactory.html"""

    kms_connection_config: pyarrow.parquet.encryption.KmsConnectionConfig
    """Configuration of the connection to the Key Management Service (KMS).
    see: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.encryption.KmsConnectionConfig.html"""
class ArrowEncryptionConfiguration(TypedDict):
    """Configuration for Arrow file encrypting."""

    crypto_factory: pyarrow.parquet.encryption.CryptoFactory
    """Crypto factory for encrypting and decrypting columns.
    see: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.encryption.CryptoFactory.html"""

    kms_connection_config: pyarrow.parquet.encryption.KmsConnectionConfig
    """Configuration of the connection to the Key Management Service (KMS).
    see: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.encryption.KmsConnectionConfig.html"""

    encryption_config: pyarrow.parquet.encryption.EncryptionConfiguration
    """Configuration of the encryption, such as which columns to encrypt.
    see: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.encryption.EncryptionConfiguration.html
    """
class RaySettings(TypedDict):
"""Typed dictionary defining the settings for distributing calls using Ray."""
parallelism: NotRequired[int]
"""
The requested parallelism of the read.
Parallelism may be limited by the number of files of the dataset.
Auto-detect by default.
"""
override_num_blocks: NotRequired[int]
"""
Override the number of output blocks from all read tasks.
By default, the number of output blocks is dynamically decided based on
input data size and available resources. You shouldn't manually set this
value in most cases.
"""
class RayReadParquetSettings(RaySettings):
    """Settings for distributing reading calls using Ray, expressed as a typed dictionary."""

    bulk_read: NotRequired[bool]
    """
    Set to True to enable faster reading of a large number of Parquet files.
    Performance improves because the file metadata is not gathered in a single node.
    The drawback is that no schema resolution is offered, so it should only be used when
    all the Parquet files are uniform.
    """
class _S3WriteDataReturnValue(TypedDict):
"""Typed dictionary defining the dictionary returned by S3 write functions."""
paths: Required[list[str]]
"""List of all stored files paths on S3."""
partitions_values: Required[dict[str, list[str]]]
"""
Dictionary of partitions added with keys as S3 path locations
and values as a list of partitions values as str.
"""
class _ReadTableMetadataReturnValue(NamedTuple):
    """Return value of the ``read_*_metadata`` functions, expressed as a named tuple."""

    columns_types: dict[str, str]
    """Mapping of column names to their types."""

    partitions_types: dict[str, str] | None
    """Mapping of partition names to their types, if partitioned."""