-
Notifications
You must be signed in to change notification settings - Fork 11
/
client.py
355 lines (290 loc) · 11.4 KB
/
client.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
import logging
import os
import tdclient
from .query_engine import HiveQueryEngine, PrestoQueryEngine, QueryEngine
from .table import Table
logger = logging.getLogger(__name__)
class Client(object):
"""Treasure Data client interface.
A client instance establishes a connection to Treasure Data. This interface
gives easy and efficient access to Presto/Hive query engine and Plazma
primary storage.
Parameters
----------
apikey : str, optional
Treasure Data API key. If not given, a value of environment variable
``TD_API_KEY`` is used by default.
endpoint : str, optional
Treasure Data API server. If not given, ``https://api.treasuredata.com`` is
used by default. List of available endpoints is:
https://docs.treasuredata.com/display/public/PD/Sites+and+Endpoints
database : str, default: 'sample_datasets'
Name of connected database.
default_engine : str, {'presto', 'hive'}, or \
:class:`pytd.query_engine.QueryEngine`, default: 'presto'
Query engine. If a QueryEngine instance is given, ``apikey``,
``endpoint``, and ``database`` are overwritten by the values configured
in the instance.
header : str or bool, default: `True`
Prepend comment strings, in the form "-- comment", as a header of queries.
Set False to disable header.
Attributes
----------
api_client : :class:`tdclient.Client`
Connection to Treasure Data.
query_executed : str or :class:`prestodb.client.PrestoResult`, default: `None`
Query execution result returned from DB-API Cursor object.
Examples
---------
Presto query executed via ``prestodb`` returns ``PrestoResult`` object:
>>> import pytd
>>> client = pytd.Client()
>>> client.query_executed
>>> client.query('select 1')
>>> client.query_executed
<prestodb.client.PrestoResult object at 0x10b9826a0>
Meanwhile, ``tdclient`` runs a job on Treasure Data, and Cursor returns
its job id:
>>> client.query('select 1', priority=0)
>>> client.query_executed
'669563342'
Note that the optional argument ``priority`` forces the client to query
via tdclient.
"""
def __init__(
self,
apikey=None,
endpoint=None,
database="sample_datasets",
default_engine="presto",
header=True,
**kwargs,
):
if isinstance(default_engine, QueryEngine):
apikey = default_engine.apikey
endpoint = default_engine.endpoint
database = default_engine.database
else:
apikey = apikey or os.environ.get("TD_API_KEY")
if apikey is None:
raise ValueError(
"either argument 'apikey' or environment variable"
"'TD_API_KEY' should be set"
)
if endpoint is None:
endpoint = os.getenv("TD_API_SERVER", "https://api.treasuredata.com")
default_engine = self._fetch_query_engine(
default_engine, apikey, endpoint, database, header
)
self.apikey = apikey
self.endpoint = endpoint
self.database = database
self.default_engine = default_engine
self.query_executed = None
self.api_client = tdclient.Client(
apikey=apikey,
endpoint=endpoint,
user_agent=default_engine.user_agent,
**kwargs,
)
def list_databases(self):
"""Get a list of td-client-python Database objects.
Returns
-------
list of :class:`tdclient.models.Database`
"""
return self.api_client.databases()
def list_tables(self, database=None):
"""Get a list of td-client-python Table objects.
Parameters
----------
database : string, optional
Database name. If not give, list tables in a table associated with
this :class:`pytd.Client` instance.
Returns
-------
list of :class:`tdclient.models.Table`
"""
if database is None:
database = self.database
return self.api_client.tables(database)
def list_jobs(self):
"""Get a list of td-client-python Job objects.
Returns
-------
list of :class:`tdclient.models.Job`
"""
return self.api_client.jobs()
def get_job(self, job_id):
"""Get a td-client-python Job object from ``job_id``.
Parameters
----------
job_id : integer
Job ID.
Returns
-------
:class:`tdclient.models.Job`
"""
return self.api_client.job(job_id)
def close(self):
"""Close a client I/O session to Treasure Data."""
self.default_engine.close()
self.api_client.close()
def query(self, query, engine=None, **kwargs):
"""Run query and get results.
Executed result stored in ``QueryEngine`` is retained in
``self.query_executed``.
Parameters
----------
query : str
Query issued on a specified query engine.
engine : str, {'presto', 'hive'}, or \
:class:`pytd.query_engine.QueryEngine`, optional
Query engine. If not given, default query engine created in the
constructor will be used.
**kwargs
Treasure Data-specific optional query parameters. Giving these
keyword arguments forces query engine to issue a query via Treasure
Data REST API provided by ``tdclient``; that is, if ``engine`` is
Presto, you cannot enjoy efficient direct access to the query
engine provided by ``prestodb``.
- ``db`` (str): use the database
- ``result_url`` (str): result output URL
- ``priority`` (int or str): priority
- -2: "VERY LOW"
- -1: "LOW"
- 0: "NORMAL"
- 1: "HIGH"
- 2: "VERY HIGH"
- ``retry_limit`` (int): max number of automatic retries
- ``wait_interval`` (int): sleep interval until job finish
- ``wait_callback`` (function): called every interval against job itself
- ``engine_version`` (str): run query with Hive 2 if this parameter
is set to ``"stable"`` and ``engine`` denotes Hive.
https://docs.treasuredata.com/display/public/PD/Writing+Hive+Queries
Meanwhile, when a following argument is set to ``True``, query is
deterministically issued via ``tdclient``.
- ``force_tdclient`` (bool): force Presto engines to issue a query
via ``tdclient`` rather than its default ``prestodb`` interface.
Returns
-------
dict : keys ('data', 'columns')
'data'
List of rows. Every single row is represented as a list of
column values.
'columns'
List of column names.
"""
self.query_executed = None
if isinstance(engine, QueryEngine):
pass # use the given QueryEngine instance
elif isinstance(engine, str):
if (
engine == "presto"
and isinstance(self.default_engine, PrestoQueryEngine)
) or (
engine == "hive" and isinstance(self.default_engine, HiveQueryEngine)
):
engine = self.default_engine
else:
engine = self._fetch_query_engine(
engine,
self.apikey,
self.endpoint,
self.database,
self.default_engine.header,
)
else:
engine = self.default_engine
header = engine.create_header("Client#query")
res = engine.execute(header + query, **kwargs)
self.query_executed = engine.executed
return res
def get_table(self, database, table):
"""Create a pytd table control instance.
Parameters
----------
database : str
Database name.
table : str
Table name.
Returns
-------
:class:`pytd.table.Table`
"""
return Table(self, database, table)
def exists(self, database, table=None):
"""Check if a database and table exists.
Parameters
----------
database : str
Database name.
table : str, optional
Table name. If not given, just check the database existence.
Returns
-------
bool
"""
try:
tbl = self.get_table(database, table)
except ValueError:
return False
if table is None:
return True
return tbl.exists
def create_database_if_not_exists(self, database):
"""Create a database on Treasure Data if it does not exist.
Parameters
----------
database : str
Database name.
"""
if self.exists(database):
logger.info(f"database `{database}` already exists")
else:
self.api_client.create_database(database)
logger.info(f"created database `{database}`")
def load_table_from_dataframe(
self, dataframe, destination, writer="bulk_import", if_exists="error", **kwargs
):
"""Write a given DataFrame to a Treasure Data table.
This function may initialize a Writer instance. Note that, as a part of
the initialization process for SparkWriter, the latest version of
td-spark will be downloaded.
Parameters
----------
dataframe : :class:`pandas.DataFrame`
Data loaded to a target table.
destination : str, or :class:`pytd.table.Table`
Target table.
writer : str, {'bulk_import', 'insert_into', 'spark'}, or \
:class:`pytd.writer.Writer`, default: 'bulk_import'
A Writer to choose writing method to Treasure Data. If not given or
string value, a temporal Writer instance will be created.
if_exists : str, {'error', 'overwrite', 'append', 'ignore'}, default: 'error'
What happens when a target table already exists.
- error: raise an exception.
- overwrite: drop it, recreate it, and insert data.
- append: insert data. Create if does not exist.
- ignore: do nothing.
"""
if isinstance(destination, str):
if "." in destination:
database, table = destination.split(".")
else:
database, table = self.database, destination
destination = self.get_table(database, table)
destination.import_dataframe(dataframe, writer, if_exists, **kwargs)
def __enter__(self):
return self
def __exit__(self, exception_type, exception_value, traceback):
self.close()
def _fetch_query_engine(self, engine, apikey, endpoint, database, header):
if engine == "presto":
return PrestoQueryEngine(apikey, endpoint, database, header)
elif engine == "hive":
return HiveQueryEngine(apikey, endpoint, database, header)
else:
raise ValueError(
'`engine` should be "presto" or "hive", or actual QueryEngine instance'
)