## awswrangler

https://pypi.org/project/awswrangler/

### Quick Start

`pip install awswrangler`

In [1]:
import awswrangler as wr
import pandas as pd

In [2]:
!aws s3 ls

2020-09-06 16:34:18 aws-glue-scripts-578247465916-us-east-1
2020-09-06 16:34:18 aws-glue-temporary-578247465916-us-east-1
2020-02-29 11:48:57 elasticbeanstalk-us-east-1-578247465916
2020-07-26 00:24:21 wengong-cloud9
2020-02-08 15:18:43 wengong-lambda101
2020-08-25 22:29:42 wengong-redshift
2020-05-03 14:45:18 wengong-samocr
2020-05-03 15:46:39 wengong-samocrimg
2020-05-10 12:39:24 wildrydes-data-wengong
2020-02-26 12:01:36 wildrydes-wengong-backend
2020-02-26 12:01:10 wildrydes-wengong-frontend


#### Create Table in Athena

https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html

In [5]:
df = pd.DataFrame({"id": [1, 2], "value": ["foo", "boo"]})

In [9]:
# create below table in AWS Glue manually

bucket = "wengong-redshift"
database = "sampledb"
tablename = "table1"

s3path = f"s3://{bucket}/dataset/"

In [8]:
# Storing data on Data Lake
wr.s3.to_parquet(
    df=df,
    path=s3path,
    dataset=True,
    database=database,
    table=tablename
)

{'paths': ['s3://wengong-redshift/dataset/382d1ad4598546018e1982f722be213a.snappy.parquet'],
 'partitions_values': {}}

In [10]:
# Retrieving the data directly from Amazon S3
df = wr.s3.read_parquet(s3path, dataset=True)

In [11]:
df

Unnamed: 0,id,value
0,1,foo
1,2,boo
2,1,foo
3,2,boo


In [12]:
# Retrieving the data from Amazon Athena
df2 = wr.athena.read_sql_query(f"SELECT * FROM {tablename}", database=database)

In [13]:
df2

Unnamed: 0,id,value
0,1,foo
1,2,boo
2,1,foo
3,2,boo


### Read The Docs

https://aws-data-wrangler.readthedocs.io/