# OTA Demo
## Imports

In [None]:
import os

import pyspark.sql

## Configuration
### Paths

In [None]:
# Source of the meta-review data: a dated snapshot folder in the S3 bucket.
# Update this to the latest dated folder available in the bucket.
META_REVIEW_S3_URI = "s3a://trustyou-api/meta-review/2021-02-22_23-34-19/"
# ORC copy of the snapshot on HDFS, used as a cache after the first dump.
META_REVIEW_DUMP_PATH = "hdfs:///user/metareview/ota_demo/meta_review_dump.orc"
# Python environment archive built with `conda pack`; the "#cluster_venv"
# fragment is the alias the archive is unpacked under on the cluster.
ARCHIVES_PATH = "hdfs:///user/metareview/ota_demo_support/ota_demo_env.tar.gz#cluster_venv"

### Environment Variables

In [None]:
# Point PySpark at the Python interpreter inside the "cluster_venv"
# directory (relative path) rather than whatever Python is the default.
os.environ.update({"PYSPARK_PYTHON": "./cluster_venv/bin/python"})

## Create Spark Session

In [None]:
# Build (or reuse) the YARN-backed Spark session used by the rest of the demo.
spark_builder = (
    pyspark.sql.SparkSession.builder.master("yarn")
    .appName("OTA Demo")
    # Executor sizing; dynamic allocation is capped at 9 executors.
    .config("spark.executor.memory", "3g")
    .config("spark.executor.cores", "3")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
    .config("spark.dynamicAllocation.maxExecutors", "9")
    # SQL engine settings.
    .config("spark.sql.orc.filterPushdown", "false")
    .config("spark.sql.shuffle.partitions", "400")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Ship the packed Python environment to the YARN containers.
    .config("spark.yarn.dist.archives", ARCHIVES_PATH)
)

# os.getenv() returns None for an unset variable. Passing that to .config()
# would not yield a usable credential, so the S3A keys are only forwarded
# when both are present; otherwise the S3A connector's own credential
# lookup is left in charge (and reading the HDFS cache needs no S3 access).
aws_access_key = os.getenv("AWS_ACCESS_KEY")
aws_secret_key = os.getenv("AWS_SECRET_KEY")
if aws_access_key and aws_secret_key:
    spark_builder = (
        spark_builder
        .config("spark.hadoop.fs.s3a.access.key", aws_access_key)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key)
    )

spark = spark_builder.getOrCreate()

## Extract Meta-Review Data
### Run the dump once; after that, use the cache file
#### Dump from S3

In [None]:
# Load the raw meta-review JSON from S3; no schema is supplied, so Spark
# infers it from the data.
meta_review_df = spark.read.format("json").load(META_REVIEW_S3_URI)

#### Write to HDFS cache file

In [None]:
# Persist the DataFrame to HDFS as ORC. No save mode is set, so Spark's
# default applies and this raises if the target path already exists.
meta_review_df.write.format("orc").save(META_REVIEW_DUMP_PATH)

## Read in the meta review from the HDFS cache file

In [None]:
# Rebind meta_review_df to the ORC copy on HDFS so the following cells
# work from the cache file instead of the S3 source.
meta_review_df = spark.read.format("orc").load(META_REVIEW_DUMP_PATH)

### Inferred Schema

In [None]:
# Print the DataFrame's column names and types as a tree.
meta_review_df.printSchema()

### Stats

In [None]:
# count() is a Spark action: it runs a job over the whole DataFrame.
total_records = meta_review_df.count()
print(f"Total number of records: {total_records}")

## Shutdown Spark Session

In [None]:
# Stop the Spark session once the notebook is done with it.
spark.stop()