initial commit
A139834 committed Jun 4, 2018
1 parent dddfdbe commit d330df8
Showing 140 changed files with 22,312 additions and 1 deletion.
12 changes: 12 additions & 0 deletions .gitignore
@@ -0,0 +1,12 @@
target
project/target
spark-warehouse
**/.DS_Store
tmp
makefile
.idea/
project/project/
spark/
*.java-version
*~
derby.log
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,5 @@
## Change Log

# 1.0.0

- initial release
83 changes: 83 additions & 0 deletions Dockerfile
@@ -0,0 +1,83 @@
FROM openjdk:8u151-jdk-alpine3.7

# Spark version
ENV SPARK_VERSION 2.3.0
ENV HADOOP_VERSION 2.7
ENV SPARK_HOME /opt/spark
ENV SPARK_CHECKSUM_URL https://www.apache.org/dist/spark
ENV SPARK_DOWNLOAD_URL https://www-us.apache.org/dist/spark
ENV GLIBC_VERSION 2.26-r0

# install glibc compatibility libraries and Spark
RUN set -ex && \
apk upgrade --update && \
apk add --update libstdc++ curl ca-certificates bash openblas && \
for pkg in glibc-${GLIBC_VERSION} glibc-bin-${GLIBC_VERSION} glibc-i18n-${GLIBC_VERSION}; do curl -sSL https://github.com/andyshinn/alpine-pkg-glibc/releases/download/${GLIBC_VERSION}/${pkg}.apk -o /tmp/${pkg}.apk; done && \
apk add --allow-untrusted /tmp/*.apk && \
rm -v /tmp/*.apk && \
( /usr/glibc-compat/bin/localedef --force --inputfile POSIX --charmap UTF-8 C.UTF-8 || true ) && \
echo "export LANG=C.UTF-8" > /etc/profile.d/locale.sh && \
/usr/glibc-compat/sbin/ldconfig /lib /usr/glibc-compat/lib && \
mkdir -p ${SPARK_HOME} && \
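# fetch the published .sha512 file (grouped uppercase hex), flatten it to a single lowercase digest, then verify the downloaded tarball against it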
curl --show-error --location --output spark.sha \
${SPARK_CHECKSUM_URL}/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz.sha512 && \
export SPARK_SHA512_SUM=$(grep -o "[A-F0-9]\{8\}" spark.sha | awk '{print}' ORS='' | tr '[:upper:]' '[:lower:]') && \
rm -f spark.sha && \
curl --show-error --location --output spark.tar.gz ${SPARK_DOWNLOAD_URL}/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
echo "${SPARK_SHA512_SUM} spark.tar.gz" | sha512sum -c - && \
gunzip -c spark.tar.gz | tar -xf - -C $SPARK_HOME --strip-components=1 && \
rm -f spark.tar.gz

# spark extensions
RUN curl --show-error --location --output ${SPARK_HOME}/jars/spark-xml_2.11-0.4.1.jar \
https://repo.maven.apache.org/maven2/com/databricks/spark-xml_2.11/0.4.1/spark-xml_2.11-0.4.1.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/spark-avro_2.11-4.0.0.jar \
https://repo.maven.apache.org/maven2/com/databricks/spark-avro_2.11/4.0.0/spark-avro_2.11-4.0.0.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/spark-redshift_2.11-3.0.0-preview1.jar \
https://repo.maven.apache.org/maven2/com/databricks/spark-redshift_2.11/3.0.0-preview1/spark-redshift_2.11-3.0.0-preview1.jar && \
# aws hadoop
curl --show-error --location --output ${SPARK_HOME}/jars/hadoop-aws-2.7.4.jar \
https://repo.maven.apache.org/maven2/org/apache/hadoop/hadoop-aws/2.7.4/hadoop-aws-2.7.4.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/aws-java-sdk-1.11.197.jar \
https://repo.maven.apache.org/maven2/com/amazonaws/aws-java-sdk/1.11.197/aws-java-sdk-1.11.197.jar && \
# azure hadoop
curl --show-error --location --output ${SPARK_HOME}/jars/hadoop-azure-2.7.4.jar \
https://repo.maven.apache.org/maven2/org/apache/hadoop/hadoop-azure/2.7.4/hadoop-azure-2.7.4.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/azure-storage-3.1.0.jar \
https://repo.maven.apache.org/maven2/com/microsoft/azure/azure-storage/3.1.0/azure-storage-3.1.0.jar && \
# azure eventhub
curl --show-error --location --output ${SPARK_HOME}/jars/azure-eventhubs-1.0.1.jar \
https://repo.maven.apache.org/maven2/com/microsoft/azure/azure-eventhubs/1.0.1/azure-eventhubs-1.0.1.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/proton-j-0.26.0.jar \
https://repo.maven.apache.org/maven2/org/apache/qpid/proton-j/0.26.0/proton-j-0.26.0.jar && \
# databases
curl --show-error --location --output ${SPARK_HOME}/jars/mssql-jdbc-6.1.1.jre8.jar \
https://github.com/Microsoft/mssql-jdbc/releases/download/v6.1.1/mssql-jdbc-6.1.1.jre8.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/postgresql-42.2.2.jar \
https://jdbc.postgresql.org/download/postgresql-42.2.2.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/spark-cassandra-connector_2.11-2.0.5.jar \
https://repo.maven.apache.org/maven2/com/datastax/spark/spark-cassandra-connector_2.11/2.0.5/spark-cassandra-connector_2.11-2.0.5.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/mysql-connector-java-5.1.45.jar \
https://repo.maven.apache.org/maven2/mysql/mysql-connector-java/5.1.45/mysql-connector-java-5.1.45.jar && \
# geospark
curl --show-error --location --output ${SPARK_HOME}/jars/geospark-1.1.1.jar \
https://repo.maven.apache.org/maven2/org/datasyslab/geospark/1.1.1/geospark-1.1.1.jar && \
curl --show-error --location --output ${SPARK_HOME}/jars/geospark-sql_2.3-1.1.1.jar \
https://repo.maven.apache.org/maven2/org/datasyslab/geospark-sql_2.3/1.1.1/geospark-sql_2.3-1.1.1.jar && \
apk del curl

# copy in etl library
COPY target/scala-2.11/arc.jar /opt/spark/jars/arc.jar

# copy in tutorial
COPY tutorial /opt/tutorial
RUN chmod +x /opt/tutorial/nyctaxi/download_raw_data_small.sh
RUN chmod +x /opt/tutorial/nyctaxi/download_raw_data_large.sh

# copy in log4j.properties config file
COPY log4j.properties /opt/spark/conf/log4j.properties

WORKDIR $SPARK_HOME
# EOF


161 changes: 160 additions & 1 deletion README.md
@@ -1 +1,160 @@
# arc
Arc is an opinionated framework for defining data pipelines which are predictable, repeatable and manageable.

- [What is Arc?](#what-is-arc)
- [Principles](#principles)
- [Not just for data engineers](#not-just-for-data-engineers)
- [Why abstract from code?](#why-abstract-from-code)
- [Why SQL first?](#why-sql-first)
- [Example pipeline](#example-pipeline)
- [Contributing](#contributing)
- [Attribution](#attribution)
- [License](#license)

## What is Arc?

Arc is an **opinionated** framework for defining **predictable**, **repeatable** and **manageable** data transformation pipelines:

- **predictable** in that data is used to define transformations - not code.
- **repeatable** in that if a job is executed multiple times it will produce the same result.
- **manageable** in that execution considerations and logging have been baked in from the start.

## Principles

Many of these principles have come from [12factor](https://12factor.net/):

- **[single responsibility](https://en.wikipedia.org/wiki/Single_responsibility_principle)** components/stages.
- **stateless** jobs where possible and use of [immutable](https://en.wikipedia.org/wiki/Immutable_object) datasets.
- **precise logging** to allow management of jobs at scale.
- **library dependencies** are to be limited or avoided where possible.

## Not just for data engineers

The intent of the framework is to provide a simple way of creating Extract-Transform-Load (ETL) pipelines that can be maintained in production and that transparently capture the answers to simple operational questions for the user:

- **monitoring**: is the job working each time it is run, and how much resource was consumed producing the result?

These concerns are supported at run time to ensure that, as the deployment grows in use and complexity, it does not become opaque and unmanageable.

## Why abstract from code?

In our experience, a very high proportion of data pipelines perform very similar extract, transform and load actions on datasets. Unfortunately, whilst the desired outcomes are largely similar, the implementations vary wildly, resulting in higher maintenance costs, lower test coverage and high levels of rework.

The intention of this project is to define and implement an **opinionated**, open and extensible standard approach for declaring data pipelines. Abstraction from the underlying code allows rapid deployment, a consistent way of defining transformation tasks (such as data typing), and separation of the pipeline definition from the pipeline execution so that the underlying execution engine can change - see [declarative programming](https://en.wikipedia.org/wiki/Declarative_programming).

Currently it is tightly coupled to [Apache Spark](https://spark.apache.org) due to its fault tolerance, performance and solid API for standard data engineering tasks, but the definitions are human- and machine-readable [HOCON](https://github.com/lightbend/config/blob/master/HOCON.md) (a JSON superset), allowing the transformation definitions to be executed against future execution engines.
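
As an illustration of the machine-readable side only (this is not arc's internal API), a job definition in this format can be parsed with the [Lightbend Config](https://github.com/lightbend/config) library that defines HOCON. In the sketch below the stage fields mirror the example pipeline further down, and `${?ETL_CONF_BASE_URL}` is an optional substitution that falls back to an environment variable when resolved:

```scala
import com.typesafe.config.ConfigFactory

object ParseJobDefinition {
  def main(args: Array[String]): Unit = {
    // a trimmed-down job definition; the field names mirror the example pipeline below
    val hocon =
      """
        |stages = [
        |  {
        |    type = "DelimitedExtract"
        |    name = "extract data from green_tripdata/0"
        |    environments = ["production", "test"]
        |    inputURI = ${?ETL_CONF_BASE_URL}"/data/green_tripdata/0/*.csv"
        |    outputView = "green_tripdata0_raw"
        |  }
        |]
      """.stripMargin

    // resolve() substitutes ${...} references, falling back to environment
    // variables when the path is not defined in the config itself
    val config = ConfigFactory.parseString(hocon).resolve()

    val stage = config.getConfigList("stages").get(0)
    println(stage.getString("type"))       // DelimitedExtract
    println(stage.getString("outputView")) // green_tripdata0_raw
  }
}
```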

## Why SQL first?

SQL first (borrowed from the Mobile First UX principle) is an approach where transformations are expressed in Structured Query Language (SQL) wherever possible. SQL is a very good way of expressing standard data transformation intent in a [declarative](https://en.wikipedia.org/wiki/Declarative_programming) way, and it is so widely known and taught that finding people who understand the business context and can write basic SQL is much easier than finding, for example, a Scala developer who also understands the business context.

Currently the [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual) dialect of SQL is supported, as [Spark SQL](https://spark.apache.org/docs/latest/sql-programming-guide.html) uses the same dialect and provides many of the [functions](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$) expected from other SQL dialects. This could change in the future.
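
For instance, the aggregation step in the example pipeline below boils down to an ordinary Spark SQL statement. The following sketch is illustrative only; the column names and rows are made up rather than taken from the tutorial data:

```scala
import org.apache.spark.sql.SparkSession

object SqlFirstExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("sql-first-example")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // a tiny stand-in for a typed trips dataset
    Seq(
      ("VTS", 2016, 12.5),
      ("VTS", 2016, 7.0),
      ("CMT", 2015, 20.0)
    ).toDF("vendor_id", "year", "total_amount")
      .createOrReplaceTempView("trips")

    // the transformation intent is expressed declaratively in SQL rather than in Scala
    spark.sql(
      """SELECT vendor_id, COUNT(*) AS trip_count, SUM(total_amount) AS revenue
        |FROM trips
        |WHERE year = 2016
        |GROUP BY vendor_id""".stripMargin
    ).show()

    spark.stop()
  }
}
```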

## Example pipeline

This is an example of a fairly standard pipeline:

1. Load a set of CSV files from an input directory. The separator is a comma and the files include a header row.

2. Convert the data to the correct datatypes using metadata defined in a separate JSON.

3. Execute a SQL statement that performs custom validation to ensure the data typing in the previous step produced an acceptable conversion error rate (a sketch of this kind of check follows the configuration below).

4. Calculate some aggregates using a SQLTransform stage, substituting the `${year}` variable with the value `2016`.

5. Write out the aggregate result set to a Parquet target.

```hocon
{
  "stages": [
    {
      "type": "DelimitedExtract",
      "name": "extract data from green_tripdata/0",
      "environments": ["production", "test"],
      "inputURI": ${ETL_CONF_BASE_URL}"/data/green_tripdata/0/*.csv",
      "outputView": "green_tripdata0_raw",
      "persist": false,
      "delimiter": "Comma",
      "quote": "DoubleQuote",
      "header": true,
      "authentication": {},
      "params": {}
    },
    {
      "type": "TypingTransform",
      "name": "apply green_tripdata/0 data types",
      "environments": ["production", "test"],
      "inputURI": "/opt/tutorial/nyctaxi/meta/green_tripdata/0/green_tripdata.json",
      "inputView": "green_tripdata0_raw",
      "outputView": "green_tripdata0",
      "persist": true,
      "authentication": {},
      "params": {}
    },
    {
      "type": "SQLValidate",
      "name": "ensure no errors exist after data typing",
      "environments": ["production", "test"],
      "inputURI": "/opt/tutorial/nyctaxi/job/3/sqlvalidate_errors.sql",
      "sqlParams": {
        "table_name": "green_tripdata0"
      },
      "authentication": {},
      "params": {}
    },
    {
      "type": "SQLTransform",
      "name": "merge *tripdata to create a full trips",
      "environments": ["production", "test"],
      "inputURI": "/opt/tutorial/nyctaxi/job/3/trips.sql",
      "outputView": "trips",
      "persist": true,
      "authentication": {},
      "sqlParams": {
        "year": "2016"
      },
      "params": {}
    },
    {
      "type": "ParquetLoad",
      "name": "write trips back to filesystem",
      "environments": ["production", "test"],
      "inputView": "trips",
      "outputURI": ${ETL_CONF_BASE_URL}"/data/output/trips.parquet",
      "numPartitions": 100,
      "partitionBy": [
        "vendor_id"
      ],
      "authentication": {},
      "params": {}
    }
  ]
}
```
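
The validation query referenced in stage three (`sqlvalidate_errors.sql`) lives in the tutorial directory and is not shown in this diff. As a rough sketch of the idea only, and not of arc's actual SQLValidate contract, the check amounts to computing an error rate and failing the run when it exceeds a threshold. The `has_typing_error` column, the 1% threshold and the plain-Spark harness below are assumptions for illustration:

```scala
import org.apache.spark.sql.SparkSession

object ValidateTypingErrors {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("validate-typing-errors")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // stand-in for the typed output: a hypothetical boolean flag marks rows
    // that failed data-type conversion (one bad row in 200 here)
    val rows = (1L to 200L).map(id => (id, id == 13L))
    rows.toDF("id", "has_typing_error").createOrReplaceTempView("green_tripdata0")

    val errorRate = spark.sql(
      """SELECT SUM(CASE WHEN has_typing_error THEN 1 ELSE 0 END) / COUNT(*) AS error_rate
        |FROM green_tripdata0""".stripMargin
    ).first().getDouble(0)

    // fail the job if more than 1% of rows could not be typed (threshold is illustrative)
    require(errorRate <= 0.01, s"typing error rate $errorRate exceeds threshold")

    spark.stop()
  }
}
```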

## Contributing

If you have suggestions for additional components, or find issues that you believe need fixing, then please raise an issue. An issue with a test case is even more appreciated.

When you contribute code, you affirm that the contribution is your original work and that you license the work to the project under the project’s open source license. Whether or not you state this explicitly, by submitting any copyrighted material via pull request, email, or other means you agree to license the material under the project’s open source license and warrant that you have the legal authority to do so.

## Attribution

Thanks to the following projects:

- [Apache Spark](https://spark.apache.org/) for the underlying framework that has made this library possible.
- [slf4j-json-logger](https://github.com/savoirtech/slf4j-json-logger) Copyright (c) 2016 Savoir Technologies released under the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0). We have slightly altered their library to change the default logging format.
- [azure-sqldb-spark](https://github.com/Azure/azure-sqldb-spark) for their Microsoft SQL Server bulkload driver. Currently included in /lib but will be pulled from Maven once available.
- [nyc-taxi-data](https://github.com/toddwschneider/nyc-taxi-data) for preparing an easy-to-use set of real-world data for the tutorial.

## License

Arc is released under the [MIT License](https://opensource.org/licenses/MIT).

36 changes: 36 additions & 0 deletions build.sbt
@@ -0,0 +1,36 @@
import Dependencies._

lazy val root = (project in file(".")).
  enablePlugins(BuildInfoPlugin).
  settings(
    name := "arc",
    organization := "au.com.agl",
    scalaVersion := "2.11.8",
    scalastyleFailOnError := false,
    libraryDependencies ++= etlDeps,
    parallelExecution in Test := false,
    buildInfoKeys := Seq[BuildInfoKey](version, scalaVersion),
    buildInfoPackage := "au.com.agl.arc"
  )

test in assembly := {}

assemblyJarName in assembly := s"${name.value}.jar"
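// with the assembly name above and Scala 2.11 the jar is written to target/scala-2.11/arc.jar, which the Dockerfile copies into the image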

scalacOptions := Seq("-unchecked", "-deprecation")

// META-INF discarding
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case x => MergeStrategy.first
}

// allow netty for spark (old) and netty (less old) for tensorflow grpc calls to co-exist
assemblyShadeRules in assembly := Seq(
  ShadeRule.rename("com.google.protobuf.**" -> "shadeio.@1").inAll,
  ShadeRule.rename("com.google.guava.**" -> "shadeio.@1").inAll,
  ShadeRule.rename("com.google.common.**" -> "shadeio.@1").inAll,
  ShadeRule.rename("io.netty.**" -> "shadeio.@1").inAll
)
Binary file added lib/azure-sqldb-spark-1.0.0.jar
Binary file not shown.
16 changes: 16 additions & 0 deletions log4j.properties
@@ -0,0 +1,16 @@
# base logger
log4j.rootCategory=INFO,console

# console appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%m%n

# spark logging
log4j.logger.org=ERROR
log4j.logger.akka=ERROR
log4j.logger.spark=ERROR

# framework logging
log4j.logger.etl=INFO
