## Get the journey data.
Clones the tpcds journey repository to get access to all the data and scripts that are required to exercise this journey. Normally the data and queries are generated by running the data and query generation utility from the tpcds toolkit available at http://www.tpc.org/tpcds. However, for ease of use, the data and queries are pre-generated for the 1GB scale factor. We use the pre-generated data and queries to demonstrate how they can be used to run the tpcds queries against spark.


In [1]:
import sys.process._

// Start from a clean slate: remove any clone left over from an earlier run.
"rm -rf tpcds-journey".!
// Print the installed git version.
"git --version".!
// Fetch the journey repository; later cells read its src/ddl, src/data and src/queries folders.
"git clone --progress https://github.com/SparkTC/tpcds-journey.git".!

git version 1.8.3.1
Cloning into 'tpcds-journey'...
remote: Counting objects: 991, done.        
remote: Compressing objects: 100% (106/106), done.        % (102/106)           
remote: Total 991 (delta 87), reused 97 (delta 41), pack-reused 843        
Receiving objects: 100% (991/991), 362.47 MiB | 16.63 MiB/s, done.
Resolving deltas: 100% (396/396), done.
Checking out files: 100% (806/806), done.


## Setup variables.
* Sets up variables that are used in the rest of this notebook.
* The path variables are relative to the git clone directory.
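* tpcdsDatabaseName is hard-coded to "TPCDS2G". This can be changed if a different database name is desired; if you change it, also update the `spark-warehouse/tpcds2g.db` path that is hard-coded in `deleteFile1` and `deleteFile2`.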

In [2]:
/**
 * Empties the warehouse directory of the given table, leaving the directory itself in place.
 *
 * The earlier version ran an `rm -rf` command ending in a `*` wildcard through `sys.process`.
 * That API starts the program directly, without a shell, so the wildcard was never expanded
 * and nothing was removed. The directory contents are now deleted through the JVM file API.
 *
 * @param tableName name of the table whose directory under `spark-warehouse/tpcds2g.db` is emptied
 */
def deleteFile1(tableName: String): Unit = {
    import java.io.File
    import java.nio.file.Files

    // Depth-first delete; a symbolic link is removed itself, its target is never descended into.
    def removeTree(entry: File): Unit = {
        if (entry.isDirectory && !Files.isSymbolicLink(entry.toPath)) {
            Option(entry.listFiles).foreach(_.foreach(removeTree))
        }
        entry.delete()
        ()
    }

    val tableDir = new File(s"spark-warehouse/tpcds2g.db/${tableName}")
    // listFiles returns null when the directory is missing or unreadable: nothing to do then.
    Option(tableDir.listFiles).foreach(_.foreach(removeTree))
}

In [3]:
/**
 * Removes the warehouse directory of the given table together with everything inside it.
 *
 * The command is handed to `rm` as an explicit argument list. A command given as one string
 * is split on whitespace, so a table name containing a space would otherwise be turned into
 * several unrelated path arguments.
 *
 * @param tableName name of the table whose directory under `spark-warehouse/tpcds2g.db` is removed
 */
def deleteFile2(tableName: String): Unit = {
    import sys.process._
    val tableDir = s"spark-warehouse/tpcds2g.db/${tableName}"
    // The exit code is ignored, as before: the caller only needs the directory to be gone.
    Seq("rm", "-rf", tableDir).!
    ()
}

In [4]:
// Locations inside the cloned repository. tpcdsRootDir is the clone directory itself and the
// other three paths are built from it.
val tpcdsRootDir = "tpcds-journey"
val tpcdsDdlDir = s"${tpcdsRootDir}/src/ddl/individual"
val tpcdsGenDataDir = s"${tpcdsRootDir}/src/data"
val tpcdsQueriesDir = s"${tpcdsRootDir}/src/queries"
// Name of the database that createDatabase drops, re-creates and switches to.
val tpcdsDatabaseName = "TPCDS2G"
// Running total of elapsed whole seconds; runQuery adds to it after every statement it executes.
var totalTime: Long = 0
println("TPCDS root directory is at : "+ tpcdsRootDir)
println("TPCDS ddl scripts directory is at: " + tpcdsDdlDir)
println("TPCDS data directory is at: "+ tpcdsGenDataDir)
println("TPCDS queries directory is at: "+ tpcdsQueriesDir)

// val conf = sc.getConf
// println(conf.toDebugString)
// Session through which every SQL statement in this notebook is executed.
// Per the Spark configuration documentation, autoBroadcastJoinThreshold = -1 disables
// broadcast joins and spark.ui.enabled = false turns the web UI off.
// NOTE(review): spark.storage.memoryFraction looks like a legacy memory setting - confirm it
// still has an effect on the Spark version in use.
// NOTE(review): dynamic allocation is enabled together with a fixed spark.executor.instances;
// confirm this combination is intended.
// NOTE(review): the trailing-dot chaining is presumably there so that a line-oriented
// interpreter reads the whole chain as one expression - confirm before reformatting.
val journey_spark = SparkSession.
    builder().
    config("spark.ui.showConsoleProgress", false).
    config("spark.ui.enabled", false).
    config("spark.ui.retainedJobs", 1).
    config("spark.ui.retainedStages", 1).
    config("spark.ui.retainedTasks", 1).
    config("spark.sql.autoBroadcastJoinThreshold", -1).
    config("spark.sql.crossJoin.enabled", true).
    config("spark.storage.memoryFraction", "0.1").
    config("spark.dynamicAllocation.enabled", "true").
    config("spark.executor.instances", 4).
    config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").
    getOrCreate()

// Only messages at ERROR level are logged from here on.
journey_spark.sparkContext.setLogLevel("ERROR")

TPCDS root directory is at : tpcds-journey
TPCDS ddl scripts directory is at: tpcds-journey/src/ddl/individual
TPCDS data directory is at: tpcds-journey/src/data
TPCDS queries directory is at: tpcds-journey/src/queries


## Setup the TPC-DS schema
* Creates the database as specified by tpcdsDatabaseName
* Creates all the tpc-ds tables.
* Loads data into the tables in parquet format. 
  * Since the data generated by tpc-ds toolkit is in CSV format, we do the loading in multi steps.
  * As first step, we create tables in csv format by pointing the location to the generated data.
  * As second step, we create parquet tables by using CTAS that convert text data into parquet.
  * As last step, we drop the text tables as we no longer need them.

### Utility function definitions.
* Defines the utility functions that are called from the cells below in the notebook.

In [5]:
/**
 * Rebuilds the TPC-DS database from scratch: drops it with CASCADE if it already exists,
 * creates it again and makes it the current database of the session.
 */
def createDatabase(): Unit = {
    val statements = Seq(
        s"DROP DATABASE IF EXISTS ${tpcdsDatabaseName} CASCADE",
        s"CREATE DATABASE ${tpcdsDatabaseName}",
        s"USE ${tpcdsDatabaseName}"
    )
    // Order matters: drop, then create, then switch.
    statements.foreach(statement => journey_spark.sql(statement))
}

/**
 * Creates one TPC-DS table in Spark by executing the table's DDL script.
 *
 * Any existing table of the same name is dropped and its warehouse directory is removed
 * first. The script `<tpcdsDdlDir>/<tableName>.sql` is then read, flattened onto a single
 * line, pointed at the generated data directory and executed statement by statement.
 *
 * @param tableName name of the table; also the base name of its DDL script
 */
def createTable(tableName: String): Unit = {
  println(s"Creating table $tableName ..")
  journey_spark.sql(s"DROP TABLE IF EXISTS $tableName")
  deleteFile1(tableName)
  deleteFile2(tableName)
  // wholeTextFiles yields (path, content) pairs; only the content of the single script is used.
  val (_, content) =
    journey_spark.sparkContext.wholeTextFiles(s"${tpcdsDdlDir}/$tableName.sql").collect()(0)

  // NOTE(review): the original comment here asked for a replace to be removed "once it is
  // fixed in the github repo"; presumably it meant the "csv" rewrite below - confirm.
  val sqlStmts = content.stripLineEnd
    .replace('\n', ' ')
    .replace("${TPCDS_GENDATA_DIR}", tpcdsGenDataDir)
    // Every occurrence of "csv" in the script is rewritten to the data source class name.
    .replace("csv", "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
    .split(";")
    // Extra blank lines after the last ';' leave whitespace-only fragments; they are not statements.
    .filter(_.trim.nonEmpty)
  // The statements are run for their side effects only, hence foreach rather than map.
  sqlStmts.foreach(stmt => journey_spark.sql(stmt))
}

/**
 * Runs every statement of one TPC-DS query script and reports timing and row count.
 *
 * The script `<tpcdsQueriesDir>/queryNN.sql` (NN = the query number padded to two digits)
 * is read, its comment lines are dropped and the remaining text is split on ';'. Each
 * statement is executed and collected; the elapsed whole seconds are printed and added to
 * `totalTime`.
 *
 * @param queryNum number of the query to run
 */
def runQuery(queryNum: Int): Unit = {
  val queryStr = "%02d".format(queryNum)
  val queryName = s"${tpcdsQueriesDir}/query${queryStr}.sql"
  val (_, content) = journey_spark.sparkContext.wholeTextFiles(queryName).collect()(0)
  val queries = content.split("\n")
    // The lines are joined onto one line below, so an indented "--" comment left in place
    // would comment out the rest of the statement; trim before testing for the marker.
    .filterNot(_.trim.startsWith("--"))
    .mkString(" ").split(";")
    // Skip whitespace-only fragments (e.g. after the final ';') instead of sending them to Spark.
    .filter(_.trim.nonEmpty)
  println(s"Running TPC-DS Query $queryName")
  for (query <- queries) {
    val start = System.nanoTime()
    val result = journey_spark.sql(query).collect
    // Integer division: the elapsed time is truncated to whole seconds.
    val timeElapsed = (System.nanoTime() - start) / 1000000000
    totalTime = totalTime + timeElapsed
    // result.foreach(println)
    println(s"Time elapsed: $timeElapsed, Number of rows: ${result.length}")
  }
}

/** Placeholder with an empty body: calling it currently has no effect. */
def runAllQueries(): Unit = ()

/**
 * Applies `f` to every table name, in array order.
 *
 * If `f` throws, the exception message is printed and the same exception is rethrown, so
 * the remaining tables are not processed.
 *
 * @param tables table names to process
 * @param f      action to run for each table name
 */
def forEachTable(tables: Array[String], f: (String) => Unit): Unit = {
  // Prints the failure and passes it on unchanged to the caller.
  def reportAndRethrow(failure: Throwable): Nothing = {
    println("EXCEPTION!! " + failure.getMessage())
    throw failure
  }

  tables.foreach { table =>
    try f(table)
    catch { case e: Throwable => reportAndRethrow(e) }
  }
}

### Create the database and tables.
* Creates the tpc-ds database.
* For each table name in the TPC-DS schema, calls the function that creates the table in Spark.

In [6]:
// Names of the TPC-DS tables. Keep the order stable: the verification cell compares the
// row counts with its expected values by position.
val tables = Array(
  "call_center", "catalog_sales", "customer_demographics", "income_band",
  "promotion", "store", "time_dim", "web_returns",
  "catalog_page", "customer", "date_dim", "inventory",
  "reason", "store_returns", "warehouse", "web_sales",
  "catalog_returns", "customer_address", "household_demographics", "item",
  "ship_mode", "store_sales", "web_page", "web_site"
)

// Start from an empty database ...
createDatabase()

// ... then build the tables one after another.
forEachTable(tables, createTable(_))


Creating table call_center ..
Creating table catalog_sales ..
Creating table customer_demographics ..
Creating table income_band ..
Creating table promotion ..
Creating table store ..
Creating table time_dim ..
Creating table web_returns ..
Creating table catalog_page ..
Creating table customer ..
Creating table date_dim ..
Creating table inventory ..
Creating table reason ..
Creating table store_returns ..
Creating table warehouse ..
Creating table web_sales ..
Creating table catalog_returns ..
Creating table customer_address ..
Creating table household_demographics ..
Creating table item ..
Creating table ship_mode ..
Creating table store_sales ..
Creating table web_page ..
Creating table web_site ..


## Verify table creation and data loading.
* Run a simple Spark SQL query to get the count of rows
* Verify that the row counts are as expected

In [7]:
// Actual number of rows in every table, in the same order as `tables`.
val rowCounts = tables.map(name => journey_spark.table(name).count())

// Expected number of rows, aligned by position with `tables`.
val expectedCounts = Array(
    6, 1441548, 1920800, 20,
    300, 12, 86400, 71763,
    11718, 100000, 73049, 11745000,
    35, 287514, 5, 719384,
    144067, 50000, 7200, 18000,
    20, 2880404, 60, 30
)

// Report every table whose actual count differs from the expected one.
var errorCount = 0
val zippedCountsWithIndex = rowCounts.zip(expectedCounts).zipWithIndex
zippedCountsWithIndex.foreach { case ((actual, expected), index) =>
    if (actual != expected) {
        println(s"""ERROR!! Row counts for ${tables(index)} does not match.
        Expected=${expectedCounts(index)} but found ${rowCounts(index)}""")
        errorCount += 1
    }
}

println("=====================================================")
println(
  if (errorCount > 0) s"Load verification failed with $errorCount errors"
  else "Loaded and verified the table counts successfully"
)
println("=====================================================")

Loaded and verified the table counts successfully


## Run a query by its number
* Given a query number between 1 and 99, run it against spark.
* Display the time taken to execute the query and the number of rows returned (printing of the result rows is commented out in `runQuery`).
* To run a different query, please change the QUERY_NUM to a valid value from 1 to 99.

In [8]:
// Number of the single query to run; runQuery pads it to two digits to build the
// script name query<NN>.sql.
val QUERY_NUM = 32
runQuery(QUERY_NUM)

Running TPC-DS Query tpcds-journey/src/queries/query32.sql
Time elapsed: 11, Number of rows: 1


In [9]:
// Runs TPC-DS queries 1 through 33. A failing query is reported and counted, and the
// batch carries on with the next one.
var queryErrors = 0
for (i <- 1 to 33) {
  try {
    runQuery(i)
  } catch {
    // Deliberately broad: this is a best-effort batch, so any failure of one query must
    // not stop the remaining queries.
    case e: Throwable => {
      // A separator is needed here; without it the query number and "msg" run together.
      println(s"Error in query $i, msg = ${e.getMessage}")
      queryErrors += 1
    }
  }
}
println("=====================================================")
if (queryErrors > 0) {
  println(s"Query execution failed with $queryErrors errors")
} else {
  println("All TPC-DS queries ran successfully")
}
// totalTime is cumulative across every runQuery call made so far in this session.
println (s"Total Elapsed Time so far: ${totalTime} seconds.")
println("=====================================================")

Running TPC-DS Query tpcds-journey/src/queries/query01.sql
Time elapsed: 14, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query02.sql
Time elapsed: 6, Number of rows: 2513
Running TPC-DS Query tpcds-journey/src/queries/query03.sql
Time elapsed: 11, Number of rows: 89
Running TPC-DS Query tpcds-journey/src/queries/query04.sql
Time elapsed: 41, Number of rows: 8
Running TPC-DS Query tpcds-journey/src/queries/query05.sql
Time elapsed: 18, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query06.sql
Time elapsed: 26, Number of rows: 45
Running TPC-DS Query tpcds-journey/src/queries/query07.sql
Time elapsed: 8, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query08.sql
Time elapsed: 9, Number of rows: 5
Running TPC-DS Query tpcds-journey/src/queries/query09.sql
Time elapsed: 2, Number of rows: 1
Running TPC-DS Query tpcds-journey/src/queries/query10.sql
Time elapsed: 10, Number of rows: 5
Running TPC-DS Query tpcds-journey/src/quer

In [10]:
// Runs TPC-DS queries 34 through 66. A failing query is reported and counted, and the
// batch carries on with the next one.
var queryErrors = 0
for (i <- 34 to 66) {
  try {
    runQuery(i)
  } catch {
    // Deliberately broad: this is a best-effort batch, so any failure of one query must
    // not stop the remaining queries.
    case e: Throwable => {
      // Include the failure reason; the query number alone does not say what went wrong.
      println(s"Error in query $i, msg = ${e.getMessage}")
      queryErrors += 1
    }
  }
}
println("=====================================================")
if (queryErrors > 0) {
  println(s"Query execution failed with $queryErrors errors")
} else {
  println("All TPC-DS queries ran successfully")
}
// totalTime is cumulative across every runQuery call made so far in this session.
println (s"Total Elapsed Time so far: ${totalTime} seconds.")
println("=====================================================")

Running TPC-DS Query tpcds-journey/src/queries/query34.sql
Time elapsed: 7, Number of rows: 451
Running TPC-DS Query tpcds-journey/src/queries/query35.sql
Time elapsed: 15, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query36.sql
Time elapsed: 7, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query37.sql
Time elapsed: 2, Number of rows: 1
Running TPC-DS Query tpcds-journey/src/queries/query38.sql
Time elapsed: 17, Number of rows: 1
Running TPC-DS Query tpcds-journey/src/queries/query39.sql
Time elapsed: 25, Number of rows: 246
Time elapsed: 31, Number of rows: 17
Running TPC-DS Query tpcds-journey/src/queries/query40.sql
Time elapsed: 7, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query41.sql
Time elapsed: 1, Number of rows: 4
Running TPC-DS Query tpcds-journey/src/queries/query42.sql
Time elapsed: 3, Number of rows: 10
Running TPC-DS Query tpcds-journey/src/queries/query43.sql
Time elapsed: 3, Number of rows: 6
Running T

In [11]:
// Runs TPC-DS queries 67 through 99. A failing query is reported and counted, and the
// batch carries on with the next one.
var queryErrors = 0
for (i <- 67 to 99) {
  try {
    runQuery(i)
  } catch {
    // Deliberately broad: this is a best-effort batch, so any failure of one query must
    // not stop the remaining queries.
    case e: Throwable => {
      // Include the failure reason; the query number alone does not say what went wrong.
      println(s"Error in query $i, msg = ${e.getMessage}")
      queryErrors += 1
    }
  }
}
println("=====================================================")
if (queryErrors > 0) {
  println(s"Query execution failed with $queryErrors errors")
} else {
  println("All TPC-DS queries ran successfully")
}
// totalTime is cumulative across every runQuery call made so far in this session.
println (s"Total Elapsed Time so far: ${totalTime} seconds.")
println("=====================================================")

Running TPC-DS Query tpcds-journey/src/queries/query67.sql
Time elapsed: 11, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query68.sql
Time elapsed: 7, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query69.sql
Time elapsed: 12, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query70.sql
Time elapsed: 6, Number of rows: 3
Running TPC-DS Query tpcds-journey/src/queries/query71.sql
Time elapsed: 9, Number of rows: 1018
Running TPC-DS Query tpcds-journey/src/queries/query72.sql
Time elapsed: 101, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query73.sql
Time elapsed: 5, Number of rows: 5
Running TPC-DS Query tpcds-journey/src/queries/query74.sql
Time elapsed: 26, Number of rows: 92
Running TPC-DS Query tpcds-journey/src/queries/query75.sql
Time elapsed: 52, Number of rows: 100
Running TPC-DS Query tpcds-journey/src/queries/query76.sql
Time elapsed: 9, Number of rows: 100
Running TPC-DS Query tpcds-journey/src