# Want to see my Transmogrif-AI-er

![](transmogrify.png)

# Summary

# Check Scala Version

In [1]:
// Report the Scala version of the notebook kernel; the artifacts added below use the _2.11 suffix
scala.util.Properties.versionString

version 2.11.12

# Get TransmogrifAI

In [2]:
%classpath add mvn com.salesforce.transmogrifai transmogrifai-core_2.11 0.5.0

# Get Spark

In [3]:
%classpath add mvn org.apache.spark spark-mllib_2.11 2.3.0

# Get Started

In [6]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.classification._
import com.salesforce.op.evaluators.Evaluators

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.classification._
import com.salesforce.op.evaluators.Evaluators


In [7]:
// Spark configuration: local master using all available cores ("local[*]"), app name "automl-app".
val conf = new SparkConf().setMaster("local[*]").setAppName("automl-app")

// Build (or reuse) the session first. getOrCreate() makes this cell safe to re-run, whereas
// constructing a second SparkContext by hand fails while one is already running.
// Declared implicit so later calls that take an implicit SparkSession can pick it up.
implicit val spark: SparkSession = SparkSession.builder.config(conf).getOrCreate()

// Derive the legacy handles from the session instead of constructing them separately;
// later cells still read through `sqlContext`.
val sc: SparkContext = spark.sparkContext
val sqlContext: org.apache.spark.sql.SQLContext = spark.sqlContext

import spark.implicits._

org.apache.spark.sql.SparkSession$implicits$@48ee338

# Get Data

In [9]:
// Load train.csv: the first row is the header and column types are inferred by Spark.
val rawData = sqlContext.read
  .format("csv")
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .load("train.csv")
// Print the inferred schema to check the column types.
rawData.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



null

In [10]:
// Show the column names as one comma-separated line, followed by the first five rows.
println(rawData.columns.mkString(","))
for (row <- rawData.take(5)) println(row)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
[1,0,3,Braund, Mr. Owen Harris,male,22.0,1,0,A/5 21171,7.25,null,S]
[2,1,1,Cumings, Mrs. John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C]
[3,1,3,Heikkinen, Miss. Laina,female,26.0,0,0,STON/O2. 3101282,7.925,null,S]
[4,1,1,Futrelle, Mrs. Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S]
[5,0,3,Allen, Mr. William Henry,male,35.0,0,0,373450,8.05,null,S]


In [16]:
// Cast all non-double numeric columns to double and rename passengerId to id;
// the string columns and the already-double age/fare columns pass through unchanged.
rawData.createOrReplaceTempView("raw")
val passengerData = spark.sql("""
    select 
      cast(passengerId as double) as id, 
      cast(survived as double) as survived, 
      cast(pclass as double) as pclass,
      name, sex, age, 
      cast(sibsp as double) as sibsp, 
      cast(parch as double) as parch, 
      ticket, fare, cabin, embarked 
      from raw
""")

[id: double, survived: double ... 10 more fields]

In [17]:
// 70/30 train/test split. The fixed seed (the same value used for cross-validation in the
// model selection cell) keeps the split, and therefore the reported metrics, repeatable.
val Array(train, test) = passengerData.randomSplit(Array(0.7, 0.3), seed = 142L)

[id: double, survived: double ... 10 more fields]

# Declare Target and Features

In [12]:
// Split the DataFrame columns into the response feature (`survived`, typed as RealNN)
// and the remaining predictor features.
val (target, features) =
  FeatureBuilder.fromDataFrame[RealNN](passengerData, response = "survived")
// Keep the notebook from rendering this cell's result.
OutputCell.HIDDEN

# Transmogrify (Feature Engineering)

In [13]:
// Turn the predictor features into one combined feature vector (an OPVector, per the cell output)
val featureVector = features.transmogrify()

Feature(name = age-cabin-embarked-fare-id-name-parch-pclass-sex-sibsp-ticket_3-stagesApplied_OPVector_00000000000f, uid = OPVector_00000000000f, isResponse = false, originStage = VectorsCombiner_00000000000f, parents = [OPVector_00000000000d,OPVector_00000000000e], distributions = [])

# Sanity Check (Feature Refinement)

In [14]:
// Run the sanity checker on the feature vector against the target.
// NOTE(review): removeBadFeatures = true — by its name, flagged features are dropped rather than
// only reported; confirm against the TransmogrifAI SanityChecker docs.
val checkedFeatures = target.sanityCheck(featureVector, removeBadFeatures = true)

Feature(name = age-cabin-embarked-fare-id-name-parch-pclass-sex-sibsp-survived-ticket_4-stagesApplied_OPVector_000000000010, uid = OPVector_000000000010, isResponse = false, originStage = SanityChecker_000000000010, parents = [RealNN_000000000002,OPVector_00000000000f], distributions = [])

# Model Selection

In [15]:
// Binary-classification model selector using cross-validation with a fixed seed; it takes the
// target and the sanity-checked feature vector and exposes its output feature as "prediction".
val prediction = BinaryClassificationModelSelector
  .withCrossValidation(seed = 142L)
  .setInput(target, checkedFeatures)
  .setOutputFeatureName("prediction")
  .getOutput()

Feature(name = prediction, uid = Prediction_00000000001c, isResponse = true, originStage = ModelSelector_00000000001c, parents = [RealNN_000000000002,OPVector_000000000010], distributions = [])

# Setting up a TransmogrifAI Workflow

In [18]:
// Workflow that reads the training split and produces the `prediction` feature as its result.
val workflow = new OpWorkflow()
  .setInputDataset(train)
  .setResultFeatures(prediction)

com.salesforce.op.OpWorkflow@2b40166f

# Train a Workflow

In [19]:
// Fit the workflow on its input dataset (the training split set when the workflow was built)
val fittedWorkflow = workflow.train()

com.salesforce.op.OpWorkflowModel@68be22d9

In [20]:
// Print the human-readable summary of the fitted workflow under a heading line.
println(s"Model summary:\n${fittedWorkflow.summaryPretty()}")

Model summary:
Evaluated OpLogisticRegression, OpGBTClassifier, OpRandomForestClassifier, OpLinearSVC models using Cross Validation and area under precision-recall metric.
Evaluated 8 OpLogisticRegression models with area under precision-recall metric between [0.7086101706244603, 0.7942363464210933].
Evaluated 18 OpGBTClassifier models with area under precision-recall metric between [0.6903514842064713, 0.7879093921853755].
Evaluated 18 OpRandomForestClassifier models with area under precision-recall metric between [0.5381118184952709, 0.7743723627023575].
Evaluated 4 OpLinearSVC models with area under precision-recall metric between [0.6980978665209828, 0.7243148146829335].
+--------------------------------------------------------+
|         Selected Model - OpLogisticRegression          |
+--------------------------------------------------------+
| Model Param      | Value                               |
+------------------+-------------------------------------+
| aggregationDepth | 

# Evaluate on Test Data

In [37]:
// Binary-classification evaluator that compares the `target` label with the `prediction` feature.
val evaluator = Evaluators.BinaryClassification().setLabelCol(target).setPredictionCol(prediction)

OpBinaryClassificationEvaluator_000000000046

In [38]:
// Point the fitted model at the held-out test split before scoring
fittedWorkflow.setInputDataset(test)

com.salesforce.op.OpWorkflowModel@68be22d9

In [39]:
// Score the model's current input dataset and evaluate it with the evaluator defined above;
// yields the scored data together with the evaluation metrics.
val (scoredTestData, metrics) = fittedWorkflow.scoreAndEvaluate(evaluator = evaluator)
// Keep the notebook from rendering this cell's result.
OutputCell.HIDDEN

In [40]:
// Show the evaluation metrics as a map
metrics.toMap

In [41]:
// NOTE(review): 68 and 146 are hard-coded counts — presumably the true positives and true
// negatives read off the metrics, which would make this the accuracy on the test split.
// They belong to one particular random split and go stale on a re-run;
// TODO derive them from `metrics` instead of typing them in.
(68 + 146)/test.count.toDouble

0.8075471698113208

# Kaggle Test Data

In [42]:
// Score the Kaggle test file with the fitted workflow and write the result to myprediction.csv.
import java.io.{File, PrintWriter}

// Load test.csv with a header row and inferred column types.
val rawTestData = sqlContext.read.format("csv").option("header", "true").option("inferSchema", "true").load("test.csv")
rawTestData.createOrReplaceTempView("rawTest")

// Same projection as the training query, except that `survived` is filled with a constant 1.0
// (presumably a placeholder because the Kaggle test file carries no label — confirm).
val passengerTestData = spark.sql("""
    select 
      cast(passengerId as double) as id, 
      cast(1 as double) as survived, 
      cast(pclass as double) as pclass,
      name, sex, age, 
      cast(sibsp as double) as sibsp, 
      cast(parch as double) as parch, 
      ticket, fare, cabin, embarked 
      from rawTest
""")

fittedWorkflow.setInputDataset(passengerTestData)

// Compute everything up to the prediction feature and keep only the id and prediction columns.
val output = fittedWorkflow.computeDataUpTo(prediction).select("id", "prediction")

// Collect (passenger id, 0/1 label) pairs on the driver. The prediction column is read as a
// Map[String, Double]; the label is 1 when its "probability_1" entry exceeds 0.5.
val local = output.rdd
  .map(row => (row.getDouble(0), row.getAs[Map[String, Double]](1)))
  .collect
  .map { case (id, scores) => (id.toInt, if (scores("probability_1") > 0.5) 1 else 0) }

// Write a header line followed by one "PassengerId,Survived" line per collected pair.
val myFile = new File("myprediction.csv")
val pw = new PrintWriter(myFile)
try {
  pw.write("PassengerId,Survived\n")
  for ((passengerId, survived) <- local) {
    pw.write(s"$passengerId,$survived\n")
  }
} finally {
  // Always release the file handle, even if a write fails part-way through.
  pw.close()
}

null