Logistic Regression Example with MLlib and Spark ML
====

Start up Spark
-------------

In [None]:
import glob
import os
import sys

# Honour an already-exported SPARK_HOME; fall back to the conventional
# install location used by this notebook otherwise.
spark_home = os.environ.get('SPARK_HOME', '/opt/spark')
os.environ['SPARK_HOME'] = spark_home

# Make the pyspark package importable from this kernel.
spark_python = os.path.join(spark_home, 'python')
sys.path.insert(0, spark_python)

# The bundled py4j archive is versioned and the version changes between
# Spark releases, so discover it instead of hard-coding one file name.
for py4j_zip in glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip')):
    sys.path.insert(0, py4j_zip)

# Run pyspark's interactive bootstrap in this namespace; later cells use the
# `sc`, `sqlContext` and `StorageLevel` names it leaves behind.
# exec(compile(...)) works on both Python 2 and 3, unlike execfile().
shell_path = os.path.join(spark_python, 'pyspark', 'shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Spark MLlib
----------


In [None]:
# Read the raw text file through the SparkContext; each line becomes one
# record. Echoing `rdd` shows the handle without triggering a job.
rdd = sc.textFile("data/logreg.txt")
rdd

In [None]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    """Convert one text line into a LabeledPoint.

    The first whitespace-separated number is used as the label and the
    remaining numbers become the dense feature vector.

    Raises ValueError if a token is not a valid float.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so stray spaces no longer produce empty
    # tokens that would make float('') fail.
    values = [float(token) for token in line.split()]
    return LabeledPoint(values[0], DenseVector(values[1:]))

# Lazily parse every line into a LabeledPoint.
points = rdd.map(parsePoint)
# Preview the first ten points. take(10) fetches only what is needed,
# whereas collect() would ship the entire RDD to the driver first.
points.take(10)

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithSGD

# Fit a logistic regression with SGD, running 100 iterations over `points`.
model = LogisticRegressionWithSGD.train(points, iterations=100)

Let's predict:

In [None]:
# Score a single two-feature example with the freshly trained model.
model.predict([0.6,0.6])

In [None]:
# Change the model's decision threshold to 0.8, then score the same example
# again to see whether the predicted class changes.
model.setThreshold(0.8)
model.predict([0.6,0.6])

In [None]:
# Score an example whose first feature is far larger than in the earlier calls.
model.predict([100,0.6])

Spark ML
-------

Read the training and test data. The test data can be labeled as well, because the label is derived from the `arrdelay` field, which is present in both datasets.

In [None]:
# Load both Parquet datasets through the same reader call.
training, test = (
    sqlContext.read.parquet(parquet_path)
    for parquet_path in ("data/training.parquet", "data/test.parquet")
)

# Print the column names and types of the test set.
test.printSchema()

In [None]:
# Peek at the first row of the test DataFrame.
test.first()

In [None]:
# Ask Spark to keep both DataFrames in memory, since they are reused by the
# cells that follow.
memory_only = StorageLevel.MEMORY_ONLY
training.persist(memory_only)
test.persist(memory_only)

Generate label column for the training data

In [None]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

def _delay_to_label(delay):
    """Return 1.0 when the arrival delay is positive, otherwise 0.0.

    A missing delay (None) is labelled 0.0 explicitly. Comparing None with
    an int raises TypeError on Python 3 and only works by accident of
    implicit ordering on Python 2.
    """
    return 1.0 if delay is not None and delay > 0 else 0.0


# Column-level UDF producing the double-typed binary label.
is_late = udf(_delay_to_label, DoubleType())
# Derive the label column from the arrival delay.
training = training.withColumn("is_late", is_late(training.arrdelay))


Create and fit Spark ML model

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Build the feature vector from every column except the arrival delay and
# its derivative, the is_late label; including either would leak the answer.
excluded_cols = ("is_late", "arrdelay")
feature_cols = [name for name in training.columns if name not in excluded_cols]
feature_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Logistic regression reading the derived label and writing "prediction".
reg = LogisticRegression(
    maxIter=100,
    labelCol="is_late",
    predictionCol="prediction")

# Chain assembling and regression, then fit both on the training data.
pipeline = Pipeline(stages=[feature_assembler, reg])
model = pipeline.fit(training)


Predict whether the aircraft will be late

In [None]:
# Apply the fitted pipeline model to the test DataFrame and keep the result.
predicted = model.transform(test)

Check model performance

In [None]:
# Attach the true label, derived from the actual arrival delay.
late_flag = is_late(predicted.arrdelay)
predicted = predicted.withColumn("is_late", late_flag)

# Cross-tabulate true label against predicted label and print the table.
confusion = predicted.crosstab("is_late", "prediction")
confusion.show()