In [1]:
# spark 
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# pipeline 
from pyspark.ml import Pipeline

# Indexer, vector, onhotencoder...
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, 
                                OneHotEncoder, StringIndexer)

In [2]:
# config 
conf = SparkConf().setAppName("building a PIPELINE MODEL")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

In [3]:
# LOAD THE CSV DATA 

# data = sc.textFile('titanic_train.csv').map(lambda line: line.split(","))
# headers = data.first()
# traindata = data.filter(lambda row: row != headers)
# sqlContext = SQLContext(sc)
# df = sqlContext.createDataFrame(traindata, [ 'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
#        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'])

df = spark.read.csv("titanic_train.csv", header = True)
df = df.withColumn("Pclass", df["Pclass"].cast("float"))
df = df.withColumn("Age", df["Age"].cast("float"))
df = df.withColumn("Parch", df["Parch"].cast("float"))
df = df.withColumn("Survived", df["Survived"].cast("integer"))


In [4]:
df.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: float (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: float (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: float (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|   3.0|Braund, Mr. Owen ...|  male|22.0|    1|  0.0|       A/5 21171|   7.25| null|       S|
|          2|       1|   1.0|Cumings, Mrs. Joh...|female|38.0|    1|  0.0|        PC 17599|71.2833|  C85|       C|
|          3|       1|   3.0|Heikkinen, Miss. ...|female|26.0|    0|  0.0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|   1.0|Futrelle, Mrs. Ja...|female|35.0|    1|  0.0|          113803|   53.1| C123|       S|
|          5|       0|   3.0|Allen, Mr. Willia...|  male|35.0|    0|  0.0|          373450|   8.05| null|       S|
|          6|       0|   3.0|    Moran, Mr. James|  male|null|    0|  0.0|      

In [6]:
# select working columns 
my_cols = df.select(['Survived', 'Pclass', 'Sex', 'Age','Parch'])

In [7]:
my_cols.show()

+--------+------+------+----+-----+
|Survived|Pclass|   Sex| Age|Parch|
+--------+------+------+----+-----+
|       0|   3.0|  male|22.0|  0.0|
|       1|   1.0|female|38.0|  0.0|
|       1|   3.0|female|26.0|  0.0|
|       1|   1.0|female|35.0|  0.0|
|       0|   3.0|  male|35.0|  0.0|
|       0|   3.0|  male|null|  0.0|
|       0|   1.0|  male|54.0|  0.0|
|       0|   3.0|  male| 2.0|  1.0|
|       1|   3.0|female|27.0|  2.0|
|       1|   2.0|female|14.0|  0.0|
|       1|   3.0|female| 4.0|  1.0|
|       1|   1.0|female|58.0|  0.0|
|       0|   3.0|  male|20.0|  0.0|
|       0|   3.0|  male|39.0|  5.0|
|       0|   3.0|female|14.0|  0.0|
|       1|   2.0|female|55.0|  0.0|
|       0|   3.0|  male| 2.0|  1.0|
|       1|   2.0|  male|null|  0.0|
|       0|   3.0|female|31.0|  0.0|
|       1|   3.0|female|null|  0.0|
+--------+------+------+----+-----+
only showing top 20 rows



In [8]:
# drop null data
my_final_data = my_cols.na.drop()

In [9]:
my_final_data.show()

+--------+------+------+----+-----+
|Survived|Pclass|   Sex| Age|Parch|
+--------+------+------+----+-----+
|       0|   3.0|  male|22.0|  0.0|
|       1|   1.0|female|38.0|  0.0|
|       1|   3.0|female|26.0|  0.0|
|       1|   1.0|female|35.0|  0.0|
|       0|   3.0|  male|35.0|  0.0|
|       0|   1.0|  male|54.0|  0.0|
|       0|   3.0|  male| 2.0|  1.0|
|       1|   3.0|female|27.0|  2.0|
|       1|   2.0|female|14.0|  0.0|
|       1|   3.0|female| 4.0|  1.0|
|       1|   1.0|female|58.0|  0.0|
|       0|   3.0|  male|20.0|  0.0|
|       0|   3.0|  male|39.0|  5.0|
|       0|   3.0|female|14.0|  0.0|
|       1|   2.0|female|55.0|  0.0|
|       0|   3.0|  male| 2.0|  1.0|
|       0|   3.0|female|31.0|  0.0|
|       0|   2.0|  male|35.0|  0.0|
|       1|   2.0|  male|34.0|  0.0|
|       1|   3.0|female|15.0|  0.0|
+--------+------+------+----+-----+
only showing top 20 rows



In [10]:
#  index columns : Sex

gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')

# one hot encoding 

# e.g.
# A B C 
# index 
# 0 1 2 
# ONE HOT ENCODE 
# [1,0,0] [0,1,0] [0,0,1]

gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [11]:
# index columns  :  embarked

embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

In [12]:
# Vectorize features 
# transform needed columns to "feature vector" ( as spark milb model loading data style)

assembler = VectorAssembler(inputCols= ['Pclass', 'SexVec', 'Age','Parch'],
                            outputCol= 'features')

In [13]:
assembler

VectorAssembler_452fa8fa46b31330c919

In [14]:
# set up ml model 

log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [15]:
##########  pipeline : stage for different steps ##########
# step : 
# indexing --> encode -> train 

pipeline = Pipeline(stages=[gender_indexer, #embark_indexer,
                            gender_encoder, #embark_encoder,
                            assembler, log_reg_titanic])

In [16]:
# split train, test data
train_data, test_data = my_final_data.randomSplit([0.7, 0.3])

In [17]:
# train the model
fit_model = pipeline.fit(train_data)

In [18]:
train_data.show()

+--------+------+------+----+-----+
|Survived|Pclass|   Sex| Age|Parch|
+--------+------+------+----+-----+
|       0|   1.0|female|25.0|  2.0|
|       0|   1.0|female|50.0|  0.0|
|       0|   1.0|  male|18.0|  0.0|
|       0|   1.0|  male|19.0|  0.0|
|       0|   1.0|  male|19.0|  2.0|
|       0|   1.0|  male|21.0|  1.0|
|       0|   1.0|  male|22.0|  0.0|
|       0|   1.0|  male|24.0|  1.0|
|       0|   1.0|  male|28.0|  0.0|
|       0|   1.0|  male|29.0|  0.0|
|       0|   1.0|  male|30.0|  0.0|
|       0|   1.0|  male|31.0|  0.0|
|       0|   1.0|  male|33.0|  0.0|
|       0|   1.0|  male|36.0|  0.0|
|       0|   1.0|  male|38.0|  0.0|
|       0|   1.0|  male|39.0|  0.0|
|       0|   1.0|  male|40.0|  0.0|
|       0|   1.0|  male|40.0|  0.0|
|       0|   1.0|  male|42.0|  0.0|
|       0|   1.0|  male|45.0|  0.0|
+--------+------+------+----+-----+
only showing top 20 rows



In [19]:
# do the prediction 

results = fit_model.transform(test_data)

In [20]:
results.show()

+--------+------+------+----+-----+--------+-------------+------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|Parch|SexIndex|       SexVec|          features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+--------+-------------+------------------+--------------------+--------------------+----------+
|       0|   1.0|female| 2.0|  2.0|     1.0|    (1,[],[])| [1.0,0.0,2.0,2.0]|[-3.3980634829368...|[0.03235604048097...|       1.0|
|       0|   1.0|  male|24.0|  0.0|     0.0|(1,[0],[1.0])|[1.0,1.0,24.0,0.0]|[-0.4804758908962...|[0.38213975682422...|       1.0|
|       0|   1.0|  male|27.0|  2.0|     0.0|(1,[0],[1.0])|[1.0,1.0,27.0,2.0]|[-0.0697092202984...|[0.48257974864920...|       1.0|
|       0|   1.0|  male|28.0|  0.0|     0.0|(1,[0],[1.0])|[1.0,1.0,28.0,0.0]|[-0.3430649231130...|[0.41506516324019...|       1.0|
|       0|   1.0|  male|29.0|  0.0|     0.0|(1,[0],[1.0])|[1.0,1.0,29.0,0.0]|[-0.30

In [21]:
# results.show()
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [22]:
# by default, the "transform" data is with name "prediction"
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [23]:
# evaluation 

AUC = my_eval.evaluate(results)
AUC

0.8003875968992249

In [24]:
# end of 12.42
# next : 13 