In [22]:
# In this learning process, I am trying to build the following model:
# Model: To Predict survival on the Titanic. Ref: https://www.kaggle.com/c/titanic/overview

In [35]:
# Bootstrap step that runs before any `pyspark` import in the cells below.
# findspark.init() is expected to locate the local Spark installation and make
# `pyspark` importable from this kernel.
import findspark
findspark.init()

In [36]:
# Start (or reuse) the Spark session that the later cells use to load data into DataFrames.

from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName('Titanic Data')
spark = session_builder.getOrCreate()

spark # notebook echo of the session object

In [37]:
# Read the training data into a DataFrame.
# The first CSV line is used as the header; no schema is supplied, so every
# column is loaded as a string (see the structure echoed below).
csv_reader = spark.read.format("csv").option('header', 'true')
df = csv_reader.load("./data/train.csv")

df # prints the DF structure

DataFrame[PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]

In [38]:
df.show() # prints the first rows of the raw DF as a text table
df.count() # number of records in the training data (shown as the cell result)


+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

891

In [39]:
# Prepare the working dataset: keep only the columns used for modeling and
# cast the numeric ones from string to float. A target type of None means
# "keep the column as it is" (Sex and Embarked stay strings for now).

from pyspark.sql.functions import col

column_casts = [
    ('Survived', 'float'),
    ('Pclass', 'float'),
    ('Sex', None),
    ('Age', 'float'),
    ('Fare', 'float'),
    ('Embarked', None),
]
projection = [
    col(name) if target_type is None else col(name).cast(target_type)
    for name, target_type in column_casts
]
dataset = df.select(projection)
dataset.show()

+--------+------+------+----+-------+--------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|
+--------+------+------+----+-------+--------+
|     0.0|   3.0|  male|22.0|   7.25|       S|
|     1.0|   1.0|female|38.0|71.2833|       C|
|     1.0|   3.0|female|26.0|  7.925|       S|
|     1.0|   1.0|female|35.0|   53.1|       S|
|     0.0|   3.0|  male|35.0|   8.05|       S|
|     0.0|   3.0|  male|null| 8.4583|       Q|
|     0.0|   1.0|  male|54.0|51.8625|       S|
|     0.0|   3.0|  male| 2.0| 21.075|       S|
|     1.0|   3.0|female|27.0|11.1333|       S|
|     1.0|   2.0|female|14.0|30.0708|       C|
|     1.0|   3.0|female| 4.0|   16.7|       S|
|     1.0|   1.0|female|58.0|  26.55|       S|
|     0.0|   3.0|  male|20.0|   8.05|       S|
|     0.0|   3.0|  male|39.0| 31.275|       S|
|     0.0|   3.0|female|14.0| 7.8542|       S|
|     1.0|   2.0|female|55.0|   16.0|       S|
|     0.0|   3.0|  male| 2.0| 29.125|       Q|
|     1.0|   2.0|  male|null|   13.0|       S|
|     0.0|   

In [40]:
from pyspark.sql.functions import isnull, when, count, col

# One aggregate per column: count() ignores nulls, and when() without an
# otherwise() yields null for non-matching rows, so each expression counts
# exactly the rows where that column is null.
null_counts = [
    count(when(isnull(name), name)).alias(name)
    for name in dataset.columns
]
dataset.select(null_counts).show() # todo: learn to query

+--------+------+---+---+----+--------+
|Survived|Pclass|Sex|Age|Fare|Embarked|
+--------+------+---+---+----+--------+
|       0|     0|  0|177|   0|       2|
+--------+------+---+---+----+--------+



In [61]:
# Eliminate incomplete rows: first turn any '?' placeholder into null,
# then drop every row that has a null in at least one column.
with_placeholders_nulled = dataset.replace('?', None)
dataset = with_placeholders_nulled.dropna(how='any') # todo: learn the Dataset API
dataset.show()
dataset.count()

+--------+------+----+-------+------+-------+
|Survived|Pclass| Age|   Fare|Gender|Boarded|
+--------+------+----+-------+------+-------+
|     0.0|   3.0|22.0|   7.25|   0.0|    0.0|
|     1.0|   1.0|38.0|71.2833|   1.0|    1.0|
|     1.0|   3.0|26.0|  7.925|   1.0|    0.0|
|     1.0|   1.0|35.0|   53.1|   1.0|    0.0|
|     0.0|   3.0|35.0|   8.05|   0.0|    0.0|
|     0.0|   1.0|54.0|51.8625|   0.0|    0.0|
|     0.0|   3.0| 2.0| 21.075|   0.0|    0.0|
|     1.0|   3.0|27.0|11.1333|   1.0|    0.0|
|     1.0|   2.0|14.0|30.0708|   1.0|    1.0|
|     1.0|   3.0| 4.0|   16.7|   1.0|    0.0|
|     1.0|   1.0|58.0|  26.55|   1.0|    0.0|
|     0.0|   3.0|20.0|   8.05|   0.0|    0.0|
|     0.0|   3.0|39.0| 31.275|   0.0|    0.0|
|     0.0|   3.0|14.0| 7.8542|   1.0|    0.0|
|     1.0|   2.0|55.0|   16.0|   1.0|    0.0|
|     0.0|   3.0| 2.0| 29.125|   0.0|    2.0|
|     0.0|   3.0|31.0|   18.0|   1.0|    0.0|
|     0.0|   2.0|35.0|   26.0|   0.0|    0.0|
|     1.0|   2.0|34.0|   13.0|   0

712

In [42]:
# Spark ML works on numeric data, but the Sex and Embarked columns are strings
# that we still want to use as features. Encode each one into a numeric index
# column: Sex -> Gender; Embarked -> Boarded.

from pyspark.ml.feature import StringIndexer # todo: learn Spark ML API

# Applied in order; each indexer is fitted on, and then transforms, the
# DataFrame produced by the previous step.
for source_col, encoded_col in (('Sex', 'Gender'), ('Embarked', 'Boarded')):
    indexer = StringIndexer(
        inputCol=source_col,
        outputCol=encoded_col,
        handleInvalid='keep')
    dataset = indexer.fit(dataset).transform(dataset)

dataset.show()


+--------+------+------+----+-------+--------+------+-------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|Gender|Boarded|
+--------+------+------+----+-------+--------+------+-------+
|     0.0|   3.0|  male|22.0|   7.25|       S|   0.0|    0.0|
|     1.0|   1.0|female|38.0|71.2833|       C|   1.0|    1.0|
|     1.0|   3.0|female|26.0|  7.925|       S|   1.0|    0.0|
|     1.0|   1.0|female|35.0|   53.1|       S|   1.0|    0.0|
|     0.0|   3.0|  male|35.0|   8.05|       S|   0.0|    0.0|
|     0.0|   1.0|  male|54.0|51.8625|       S|   0.0|    0.0|
|     0.0|   3.0|  male| 2.0| 21.075|       S|   0.0|    0.0|
|     1.0|   3.0|female|27.0|11.1333|       S|   1.0|    0.0|
|     1.0|   2.0|female|14.0|30.0708|       C|   1.0|    1.0|
|     1.0|   3.0|female| 4.0|   16.7|       S|   1.0|    0.0|
|     1.0|   1.0|female|58.0|  26.55|       S|   1.0|    0.0|
|     0.0|   3.0|  male|20.0|   8.05|       S|   0.0|    0.0|
|     0.0|   3.0|  male|39.0| 31.275|       S|   0.0|    0.0|
|     0.

In [43]:
# Check the column types after encoding: the new Gender/Boarded index columns are
# doubles, and the original Sex/Embarked string columns are still present.
dataset.dtypes

[('Survived', 'float'),
 ('Pclass', 'float'),
 ('Sex', 'string'),
 ('Age', 'float'),
 ('Fare', 'float'),
 ('Embarked', 'string'),
 ('Gender', 'double'),
 ('Boarded', 'double')]

In [62]:
# The string columns have been replaced by their numeric encodings
# (Gender, Boarded), so remove the originals in one call.
dataset = dataset.drop('Sex', 'Embarked')
dataset.show()
dataset.count()

+--------+------+----+-------+------+-------+
|Survived|Pclass| Age|   Fare|Gender|Boarded|
+--------+------+----+-------+------+-------+
|     0.0|   3.0|22.0|   7.25|   0.0|    0.0|
|     1.0|   1.0|38.0|71.2833|   1.0|    1.0|
|     1.0|   3.0|26.0|  7.925|   1.0|    0.0|
|     1.0|   1.0|35.0|   53.1|   1.0|    0.0|
|     0.0|   3.0|35.0|   8.05|   0.0|    0.0|
|     0.0|   1.0|54.0|51.8625|   0.0|    0.0|
|     0.0|   3.0| 2.0| 21.075|   0.0|    0.0|
|     1.0|   3.0|27.0|11.1333|   1.0|    0.0|
|     1.0|   2.0|14.0|30.0708|   1.0|    1.0|
|     1.0|   3.0| 4.0|   16.7|   1.0|    0.0|
|     1.0|   1.0|58.0|  26.55|   1.0|    0.0|
|     0.0|   3.0|20.0|   8.05|   0.0|    0.0|
|     0.0|   3.0|39.0| 31.275|   0.0|    0.0|
|     0.0|   3.0|14.0| 7.8542|   1.0|    0.0|
|     1.0|   2.0|55.0|   16.0|   1.0|    0.0|
|     0.0|   3.0| 2.0| 29.125|   0.0|    2.0|
|     0.0|   3.0|31.0|   18.0|   1.0|    0.0|
|     0.0|   2.0|35.0|   26.0|   0.0|    0.0|
|     1.0|   2.0|34.0|   13.0|   0

712

In [49]:
# Spark ML expects all predictors packed into a single vector column.
# To predict "Survived", the other columns are combined into one column
# named "features", whose value looks like [3.0, 22.0, 7.25, 0, 0].

from pyspark.ml.feature import VectorAssembler # todo

# Input columns, in the order they appear inside the assembled vector.
required_features = ['Pclass', 'Age', 'Fare', 'Gender', 'Boarded']

# Assemble all the features with VectorAssembler
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(dataset)

transformed_data

DataFrame[Survived: float, Pclass: float, Age: float, Fare: float, Gender: double, Boarded: double, features: vector]

In [63]:
# Preview the assembled rows; `features` is the vector column built from required_features.
transformed_data.show()
# The row count is the cell result; assembling adds a column, not rows.
transformed_data.count()

+--------+------+----+-------+------+-------+--------------------+
|Survived|Pclass| Age|   Fare|Gender|Boarded|            features|
+--------+------+----+-------+------+-------+--------------------+
|     0.0|   3.0|22.0|   7.25|   0.0|    0.0|[3.0,22.0,7.25,0....|
|     1.0|   1.0|38.0|71.2833|   1.0|    1.0|[1.0,38.0,71.2833...|
|     1.0|   3.0|26.0|  7.925|   1.0|    0.0|[3.0,26.0,7.92500...|
|     1.0|   1.0|35.0|   53.1|   1.0|    0.0|[1.0,35.0,53.0999...|
|     0.0|   3.0|35.0|   8.05|   0.0|    0.0|[3.0,35.0,8.05000...|
|     0.0|   1.0|54.0|51.8625|   0.0|    0.0|[1.0,54.0,51.8624...|
|     0.0|   3.0| 2.0| 21.075|   0.0|    0.0|[3.0,2.0,21.07500...|
|     1.0|   3.0|27.0|11.1333|   1.0|    0.0|[3.0,27.0,11.1332...|
|     1.0|   2.0|14.0|30.0708|   1.0|    1.0|[2.0,14.0,30.0708...|
|     1.0|   3.0| 4.0|   16.7|   1.0|    0.0|[3.0,4.0,16.70000...|
|     1.0|   1.0|58.0|  26.55|   1.0|    0.0|[1.0,58.0,26.5499...|
|     0.0|   3.0|20.0|   8.05|   0.0|    0.0|[3.0,20.0,8.05000

712

In [50]:
# At this point our data preparation is done.
# We will start modeling now.

In [51]:
# Before modeling, do the usual split between training and testing rows.
# The weights are relative proportions (about 80% / 20%); rows are assigned at
# random, so the resulting sizes are approximate rather than exact.
# A fixed seed keeps the split the same across kernel restarts, so the row
# counts and the accuracy reported further down can be reproduced.
(training_data, test_data) = transformed_data.randomSplit([0.8, 0.2], seed=42)

In [59]:
# Inspect the training portion of the split (the 0.8 weight passed to randomSplit).
# Rows are assigned at random, so the share is close to, but not exactly, 80% of the data.
training_data.show()
training_data.count()

+--------+------+----+--------+------+-------+--------------------+
|Survived|Pclass| Age|    Fare|Gender|Boarded|            features|
+--------+------+----+--------+------+-------+--------------------+
|     0.0|   1.0| 2.0|  151.55|   1.0|    0.0|[1.0,2.0,151.5500...|
|     0.0|   1.0|19.0|    53.1|   0.0|    0.0|[1.0,19.0,53.0999...|
|     0.0|   1.0|19.0|   263.0|   0.0|    0.0|[1.0,19.0,263.0,0...|
|     0.0|   1.0|21.0| 77.2875|   0.0|    0.0|[1.0,21.0,77.2874...|
|     0.0|   1.0|22.0|135.6333|   0.0|    1.0|[1.0,22.0,135.633...|
|     0.0|   1.0|24.0|    79.2|   0.0|    1.0|[1.0,24.0,79.1999...|
|     0.0|   1.0|27.0|   211.5|   0.0|    1.0|[1.0,27.0,211.5,0...|
|     0.0|   1.0|29.0|    30.0|   0.0|    0.0|[1.0,29.0,30.0,0....|
|     0.0|   1.0|29.0|    66.6|   0.0|    0.0|[1.0,29.0,66.5999...|
|     0.0|   1.0|30.0|   27.75|   0.0|    1.0|[1.0,30.0,27.75,0...|
|     0.0|   1.0|31.0| 50.4958|   0.0|    0.0|[1.0,31.0,50.4958...|
|     0.0|   1.0|33.0|     5.0|   0.0|    0.0|[1

590

In [60]:
# The remaining rows (the 0.2 weight passed to randomSplit) are held out as test_data.

test_data.show()
test_data.count()

+--------+------+----+--------+------+-------+--------------------+
|Survived|Pclass| Age|    Fare|Gender|Boarded|            features|
+--------+------+----+--------+------+-------+--------------------+
|     0.0|   1.0|18.0|   108.9|   0.0|    1.0|[1.0,18.0,108.900...|
|     0.0|   1.0|24.0|247.5208|   0.0|    1.0|[1.0,24.0,247.520...|
|     0.0|   1.0|25.0|  151.55|   1.0|    0.0|[1.0,25.0,151.550...|
|     0.0|   1.0|28.0|    47.1|   0.0|    0.0|[1.0,28.0,47.0999...|
|     0.0|   1.0|28.0| 82.1708|   0.0|    1.0|[1.0,28.0,82.1707...|
|     0.0|   1.0|31.0|    52.0|   0.0|    0.0|[1.0,31.0,52.0,0....|
|     0.0|   1.0|37.0|    29.7|   0.0|    1.0|[1.0,37.0,29.7000...|
|     0.0|   1.0|38.0|     0.0|   0.0|    0.0|(5,[0,1],[1.0,38.0])|
|     0.0|   1.0|45.0|    35.5|   0.0|    0.0|[1.0,45.0,35.5,0....|
|     0.0|   1.0|47.0| 25.5875|   0.0|    0.0|[1.0,47.0,25.5874...|
|     0.0|   1.0|49.0|110.8833|   0.0|    1.0|[1.0,49.0,110.883...|
|     0.0|   1.0|54.0| 51.8625|   0.0|    0.0|[1

122

In [68]:
# Build an ML model that predicts the "Survived" column from the assembled "features" vector.
# We use a Random Forest Classifier. This object is an estimator: it only holds the
# configuration and has to be fitted on data before it can predict anything.
# Training a random forest involves randomness, so a fixed seed is set to keep the
# fitted model (and the accuracy computed from it) repeatable between runs.

from pyspark.ml.classification import RandomForestClassifier # todo
rf = RandomForestClassifier(labelCol='Survived',
                            featuresCol='features',
                            maxDepth=5,
                            seed=42)

In [70]:
# Fit the random forest estimator on the training rows.
model = rf.fit(training_data)

# fit() returns a fitted model object (a transformer); its .transform() is what adds predictions to a DataFrame.

model

RandomForestClassificationModel (uid=RandomForestClassifier_9beba4e5d150) with 20 trees

In [71]:
# And finally, predict on the held-out test dataset.
# The result keeps the input columns and appends rawPrediction, probability and prediction.
predictions = model.transform(test_data)

predictions

DataFrame[Survived: float, Pclass: float, Age: float, Fare: float, Gender: double, Boarded: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [72]:
# Done! My first Spark ML model.
# Show the first scored test rows: the true label (Survived) alongside the model's prediction.
predictions.show()

+--------+------+----+--------+------+-------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass| Age|    Fare|Gender|Boarded|            features|       rawPrediction|         probability|prediction|
+--------+------+----+--------+------+-------+--------------------+--------------------+--------------------+----------+
|     0.0|   1.0|18.0|   108.9|   0.0|    1.0|[1.0,18.0,108.900...|[11.6333451394326...|[0.58166725697163...|       0.0|
|     0.0|   1.0|24.0|247.5208|   0.0|    1.0|[1.0,24.0,247.520...|[11.3909573189198...|[0.56954786594599...|       0.0|
|     0.0|   1.0|25.0|  151.55|   1.0|    0.0|[1.0,25.0,151.550...|[0.93400643275822...|[0.04670032163791...|       1.0|
|     0.0|   1.0|28.0|    47.1|   0.0|    0.0|[1.0,28.0,47.0999...|[9.77377106754905...|[0.48868855337745...|       1.0|
|     0.0|   1.0|28.0| 82.1708|   0.0|    1.0|[1.0,28.0,82.1707...|[9.87266677689661...|[0.49363333884483...|       1.0|
|     0.0|   1.0|31.0|    52.0| 

In [74]:
# Evaluate the model with a basic metric: accuracy, i.e. how often the
# 'prediction' column agrees with the true 'Survived' label.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # todo

evaluator_settings = {
    'labelCol': 'Survived',
    'predictionCol': 'prediction',
    'metricName': 'accuracy',
}
evaluator = MulticlassClassificationEvaluator(**evaluator_settings)

evaluator

MulticlassClassificationEvaluator_2c884964e7f0

In [76]:
# Compute the configured metric (accuracy) over the test-set predictions and report it.
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.8114754098360656
