In [1]:
! pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/37/98/244399c0daa7894cdf387e7007d5e8b3710a79b67f3fd991c0b0b644822d/pyspark-2.4.3.tar.gz (215.6MB)
[K     |████████████████████████████████| 215.6MB 89kB/s 
[?25hCollecting py4j==0.10.7 (from pyspark)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 45.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
[?25h  Created wheel for pyspark: filename=pyspark-2.4.3-py2.py3-none-any.whl size=215965824 sha256=72338cf291fe88ecde

In [2]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
 
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['gender_submission.csv', 'test.csv', 'train.csv']


In [3]:
spark = SparkSession.builder.getOrCreate()
spark

# 1. Load data

In [4]:
sdf_train = spark.read.csv("../input/train.csv",inferSchema=True,header=True)
print(sdf_train.printSchema())
pdf = sdf_train.limit(5).toPandas()
pdf.T

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

None


Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [5]:
sdf_test = spark.read.csv("../input/test.csv",inferSchema=True,header=True)
# sdf_train.printSchema()
pdf = sdf_test.limit(5).toPandas()
pdf.T

Unnamed: 0,0,1,2,3,4
PassengerId,892,893,894,895,896
Pclass,3,3,2,3,3
Name,"Kelly, Mr. James","Wilkes, Mrs. James (Ellen Needs)","Myles, Mr. Thomas Francis","Wirz, Mr. Albert","Hirvonen, Mrs. Alexander (Helga E Lindqvist)"
Sex,male,female,male,male,female
Age,34.5,47,62,27,22
SibSp,0,1,0,0,1
Parch,0,0,0,0,1
Ticket,330911,363272,240276,315154,3101298
Fare,7.8292,7,9.6875,8.6625,12.2875
Cabin,,,,,


# 2. Data cleanup

In [6]:
sdf_typecast = sdf_train.withColumn('Ticket', sdf_train['Ticket'].cast("double"))
sdf_typecast = sdf_typecast.fillna(0)
# pdf = sdf_typecast.limit(5).toPandas()
# pdf.T

# 3. Feature engineering

In [7]:
numeric_cols = ['PassengerId','Survived', 'Pclass','Age', 'SibSp','Parch','Ticket','Fare'] 
numeric_features = ['Pclass','Age', 'SibSp','Parch','Fare'] 
# string_features = [ 'Cabin', 'Embarked', 'Sex','Ticket']
# 'Name',
sdf_train_subset = sdf_typecast #.select(numeric_cols)    


In [8]:
_stages = []

In [9]:
from pyspark.ml.feature import VectorAssembler
assemblerInput = numeric_features # [f + '_vect' for f in string_features] + 
print(assemblerInput)
vectAssembler = VectorAssembler(inputCols  = assemblerInput, outputCol = "vect_features") #.fit(sdf_train_subset)  
_stages += [vectAssembler]
# handleInvalid = "keep" or "skip"

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


# 4. ML model

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier

# dt = DecisionTreeClassifier(labelCol = 'Survived', featuresCol = 'vect_features') # ,maxDepth=1
# _stages += [dt]

In [11]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol = 'Survived', featuresCol = 'vect_features', numTrees = 100, maxDepth = 4)
_stages += [rf]

In [12]:
_stages

[VectorAssembler_90754b0e1fe0, RandomForestClassifier_1a6394ae61af]

In [13]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = _stages)

In [14]:
model = pipeline.fit(sdf_train_subset)

In [15]:
numeric_cols_test = ['PassengerId', 'Pclass','Age', 'SibSp','Parch','Ticket','Fare'] 

sdf_test_subset = sdf_test.withColumn('Ticket', sdf_test['Ticket'].cast("double")). \
                        fillna(0). \
                        select(numeric_cols_test)

In [16]:
sdf_predict = model.transform(sdf_test_subset)

In [17]:
pdf = sdf_predict.limit(10).toPandas()
pdf.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
PassengerId,892,893,894,895,896,897,898,899,900,901
Pclass,3,3,2,3,3,3,3,2,3,3
Age,34.5,47,62,27,22,14,30,26,18,21
SibSp,0,1,0,0,1,0,0,1,0,2
Parch,0,0,0,0,1,0,0,1,0,0
Ticket,330911,363272,240276,315154,3.1013e+06,7538,330972,248738,2657,0
Fare,7.8292,7,9.6875,8.6625,12.2875,9.225,7.6292,29,7.2292,24.15
vect_features,"[3.0, 34.5, 0.0, 0.0, 7.8292]","[3.0, 47.0, 1.0, 0.0, 7.0]","[2.0, 62.0, 0.0, 0.0, 9.6875]","[3.0, 27.0, 0.0, 0.0, 8.6625]","[3.0, 22.0, 1.0, 1.0, 12.2875]","[3.0, 14.0, 0.0, 0.0, 9.225]","[3.0, 30.0, 0.0, 0.0, 7.6292]","[2.0, 26.0, 1.0, 1.0, 29.0]","[3.0, 18.0, 0.0, 0.0, 7.2292]","[3.0, 21.0, 2.0, 0.0, 24.15]"
rawPrediction,"[82.42631229488464, 17.573687705115383]","[84.19283156908863, 15.807168430911414]","[72.26057916523743, 27.73942083476257]","[78.53160862458768, 21.468391375412317]","[56.51719722672352, 43.48280277327648]","[75.54727694354735, 24.452723056452662]","[79.19153729949478, 20.808462700505206]","[34.96810065141485, 65.03189934858516]","[78.31035005112972, 21.68964994887027]","[69.9015460428122, 30.09845395718778]"
probability,"[0.8242631229488462, 0.1757368770511538]","[0.8419283156908859, 0.15807168430911409]","[0.7226057916523743, 0.2773942083476257]","[0.7853160862458768, 0.21468391375412316]","[0.5651719722672351, 0.4348280277327648]","[0.7554727694354735, 0.24452723056452663]","[0.791915372994948, 0.2080846270050521]","[0.3496810065141485, 0.6503189934858515]","[0.7831035005112973, 0.2168964994887027]","[0.6990154604281222, 0.3009845395718779]"


In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="prediction")
print("Test Area Under ROC: " + str(evaluator.evaluate(sdf_predict, {evaluator.metricName: "areaUnderROC"})))


Test Area Under ROC: 1.0


In [19]:
sdf_submission = sdf_predict.select('PassengerId','prediction').withColumn('Survived',sdf_predict['prediction'].cast('integer')).select('PassengerId','Survived')
sdf_submission.show()

+-----------+--------+
|PassengerId|Survived|
+-----------+--------+
|        892|       0|
|        893|       0|
|        894|       0|
|        895|       0|
|        896|       0|
|        897|       0|
|        898|       0|
|        899|       1|
|        900|       0|
|        901|       0|
|        902|       0|
|        903|       0|
|        904|       1|
|        905|       0|
|        906|       1|
|        907|       1|
|        908|       0|
|        909|       0|
|        910|       0|
|        911|       0|
+-----------+--------+
only showing top 20 rows



In [20]:
sdf_submission.coalesce(1).write.csv("submission",mode="overwrite",header=True)

In [21]:
print(os.listdir('submission'))

['.part-00000-bda957a1-8b19-4c65-83fb-80932933fa5a-c000.csv.crc', '._SUCCESS.crc', 'part-00000-bda957a1-8b19-4c65-83fb-80932933fa5a-c000.csv', '_SUCCESS']


<a href="submission/part-00000-b53a2b2f-1d11-459b-923b-a7231ed9a7d6-c000.csv"> Download File </a>

Further reading:   
https://spark.apache.org/docs/latest/mllib-decision-tree.html  
https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-trees  
https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier  