In [1]:
import os
import sys

spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()

import pyspark

In [2]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD
import numpy
from pyspark.mllib.linalg import Matrix, Matrices, Vectors, DenseMatrix, SparseVector

In [3]:
number_cores = 8
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [4]:
!dir ./data

gender_submission.csv  output1.csv  output.csv	test.csv  train.csv


In [5]:
spark = pyspark.sql.SparkSession(sc)

In [6]:
df = spark.read.csv('./data/train.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



- PassengerID
- Sex (Women and children first)
- Age (Male > 10 considered adult)
- Cabin (Numbers increase back to front, front sank first; A-F in decreasing class order)
- PClass (higher classes were further away from boiler room/rising water)

In [7]:
df.take(5)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S')]

In [8]:
df.registerTempTable('df')

In [16]:
df1 = spark.sql("SELECT Survived, PassengerID, Pclass, Sex, Age, Cabin FROM df")

In [17]:
df1.show()

+--------+-----------+------+------+----+-----+
|Survived|PassengerID|Pclass|   Sex| Age|Cabin|
+--------+-----------+------+------+----+-----+
|       0|          1|     3|  male|22.0| null|
|       1|          2|     1|female|38.0|  C85|
|       1|          3|     3|female|26.0| null|
|       1|          4|     1|female|35.0| C123|
|       0|          5|     3|  male|35.0| null|
|       0|          6|     3|  male|null| null|
|       0|          7|     1|  male|54.0|  E46|
|       0|          8|     3|  male| 2.0| null|
|       1|          9|     3|female|27.0| null|
|       1|         10|     2|female|14.0| null|
|       1|         11|     3|female| 4.0|   G6|
|       1|         12|     1|female|58.0| C103|
|       0|         13|     3|  male|20.0| null|
|       0|         14|     3|  male|39.0| null|
|       0|         15|     3|female|14.0| null|
|       1|         16|     2|female|55.0| null|
|       0|         17|     3|  male| 2.0| null|
|       1|         18|     2|  male|null

In [18]:
#unique_survivors = df.select('Survived').distinct().collect()
#print(unique_survivors)

In [19]:
columns = df1.columns
print(columns)

['Survived', 'PassengerID', 'Pclass', 'Sex', 'Age', 'Cabin']


In [20]:
cat_columns = ['Survived', 'PassengerID', 'Pclass', 'Sex', 'Age', 'Cabin']

In [21]:
import re
def myFunction(s):
    return (''.join(c for c in s if c.isdigit()) or None, 
            ''.join(c for c in s if c.isalpha()) or None)

Source: https://stackoverflow.com/questions/21917989/how-to-split-a-python-string-with-numbers-and-letters

In [22]:
cat_dictionary = {}

for c in cat_columns:
    unique_c = df1.select(c).distinct().collect()
    #print(c + ": " + str(len(unique_c)))
    cat_dictionary[c] = {}
    i = 0
    for v in unique_c:
        if c == "PassengerID":
            cat_dictionary[c][v[c]] = 0
        elif c == "Sex":
            cat_dictionary[c]['female'] = 5
            cat_dictionary[c]['male'] = 1
        elif c == "Age":
            if v[c] == None:
                cat_dictionary[c][v[c]] = 0
            elif v[c] < 10:
                cat_dictionary[c][v[c]] = 5
            else:
                cat_dictionary[c][v[c]] = 1
        elif c == "Pclass":
            cat_dictionary[c][v[c]] = 3-v[c]
        elif c == "Cabin":
            if v[c] == None:
                cat_dictionary[c][v[c]] = 0
            else:
                s = v[c].split()
                for i in s:
                    x = myFunction(i)
                    if x[0] == None:
                        cat_dictionary[c][v[c]] = 0
                    elif int(x[0]) < 36 and x[1] == 'A':
                        cat_dictionary[c][v[c]] = 3.5
                    elif int(x[0]) >= 36 and x[1] == 'A':
                        cat_dictionary[c][v[c]] = 2.5
                    elif int(x[0]) < 57 and x[1] == 'B':
                        cat_dictionary[c][v[c]] = 3
                    elif int(x[0]) >= 57 and x[1] == 'B':
                        cat_dictionary[c][v[c]] = 2
                    elif int(x[0]) < 57 and x[1] == 'C':
                        cat_dictionary[c][v[c]] = 3
                    elif int(x[0]) >= 57 and x[1] == 'C':
                        cat_dictionary[c][v[c]] = 2
                    elif int(x[0]) < 51 and x[1] == 'D':
                        cat_dictionary[c][v[c]] = 2
                    elif int(x[0]) >= 51 and x[1] == 'D':
                        cat_dictionary[c][v[c]] = 1
                    elif int(x[0]) < 56 and x[1] == 'E':
                        cat_dictionary[c][v[c]] = 2
                    elif int(x[0]) >= 56 and x[1] == 'E':
                        cat_dictionary[c][v[c]] = 1
                    elif int(x[0]) < 65 and x[1] == 'F':
                        cat_dictionary[c][v[c]] = 0.5
                    elif int(x[0]) >= 65 and x[1] == 'F':
                        cat_dictionary[c][v[c]] = 1
                    elif int(x[0]) < 40 and x[1] == 'G':
                        cat_dictionary[c][v[c]] = 0.5
                    elif int(x[0]) >= 56 and x[1] == 'G':
                        cat_dictionary[c][v[c]] = 1
                    else:
                        cat_dictionary[c][v[c]] = 0
        else:
            cat_dictionary[c][v[c]] = i
            i += 1

In [23]:
# feature vector that we will multiply the weights to
#print(cat_dictionary)

In [24]:
def dataPrep(r):
    key = 0
    value = []
    for c in columns:
        if c == 'Survived':
            key = cat_dictionary[c][r[columns.index(c)]]
        else:
            if c in cat_columns:
                value.append(cat_dictionary[c][r[columns.index(c)]])
            else:
                value.append(r[columns.index(c)])
    return LabeledPoint(key, value)

In [25]:
# values of each feature for each row
df_clean = df1.rdd.map(dataPrep)
df_clean.take(5)

[LabeledPoint(1.0, [0.0,0.0,1.0,1.0,0.0]),
 LabeledPoint(0.0, [0.0,2.0,5.0,1.0,2.0]),
 LabeledPoint(0.0, [0.0,0.0,5.0,1.0,0.0]),
 LabeledPoint(0.0, [0.0,2.0,5.0,1.0,2.0]),
 LabeledPoint(1.0, [0.0,0.0,1.0,1.0,0.0])]

In [26]:
#columns.index("Survived")

In [27]:
# Split data for training set and testing set (in reality you want 3, training, testing, and validating)
df_svm = df_clean.randomSplit([0.8, 0.2], 1234)

In [28]:
print(df_clean.count())
print(df_svm[0].count())
print(df_svm[1].count())

891
710
181


In [64]:
df_svm[0].take(1)

[LabeledPoint(1.0, [0.0,0.0,1.0,1.0,0.0])]

In [30]:
svm_titanic = SVMWithSGD.train(df_svm[0], iterations=200)

In [31]:
#svm_titanic.predict([0.0,1.0,1.0,48.0,45.0])

In [32]:
def testPrediction(p):
    prediction = svm_titanic.predict(p.features)
    if prediction == p.label:
        return ("correct", 1)
    else:
        return ("incorrect", 1)

In [33]:
testPrediction(LabeledPoint(1.0, [0.0,0.0,1.0,1.0,0.0]))

('correct', 1)

In [34]:
df_results = df_svm[0].map(testPrediction).reduceByKey(lambda x, y: x + y)
df_results.collect()

[('correct', 481), ('incorrect', 229)]

In [35]:
481/(481+229)

0.6774647887323944

In [36]:
def output(p):
    prediction = svm_titanic.predict(p.features)
    return ('-', prediction)

In [37]:
df_out = df_svm[0].map(lambda x: output(x))

In [57]:
#df_out.collect()

In [39]:
df_out1 = spark.createDataFrame(df_out, ['arb','result'])

In [40]:
df_out1.registerTempTable("df_out1")

In [41]:
df_out1.printSchema()

root
 |-- arb: string (nullable = true)
 |-- result: long (nullable = true)



In [42]:
out = spark.sql("SELECT result FROM df_out1")

In [58]:
#out.show()

In [44]:
out.registerTempTable("out")

In [59]:
#out.printSchema()

In [46]:
from pyspark.sql.functions import monotonically_increasing_id
new_df = out.withColumn("id", monotonically_increasing_id())

In [47]:
new_df.registerTempTable("new_df")

In [60]:
#new_df.show()

In [49]:
output_rdd = new_df.rdd.map(lambda x: (x[1]+1, x[0]))

In [61]:
#output_rdd.take(10)

In [51]:
output = spark.createDataFrame(output_rdd, ['PassengerID', 'Survived'])

In [62]:
#output.show()

In [54]:
output.write.csv('output1.csv')

In [73]:
df_test1 = spark.read.csv('./data/test.csv', header = True, inferSchema = True)

In [86]:
df_test1.registerTempTable('df_test1')

In [87]:
df_test1.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [88]:
df_test = spark.sql("SELECT PassengerID, Pclass, Sex, Age, Cabin FROM df_test1")

In [89]:
df_test.take(10)

[Row(PassengerID=892, Pclass=3, Sex='male', Age=34.5, Cabin=None),
 Row(PassengerID=893, Pclass=3, Sex='female', Age=47.0, Cabin=None),
 Row(PassengerID=894, Pclass=2, Sex='male', Age=62.0, Cabin=None),
 Row(PassengerID=895, Pclass=3, Sex='male', Age=27.0, Cabin=None),
 Row(PassengerID=896, Pclass=3, Sex='female', Age=22.0, Cabin=None),
 Row(PassengerID=897, Pclass=3, Sex='male', Age=14.0, Cabin=None),
 Row(PassengerID=898, Pclass=3, Sex='female', Age=30.0, Cabin=None),
 Row(PassengerID=899, Pclass=2, Sex='male', Age=26.0, Cabin=None),
 Row(PassengerID=900, Pclass=3, Sex='female', Age=18.0, Cabin=None),
 Row(PassengerID=901, Pclass=3, Sex='male', Age=21.0, Cabin=None)]

In [90]:
cat_columns = ['PassengerID', 'Pclass', 'Sex', 'Age', 'Cabin']

In [95]:
cat_dictionary = {}

for c in cat_columns:
    unique_c = df1.select(c).distinct().collect()
    #print(c + ": " + str(len(unique_c)))
    cat_dictionary[c] = {}
    i = 0
    for v in unique_c:
        if c == "PassengerID":
            cat_dictionary[c][v[c]] = 0
        elif c == "Sex":
            cat_dictionary[c]['female'] = 5
            cat_dictionary[c]['male'] = 1
        elif c == "Age":
            if v[c] == None:
                cat_dictionary[c][v[c]] = 0
            elif v[c] < 10:
                cat_dictionary[c][v[c]] = 5
            else:
                cat_dictionary[c][v[c]] = 1
        elif c == "Pclass":
            cat_dictionary[c][v[c]] = 3-v[c]
        elif c == "Cabin":
            if v[c] == None:
                cat_dictionary[c][v[c]] = 0
            else:
                s = v[c].split()
                for i in s:
                    x = myFunction(i)
                    if x[0] == None:
                        cat_dictionary[c][v[c]] = 0
                    elif int(x[0]) < 36 and x[1] == 'A':
                        cat_dictionary[c][v[c]] = 3.5
                    elif int(x[0]) >= 36 and x[1] == 'A':
                        cat_dictionary[c][v[c]] = 2.5
                    elif int(x[0]) < 57 and x[1] == 'B':
                        cat_dictionary[c][v[c]] = 3
                    elif int(x[0]) >= 57 and x[1] == 'B':
                        cat_dictionary[c][v[c]] = 2
                    elif int(x[0]) < 57 and x[1] == 'C':
                        cat_dictionary[c][v[c]] = 3
                    elif int(x[0]) >= 57 and x[1] == 'C':
                        cat_dictionary[c][v[c]] = 2
                    elif int(x[0]) < 51 and x[1] == 'D':
                        cat_dictionary[c][v[c]] = 2
                    elif int(x[0]) >= 51 and x[1] == 'D':
                        cat_dictionary[c][v[c]] = 1
                    elif int(x[0]) < 56 and x[1] == 'E':
                        cat_dictionary[c][v[c]] = 2
                    elif int(x[0]) >= 56 and x[1] == 'E':
                        cat_dictionary[c][v[c]] = 1
                    elif int(x[0]) < 65 and x[1] == 'F':
                        cat_dictionary[c][v[c]] = 0.5
                    elif int(x[0]) >= 65 and x[1] == 'F':
                        cat_dictionary[c][v[c]] = 1
                    elif int(x[0]) < 40 and x[1] == 'G':
                        cat_dictionary[c][v[c]] = 0.5
                    elif int(x[0]) >= 56 and x[1] == 'G':
                        cat_dictionary[c][v[c]] = 1
                    else:
                        cat_dictionary[c][v[c]] = 0
        else:
            cat_dictionary[c][v[c]] = i
            i += 1

In [99]:
def testDataPrep(r):
    value = []
    for c in columns:
        if c in cat_columns:
            value.append(cat_dictionary[c][r[columns.index(c)]])
        else:
            value.append(r[columns.index(c)])
    return LabeledPoint(value)

In [100]:
df_test_clean = df_test.rdd.map(testDataPrep)

In [101]:
df_test_clean.take(10)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 291.0 failed 1 times, most recent failure: Lost task 0.0 in stage 291.0 (TID 5465, pcvm606-1.emulab.net, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/rdd.py", line 1440, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-99-e7465eb46516>", line 5, in testDataPrep
KeyError: 'male'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:154)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2139)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:154)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/rdd.py", line 1440, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-99-e7465eb46516>", line 5, in testDataPrep
KeyError: 'male'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:154)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2139)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [None]:
#sc.stop()