In [32]:
import os
import sys

spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()

import pyspark

In [33]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD
import numpy
from pyspark.mllib.linalg import Matrix, Matrices, Vectors, DenseMatrix, SparseVector

In [34]:
number_cores = 8
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [35]:
!dir ./data

gender_submission.csv  test.csv  train.csv


In [36]:
spark = pyspark.sql.SparkSession(sc)

In [37]:
df = spark.read.csv('./data/train.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



- PassengerID
- Sex (Women and children first)
- Age (Male > 10 considered adult)
- Cabin (Numbers increase back to front, front sank first; A-F in decreasing class order)
- PClass (higher classes were further away from boiler room/rising water)

In [38]:
df.take(5)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S')]

In [63]:
df.registerTempTable('df')

In [203]:
df1 = spark.sql("SELECT Survived, PassengerID, Pclass, Sex, Age, Cabin FROM df")

In [204]:
df1.show()

+--------+-----------+------+------+----+-----+
|Survived|PassengerID|Pclass|   Sex| Age|Cabin|
+--------+-----------+------+------+----+-----+
|       0|          1|     3|  male|22.0| null|
|       1|          2|     1|female|38.0|  C85|
|       1|          3|     3|female|26.0| null|
|       1|          4|     1|female|35.0| C123|
|       0|          5|     3|  male|35.0| null|
|       0|          6|     3|  male|null| null|
|       0|          7|     1|  male|54.0|  E46|
|       0|          8|     3|  male| 2.0| null|
|       1|          9|     3|female|27.0| null|
|       1|         10|     2|female|14.0| null|
|       1|         11|     3|female| 4.0|   G6|
|       1|         12|     1|female|58.0| C103|
|       0|         13|     3|  male|20.0| null|
|       0|         14|     3|  male|39.0| null|
|       0|         15|     3|female|14.0| null|
|       1|         16|     2|female|55.0| null|
|       0|         17|     3|  male| 2.0| null|
|       1|         18|     2|  male|null

In [205]:
unique_survivors = df.select('Survived').distinct().collect()
print(unique_survivors)

[Row(Survived=1), Row(Survived=0)]


In [206]:
columns = df1.columns
print(columns)

['Survived', 'PassengerID', 'Pclass', 'Sex', 'Age', 'Cabin']


In [207]:
cat_columns = ['Survived', 'PassengerID', 'Pclass', 'Sex', 'Age', 'Cabin']

In [306]:
cat_dictionary = {}

for c in cat_columns:
    unique_c = df1.select(c).distinct().collect()
    #print(c + ": " + str(len(unique_c)))
    cat_dictionary[c] = {}
    i = 0
    for v in unique_c:
        if c == "PassengerID":
            cat_dictionary[c][v[c]] = 0
        elif c == "Sex":
            cat_dictionary[c]['female'] = 1
            cat_dictionary[c]['male'] = 0
        elif c == "Age":
            cat_dictionary[c][v[c]] = 100-v[c]
        else:
            cat_dictionary[c][v[c]] = i
            i += 1

TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'

In [305]:
#df1.select("Age").distinct().collect()

In [None]:
# feature vector that we will multiply the weights to
print(cat_dictionary)

In [267]:
def dataPrep(r):
    key = 0
    value = []
    for c in columns:
        if c == 'Survived':
            key = cat_dictionary[c][r[columns.index(c)]]
        else:
            if c in cat_columns:
                value.append(cat_dictionary[c][r[columns.index(c)]])
            else:
                value.append(r[columns.index(c)])
    return LabeledPoint(key, value)

In [268]:
# values of each feature for each row
df_clean = df1.rdd.map(dataPrep)
df_clean.take(5)

[LabeledPoint(1.0, [0.0,1.0,0.0,48.0,45.0]),
 LabeledPoint(0.0, [0.0,0.0,1.0,64.0,74.0]),
 LabeledPoint(0.0, [0.0,1.0,1.0,81.0,45.0]),
 LabeledPoint(0.0, [0.0,0.0,1.0,12.0,48.0]),
 LabeledPoint(1.0, [0.0,1.0,0.0,12.0,45.0])]

In [269]:
columns.index("Survived")

0

In [270]:
# Split data for training set and testing set (in reality you want 3, training, testing, and validating)
df_svm = df_clean.randomSplit([0.8, 0.2], 1234)

In [271]:
print(df_clean.count())
print(df_svm[0].count())
print(df_svm[1].count())

891
710
181


In [272]:
svm_titanic = SVMWithSGD.train(df_svm[0], iterations=200)

In [273]:
svm_titanic.predict([0.0,1.0,1.0,48.0,45.0])

0

In [274]:
def testPrediction(p):
    prediction = svm_titanic.predict(p.features)
    if prediction == p.label:
        return ("correct", 1)
    else:
        return ("incorrect", 1)

In [275]:
df_results = df_svm[1].map(testPrediction).reduceByKey(lambda x, y: x + y)
df_results.collect()

[('incorrect', 88), ('correct', 93)]

In [278]:
93/(88+93)

0.5138121546961326

In [277]:
df_svm[0].sample(False, 0.01).collect()

[LabeledPoint(0.0, [0.0,0.0,1.0,40.0,52.0]),
 LabeledPoint(1.0, [0.0,1.0,0.0,42.0,45.0]),
 LabeledPoint(1.0, [0.0,2.0,0.0,11.0,45.0]),
 LabeledPoint(0.0, [0.0,0.0,1.0,24.0,124.0]),
 LabeledPoint(0.0, [0.0,2.0,1.0,69.0,45.0]),
 LabeledPoint(0.0, [0.0,0.0,1.0,13.0,14.0]),
 LabeledPoint(1.0, [0.0,1.0,0.0,13.0,45.0]),
 LabeledPoint(0.0, [0.0,2.0,1.0,68.0,45.0]),
 LabeledPoint(0.0, [0.0,2.0,0.0,82.0,45.0]),
 LabeledPoint(1.0, [0.0,2.0,0.0,59.0,45.0])]

In [25]:
sc.stop()