In [1]:
import os
import sys

spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()

import pyspark

In [33]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD
import numpy
from pyspark.mllib.linalg import Matrix, Matrices, Vectors, DenseMatrix, SparseVector

In [2]:
number_cores = 8
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [4]:
!dir ./data

gender_submission.csv  test.csv  train.csv


In [5]:
spark = pyspark.sql.SparkSession(sc)

In [10]:
df = spark.read.csv('./data/train.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



- PassengerID
- Sex (Women and children first)
- Age (Male > 10 considered adult)
- Cabin (Numbers increase back to front, front sank first; A-F in decreasing class order)
- PClass (higher classes were further away from boiler room/rising water)

In [11]:
df.take(5)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S')]

In [12]:
unique_survivors = df.select('Survived').distinct().collect()
print(unique_survivors)

[Row(Survived=1), Row(Survived=0)]


In [13]:
columns = df.columns
print(columns)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [15]:
cat_columns = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

In [16]:
cat_dictionary = {}

for c in cat_columns:
    unique_c = df.select(c).distinct().collect()
    #print(c + ": " + str(len(unique_c)))
    cat_dictionary[c] = {}
    i = 0
    for v in unique_c:
        cat_dictionary[c][v[c]] = i
        i += 1

In [17]:
# feature vector that we will multiply the weights to
print(cat_dictionary)

{'PassengerId': {148: 0, 463: 1, 471: 2, 496: 3, 833: 4, 243: 5, 392: 6, 540: 7, 623: 8, 737: 9, 858: 10, 31: 11, 516: 12, 85: 13, 137: 14, 251: 15, 451: 16, 580: 17, 808: 18, 65: 19, 458: 20, 879: 21, 883: 22, 53: 23, 255: 24, 481: 25, 588: 26, 799: 27, 804: 28, 133: 29, 296: 30, 472: 31, 853: 32, 78: 33, 322: 34, 513: 35, 321: 36, 362: 37, 613: 38, 633: 39, 673: 40, 857: 41, 375: 42, 593: 43, 597: 44, 876: 45, 108: 46, 155: 47, 683: 48, 744: 49, 34: 50, 193: 51, 211: 52, 368: 53, 530: 54, 642: 55, 796: 56, 101: 57, 115: 58, 126: 59, 756: 60, 772: 61, 81: 62, 385: 63, 830: 64, 847: 65, 874: 66, 28: 67, 183: 68, 210: 69, 436: 70, 497: 71, 596: 72, 762: 73, 300: 74, 406: 75, 412: 76, 784: 77, 787: 78, 842: 79, 587: 80, 731: 81, 76: 82, 667: 83, 688: 84, 723: 85, 26: 86, 27: 87, 332: 88, 501: 89, 577: 90, 626: 91, 384: 92, 831: 93, 44: 94, 159: 95, 192: 96, 271: 97, 606: 98, 743: 99, 811: 100, 844: 101, 253: 102, 806: 103, 103: 104, 236: 105, 329: 106, 460: 107, 12: 108, 336: 109, 350: 1

In [29]:
cat_dictionary['Survived'][1]

0

In [34]:
def dataPrep(r):
    key = 0
    value = []
    for c in columns:
        if c == 'Survived':
            key = cat_dictionary[c][r[columns.index(c)]]
        else:
            if c in cat_columns:
                value.append(cat_dictionary[c][r[columns.index(c)]])
            else:
                value.append(r[columns.index(c)])
    return LabeledPoint(key, value)

In [35]:
# values of each feature for each row
df_clean = df.rdd.map(dataPrep)
df_clean.take(5)

[LabeledPoint(1.0, [200.0,1.0,614.0,1.0,48.0,0.0,6.0,335.0,66.0,45.0,3.0]),
 LabeledPoint(0.0, [797.0,0.0,353.0,0.0,64.0,0.0,6.0,165.0,183.0,74.0,2.0]),
 LabeledPoint(0.0, [235.0,1.0,365.0,0.0,81.0,6.0,6.0,198.0,244.0,45.0,3.0]),
 LabeledPoint(0.0, [451.0,0.0,764.0,0.0,12.0,0.0,6.0,289.0,215.0,48.0,3.0]),
 LabeledPoint(1.0, [293.0,1.0,670.0,1.0,12.0,6.0,6.0,603.0,127.0,45.0,3.0])]

In [36]:
columns.index("Survived")

1

In [37]:
# Split data for training set and testing set (in reality you want 3, training, testing, and validating)
df_svm = df_clean.randomSplit([0.8, 0.2], 1234)

In [38]:
print(df_clean.count())
print(df_svm[0].count())
print(df_svm[1].count())

891
710
181


In [39]:
svm_titanic = SVMWithSGD.train(df_svm[0], iterations=200)

In [41]:
svm_titanic.predict([200.0,1.0,614.0,1.0,48.0,0.0,6.0,335.0,66.0,45.0,3.0])

0

In [44]:
def testPrediction(p):
    prediction = svm_titanic.predict(p.features)
    if prediction == p.label:
        return ("correct", 1)
    else:
        return ("incorrect", 1)

In [45]:
df_results = df_svm[1].map(testPrediction).reduceByKey(lambda x, y: x + y)
df_results.collect()

[('correct', 76), ('incorrect', 105)]

In [47]:
76/105

0.7238095238095238

In [48]:
df_svm[0].sample(False, 0.01).collect()

[LabeledPoint(1.0, [319.0,1.0,763.0,1.0,79.0,6.0,6.0,275.0,210.0,45.0,0.0]),
 LabeledPoint(1.0, [809.0,1.0,444.0,0.0,77.0,6.0,6.0,48.0,20.0,45.0,3.0]),
 LabeledPoint(1.0, [37.0,2.0,687.0,1.0,5.0,0.0,6.0,230.0,22.0,45.0,2.0]),
 LabeledPoint(1.0, [474.0,1.0,459.0,1.0,74.0,6.0,6.0,433.0,232.0,45.0,3.0]),
 LabeledPoint(1.0, [77.0,1.0,153.0,1.0,13.0,0.0,5.0,268.0,59.0,45.0,3.0]),
 LabeledPoint(0.0, [28.0,1.0,418.0,1.0,27.0,6.0,0.0,298.0,2.0,45.0,2.0]),
 LabeledPoint(0.0, [662.0,0.0,669.0,0.0,47.0,6.0,6.0,124.0,175.0,90.0,3.0])]

In [None]:
sc.stop()