# Predicting survival of passengers in Titanic dataset
Link: https://www.kaggle.com/c/titanic

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.master("spark://localhost:7077").appName("Predicting survivals in titanic").getOrCreate()

In [3]:
train_data = spark.read.csv("file:///home/user/titanic_dataset/train.csv", header=True, inferSchema=True)

In [4]:
test_data = spark.read.csv("file:///home/user/titanic_dataset/test.csv", header=True, inferSchema=True)

In [5]:
combine_data = train_data.drop('Survived').unionByName(test_data)

In [6]:
combine_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
combine_data.cache()

DataFrame[PassengerId: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

In [8]:
combine_data.show(10, False)

+-----------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Pclass|Name                                               |Sex   |Age |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|
+-----------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|1          |3     |Braund, Mr. Owen Harris                            |male  |22.0|1    |0    |A/5 21171       |7.25   |null |S       |
|2          |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38.0|1    |0    |PC 17599        |71.2833|C85  |C       |
|3          |3     |Heikkinen, Miss. Laina                             |female|26.0|0    |0    |STON/O2. 3101282|7.925  |null |S       |
|4          |1     |Futrelle, Mrs. Jacques Heath (Lily May Peel)       |female|35.0|1    |0    |113803          |53.1   |C123 |S       |
|5          |3     |Allen, Mr. William He

In [9]:
from pyspark.sql import functions as F

In [10]:
combine_data = combine_data.withColumn('Title',F.regexp_extract(F.col('Name'), '.*,\ (\w+).*', 1))

In [11]:
combine_data = combine_data.drop('Name','Ticket','Fare','Cabin')

In [12]:
columns = combine_data.columns

In [13]:
def countNull(df,var):
    return df.where(df[var].isNull()).count()

In [14]:
missing = {var: countNull(combine_data,var) for var in columns}

In [15]:
missing

{'Age': 263,
 'Embarked': 2,
 'Parch': 0,
 'PassengerId': 0,
 'Pclass': 0,
 'Sex': 0,
 'SibSp': 0,
 'Title': 0}

In [16]:
avg_age = combine_data.groupBy().agg(F.avg('Age')).first()[0]

In [17]:
combine_data = combine_data.na.fill({'Age': avg_age})

In [18]:
combine_data = combine_data.na.fill({'Embarked': 'S'})

In [19]:
combine_data = combine_data.withColumn('Family', F.col('SibSp')+F.col('Parch'))

In [20]:
train = combine_data.filter(combine_data['PassengerId']<= 891)
test = combine_data.filter(combine_data['PassengerId'] > 891)

In [21]:
train = train.join(train_data.select('PassengerId','Survived'), ['PassengerId'] )

In [22]:
train.show()

+-----------+------+------+------------------+-----+-----+--------+------+------+--------+
|PassengerId|Pclass|   Sex|               Age|SibSp|Parch|Embarked| Title|Family|Survived|
+-----------+------+------+------------------+-----+-----+--------+------+------+--------+
|          1|     3|  male|              22.0|    1|    0|       S|    Mr|     1|       0|
|          2|     1|female|              38.0|    1|    0|       C|   Mrs|     1|       1|
|          3|     3|female|              26.0|    0|    0|       S|  Miss|     0|       1|
|          4|     1|female|              35.0|    1|    0|       S|   Mrs|     1|       1|
|          5|     3|  male|              35.0|    0|    0|       S|    Mr|     0|       0|
|          6|     3|  male|29.881137667304014|    0|    0|       Q|    Mr|     0|       0|
|          7|     1|  male|              54.0|    0|    0|       S|    Mr|     0|       0|
|          8|     3|  male|               2.0|    3|    1|       S|Master|     4|       0|

In [23]:
missing = {var: countNull(combine_data,var) for var in columns}
missing

{'Age': 0,
 'Embarked': 0,
 'Parch': 0,
 'PassengerId': 0,
 'Pclass': 0,
 'Sex': 0,
 'SibSp': 0,
 'Title': 0}

In [24]:
combine_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = false)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Embarked: string (nullable = false)
 |-- Title: string (nullable = true)
 |-- Family: integer (nullable = true)



In [25]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

In [26]:
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexEnc')

In [27]:
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkEnc')

In [28]:
title_indexer = StringIndexer(inputCol='Title',outputCol='TitleIndex')
title_encoder = OneHotEncoder(inputCol='TitleIndex',outputCol='TitleEnc')

In [29]:
pclass_encoder = OneHotEncoder(inputCol='Pclass',outputCol='PclassEnc')

In [30]:
assembler = VectorAssembler(inputCols=['PclassEnc',
 'SexEnc',
 'Age',
 'Family',
 'EmbarkEnc',
 'TitleEnc'],outputCol='features')

In [31]:
from pyspark.ml.classification import LogisticRegression

In [32]:
from pyspark.ml import Pipeline

In [33]:
log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')

In [34]:
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           title_indexer, title_encoder,
                           pclass_encoder,
                           assembler,log_reg_titanic])

In [35]:
fit_model = pipeline.fit(train)

In [36]:
results = fit_model.transform(test)

In [37]:
results.select('PassengerId','prediction').show()

+-----------+----------+
|PassengerId|prediction|
+-----------+----------+
|        892|       0.0|
|        893|       1.0|
|        894|       0.0|
|        895|       0.0|
|        896|       1.0|
|        897|       0.0|
|        898|       1.0|
|        899|       0.0|
|        900|       1.0|
|        901|       0.0|
|        902|       0.0|
|        903|       0.0|
|        904|       1.0|
|        905|       0.0|
|        906|       1.0|
|        907|       1.0|
|        908|       0.0|
|        909|       0.0|
|        910|       1.0|
|        911|       1.0|
+-----------+----------+
only showing top 20 rows

