## 1. Import and get spark instance

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (VectorAssembler,
                                VectorIndexer,
                                OneHotEncoder,
                                StringIndexer)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder.appName('log_reg').getOrCreate()

## 2. Explore input data

In [3]:
df = spark.read.csv('titanic.csv', inferSchema=True, header=True)
df.show(3)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 3 rows



In [4]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
print(f'shape: rows={df.count()}, cols={len(df.columns)}')

shape: rows=891, cols=12


In [6]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

#### Select columns to be used (Purpose: to predict "Survived" based on other columns)

In [7]:
cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

#### Get data of selected columns

In [8]:
df_selected = df.select(cols)
df_selected.show(3)

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
+--------+------+------+----+-----+-----+-------+--------+
only showing top 3 rows



#### Drop missing data

In [9]:
df_selected = df_selected.na.drop()
print(f'shape: rows={df_selected.count()}, cols={len(df_selected.columns)}')

shape: rows=712, cols=8


## 3. Transform categorical data to numerical data

#### Sex column: One hot encoding

In [10]:
# Change list of strings to its index
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
# Change to one hot encoding
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

#### Embarked column: One hot encoding

In [11]:
embarked_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embarked_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')

## 4. Make features column and a Pipeline

In [12]:
assembler = VectorAssembler(
    inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec'],
    outputCol='features'
)

In [13]:
log_reg = LogisticRegression(featuresCol='features',
                             labelCol='Survived')

#### Pipeline (consists of a sequence of stages and can treated it as a normal model)

In [14]:
pipeline = Pipeline(stages=[gender_indexer, embarked_indexer,
                            gender_encoder, embarked_encoder,
                            assembler,
                            log_reg])

## 5. Train, test split and fit  model

In [15]:
train_data, test_data = df_selected.randomSplit([0.8, 0.2]) # 80% and 20%

In [16]:
%%time
fit_model = pipeline.fit(train_data)

CPU times: user 46.9 ms, sys: 1.82 ms, total: 48.7 ms
Wall time: 8.6 s


In [17]:
results = fit_model.transform(dataset=test_data)
results.select(['Survived', 'prediction']).show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [18]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [19]:
AUC = my_eval.evaluate(results)
print(f'AUC = {AUC}')

AUC = 0.8148558758314857
