# Pyspark Tutorial

## install packages

In [11]:
# !pip install scikit-learn
# !pip install pyspark
# !pip install wheel
# !pip install pandas

## Load data

In [86]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression

In [3]:
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [9]:
df_pyspark = spark.read.csv('data/titanic_train.csv')
df_pyspark.show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|        _c0|     _c1|   _c2|                 _c3|   _c4|_c5|  _c6|  _c7|             _c8|    _c9| _c10|    _c11|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

In [10]:
df_pyspark = spark.read.csv('data/titanic_train.csv',header=True, inferSchema=True)
df_pyspark.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [11]:
df_pyspark.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [12]:
df_pyspark.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [19]:
df_pyspark.dtypes

[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'double'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

## Preprocess

### Select column

In [27]:
df_pyspark.select('Name').show(3)

+--------------------+
|                Name|
+--------------------+
|Braund, Mr. Owen ...|
|Cumings, Mrs. Joh...|
|Heikkinen, Miss. ...|
+--------------------+
only showing top 3 rows



In [26]:
df_pyspark.select(['Name', 'Sex', 'age']).show(3)

+--------------------+------+----+
|                Name|   Sex| age|
+--------------------+------+----+
|Braund, Mr. Owen ...|  male|22.0|
|Cumings, Mrs. Joh...|female|38.0|
|Heikkinen, Miss. ...|female|26.0|
+--------------------+------+----+
only showing top 3 rows



### Add column

In [25]:
df_pyspark.select(['Name', 'Sex', 'age']).withColumn('Age_plus_2', df_pyspark['age']+2).show(3)

+--------------------+------+----+----------+
|                Name|   Sex| age|Age_plus_2|
+--------------------+------+----+----------+
|Braund, Mr. Owen ...|  male|22.0|      24.0|
|Cumings, Mrs. Joh...|female|38.0|      40.0|
|Heikkinen, Miss. ...|female|26.0|      28.0|
+--------------------+------+----+----------+
only showing top 3 rows



### Drop column

In [24]:
df_pyspark.select(['Name', 'Sex', 'age']).drop('age').show(3)

+--------------------+------+
|                Name|   Sex|
+--------------------+------+
|Braund, Mr. Owen ...|  male|
|Cumings, Mrs. Joh...|female|
|Heikkinen, Miss. ...|female|
+--------------------+------+
only showing top 3 rows



### Rename Column

In [23]:
df_pyspark.select(['Name', 'Sex', 'age']).withColumnRenamed('Name', 'NewName').show(3)

+--------------------+------+----+
|             NewName|   Sex| age|
+--------------------+------+----+
|Braund, Mr. Owen ...|  male|22.0|
|Cumings, Mrs. Joh...|female|38.0|
|Heikkinen, Miss. ...|female|26.0|
+--------------------+------+----+
only showing top 3 rows



### Drop na

In [33]:
df_pyspark.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [36]:
df_pyspark.na.drop(how='any').describe().show()

+-------+------------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+-----------------+-----------------+-----+--------+
|summary|       PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|           Ticket|             Fare|Cabin|Embarked|
+-------+------------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+-----------------+-----------------+-----+--------+
|  count|               183|                183|               183|                 183|   183|               183|               183|                183|              183|              183|  183|     183|
|   mean| 455.3661202185792| 0.6721311475409836|1.1912568306010929|                null|  null|  35.6744262295082|0.4644808743169399|0.47540983606557374|82214.70992366412|78.682468

## Feature engineer

### Fill na with specific value

In [45]:
df_pyspark.na.fill('Missing Values').show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+--------------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|         Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+--------------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25|Missing Values|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|           C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925|Missing Values|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1|          C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05|Missing Values|       S|
|       

### Fill na with statistic value

In [46]:
imputer = Imputer(inputCols=['Age', 'Fare'],
                 outputCols=['Age_imputed', 'Fare_imputed']).setStrategy('mean')
imputer.fit(df_pyspark).transform(df_pyspark).show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----------------+------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|      Age_imputed|Fare_imputed|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----------------+------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|             22.0|        7.25|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|             38.0|     71.2833|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|             26.0|       7.925|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|          

### Filter operation - single rule

In [49]:
df_pyspark.show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [50]:
df_pyspark.filter(df_pyspark['Age']>=30).show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|PC 17599|71.2833|  C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|  113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|  373450|   8.05| null|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|   17463|51.8625|  E46|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|  113783|  26.55| C103|       S|
|         14|       0|     3|Andersson, Mr. An...|  male|39.0|    1|    5|  347082| 31.275| null|       S|
|         16|       1|     2|Hewlett,

### Filter operation - and

In [54]:
df_pyspark.filter((df_pyspark['Age']>=30)&(df_pyspark['Pclass']=='1')).show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|PC 17599|71.2833|  C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|  113803|   53.1| C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|   17463|51.8625|  E46|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|  113783|  26.55| C103|       S|
|         31|       0|     1|Uruchurtu, Don. M...|  male|40.0|    0|    0|PC 17601|27.7208| null|       C|
|         36|       0|     1|Holverson, Mr. Al...|  male|42.0|    1|    0|  113789|   52.0| null|       S|
|         53|       1|     1|Harper, 

### Filter operation - or

In [57]:
df_pyspark.filter((df_pyspark['Age']<=10) | (df_pyspark['Age']>=50)).show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+-------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|       Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-------------+-------+-----+--------+
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|        17463|51.8625|  E46|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1|       349909| 21.075| null|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|      PP 9549|   16.7|   G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|       113783|  26.55| C103|       S|
|         16|       1|     2|Hewlett, Mrs. (Ma...|female|55.0|    0|    0|       248706|   16.0| null|       S|
|         17|       0|     3|Rice, Master. Eugene|  male| 2.0|    4|    1|       382652| 29.125| null|  

### Filter operation - not

In [58]:
df_pyspark.filter(~(df_pyspark['Age']>=30)).show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| null|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|          237736|30.0708| null|       C|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|      

### groupby - sum

In [60]:
df_pyspark.groupBy('Pclass').sum().show()

+------+----------------+-------------+-----------+--------+----------+----------+------------------+
|Pclass|sum(PassengerId)|sum(Survived)|sum(Pclass)|sum(Age)|sum(SibSp)|sum(Parch)|         sum(Fare)|
+------+----------------+-------------+-----------+--------+----------+----------+------------------+
|     1|           99705|          136|        216| 7111.42|        90|        77|18177.412499999984|
|     3|          215625|          119|       1473| 8924.92|       302|       193| 6714.695100000002|
|     2|           82056|           87|        368| 5168.83|        74|        70|3801.8416999999995|
+------+----------------+-------------+-----------+--------+----------+----------+------------------+



### groupBy - mean

In [61]:
df_pyspark.groupBy('Pclass').mean().show()

+------+------------------+-------------------+-----------+------------------+-------------------+-------------------+------------------+
|Pclass|  avg(PassengerId)|      avg(Survived)|avg(Pclass)|          avg(Age)|         avg(SibSp)|         avg(Parch)|         avg(Fare)|
+------+------------------+-------------------+-----------+------------------+-------------------+-------------------+------------------+
|     1|461.59722222222223| 0.6296296296296297|        1.0|38.233440860215055| 0.4166666666666667|0.35648148148148145| 84.15468749999992|
|     3| 439.1547861507128|0.24236252545824846|        3.0| 25.14061971830986|  0.615071283095723|0.39307535641547864|13.675550101832997|
|     2|445.95652173913044|0.47282608695652173|        2.0| 29.87763005780347|0.40217391304347827| 0.3804347826086957| 20.66218315217391|
+------+------------------+-------------------+-----------+------------------+-------------------+-------------------+------------------+



### groupBy - count

In [62]:
df_pyspark.groupBy('Pclass').count().show()

+------+-----+
|Pclass|count|
+------+-----+
|     1|  216|
|     3|  491|
|     2|  184|
+------+-----+



### agg

In [63]:
df_pyspark.agg({'Pclass':'sum'}).show()

+-----------+
|sum(Pclass)|
+-----------+
|       2057|
+-----------+



In [64]:
df_pyspark.groupBy('Pclass').agg({'age':'mean'}).show()

+------+------------------+
|Pclass|          avg(age)|
+------+------------------+
|     1|38.233440860215055|
|     3| 25.14061971830986|
|     2| 29.87763005780347|
+------+------------------+



## Modeling

### SrtingIndexer

In [79]:
index = StringIndexer(inputCol='Sex', outputCol='Sex_index')
df = index.fit(df_pyspark)
df= df.transform(df_pyspark)
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Sex_index|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|      0.0|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|      1.0|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|      1.0|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|      1.0|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|      0.0|


In [80]:
imputer = Imputer(inputCols=['Age'],
                 outputCols=['Age_imputed']).setStrategy('median')
df = imputer.fit(df).transform(df)

In [82]:
df.select(['Pclass', 'Sex_index', 'Age_imputed']).describe().show()

+-------+------------------+-------------------+------------------+
|summary|            Pclass|          Sex_index|       Age_imputed|
+-------+------------------+-------------------+------------------+
|  count|               891|                891|               891|
|   mean| 2.308641975308642|0.35241301907968575| 29.36158249158249|
| stddev|0.8360712409770491| 0.4779900708960982|13.019696550973201|
|    min|                 1|                0.0|              0.42|
|    max|                 3|                1.0|              80.0|
+-------+------------------+-------------------+------------------+



In [85]:
featureassembler = VectorAssembler(inputCols=['Pclass', 'Sex_index', 'Age_imputed'], outputCol='Independent Features')
output = featureassembler.transform(df)
output.select(['Pclass', 'Sex_index', 'Age_imputed', 'Independent Features']).show(10)

+------+---------+-----------+--------------------+
|Pclass|Sex_index|Age_imputed|Independent Features|
+------+---------+-----------+--------------------+
|     3|      0.0|       22.0|      [3.0,0.0,22.0]|
|     1|      1.0|       38.0|      [1.0,1.0,38.0]|
|     3|      1.0|       26.0|      [3.0,1.0,26.0]|
|     1|      1.0|       35.0|      [1.0,1.0,35.0]|
|     3|      0.0|       35.0|      [3.0,0.0,35.0]|
|     3|      0.0|       28.0|      [3.0,0.0,28.0]|
|     1|      0.0|       54.0|      [1.0,0.0,54.0]|
|     3|      0.0|        2.0|       [3.0,0.0,2.0]|
|     3|      1.0|       27.0|      [3.0,1.0,27.0]|
|     2|      1.0|       14.0|      [2.0,1.0,14.0]|
+------+---------+-----------+--------------------+
only showing top 10 rows



In [87]:
finalized_data = output.select(['Independent Features', 'Survived'])
finalized_data.show()

+--------------------+--------+
|Independent Features|Survived|
+--------------------+--------+
|      [3.0,0.0,22.0]|       0|
|      [1.0,1.0,38.0]|       1|
|      [3.0,1.0,26.0]|       1|
|      [1.0,1.0,35.0]|       1|
|      [3.0,0.0,35.0]|       0|
|      [3.0,0.0,28.0]|       0|
|      [1.0,0.0,54.0]|       0|
|       [3.0,0.0,2.0]|       0|
|      [3.0,1.0,27.0]|       1|
|      [2.0,1.0,14.0]|       1|
|       [3.0,1.0,4.0]|       1|
|      [1.0,1.0,58.0]|       1|
|      [3.0,0.0,20.0]|       0|
|      [3.0,0.0,39.0]|       0|
|      [3.0,1.0,14.0]|       0|
|      [2.0,1.0,55.0]|       1|
|       [3.0,0.0,2.0]|       0|
|      [2.0,0.0,28.0]|       1|
|      [3.0,1.0,31.0]|       0|
|      [3.0,1.0,28.0]|       1|
+--------------------+--------+
only showing top 20 rows



In [90]:
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])

In [91]:
clf = LogisticRegression(featuresCol='Independent Features', labelCol='Survived')
clf = clf.fit(train_data)

21/11/20 15:25:48 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
21/11/20 15:25:48 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


## Evaluation

In [92]:
clf.coefficients

DenseVector([-1.0426, 2.5636, -0.0352])

In [94]:
clf.intercept

1.9526384063100308

In [95]:
pred_result = clf.evaluate(test_data)
pred_result.predictions.show()

+--------------------+--------+--------------------+--------------------+----------+
|Independent Features|Survived|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|      [1.0,0.0,0.92]|       1|[-0.8775890198384...|[0.29367764224280...|       1.0|
|       [1.0,0.0,4.0]|       1|[-0.7690429538267...|[0.31668617092254...|       1.0|
|      [1.0,0.0,19.0]|       0|[-0.2404095154583...|[0.44018543455482...|       1.0|
|      [1.0,0.0,21.0]|       0|[-0.1699250570092...|[0.45762066031601...|       1.0|
|      [1.0,0.0,22.0]|       0|[-0.1346828277846...|[0.46638009827952...|       1.0|
|      [1.0,0.0,23.0]|       1|[-0.0994405985600...|[0.47516031578271...|       1.0|
|      [1.0,0.0,27.0]|       1|[0.04152831833815...|[0.51038058776288...|       0.0|
|      [1.0,0.0,28.0]|       0|[0.07677054756271...|[0.51918321611216...|       0.0|
|      [1.0,0.0,28.0]|       0|[0.07677054756271...|[0.5191832161



In [96]:
pred_result.accuracy

0.7830188679245284

In [97]:
pred_result.areaUnderROC

0.8637287151702787

In [99]:
pred_result.predictions.toPandas()



Unnamed: 0,Independent Features,Survived,rawPrediction,probability,prediction
0,"[1.0, 0.0, 0.92]",1,"[-0.8775890198384431, 0.8775890198384431]","[0.29367764224280035, 0.7063223577571996]",1.0
1,"[1.0, 0.0, 4.0]",1,"[-0.76904295382679, 0.76904295382679]","[0.31668617092254325, 0.6833138290774567]",1.0
2,"[1.0, 0.0, 19.0]",0,"[-0.24040951545835054, 0.24040951545835054]","[0.44018543455482273, 0.5598145654451773]",1.0
3,"[1.0, 0.0, 21.0]",0,"[-0.16992505700922522, 0.16992505700922522]","[0.45762066031601295, 0.5423793396839871]",1.0
4,"[1.0, 0.0, 22.0]",0,"[-0.13468282778466256, 0.13468282778466256]","[0.46638009827952354, 0.5336199017204765]",1.0
...,...,...,...,...,...
207,"[3.0, 1.0, 31.0]",1,"[-0.29584597611131147, 0.29584597611131147]","[0.4265732815155871, 0.5734267184844128]",1.0
208,"[3.0, 1.0, 32.0]",0,"[-0.2606037468867488, 0.2606037468867488]","[0.4352152996353439, 0.5647847003646561]",1.0
209,"[3.0, 1.0, 33.0]",1,"[-0.22536151766218615, 0.22536151766218615]","[0.44389686612987, 0.55610313387013]",1.0
210,"[3.0, 1.0, 39.0]",0,"[-0.013908142314810412, 0.013908142314810412]","[0.4965230204689902, 0.5034769795310098]",1.0
