LOGISTIC REGRESSION

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("LR").getOrCreate()

In [2]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [3]:
data = spark.read.csv("titanic.csv", inferSchema=True, header=True)

In [4]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [6]:
data = data.select('Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked')

In [7]:
data.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



In [8]:
data.head(1)

[Row(Survived=0, Pclass=3, Sex='male', Age=22.0, SibSp=1, Parch=0, Fare=7.25, Embarked='S')]

In [9]:
final_data = data.na.drop()

In [11]:
from pyspark.ml.feature import (VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder)

In [12]:
gender_indexer = StringIndexer(inputCol="Sex", outputCol="Sex_index")
gender_encoder = OneHotEncoder(inputCol="Sex_index", outputCol="SexVec")

In [13]:
embarked_indexer = StringIndexer(inputCol="Embarked", outputCol= "Embarked_index")
embarked_encoder = OneHotEncoder(inputCol="Embarked_index", outputCol = "EmbarkedVec")

In [14]:
assembler = VectorAssembler(inputCols=["Pclass", "SexVec", "EmbarkedVec",
                                      "Age", "SibSp", "Parch", "Fare"], outputCol="features")

In [15]:
from pyspark.ml import Pipeline

In [16]:
log_reg_titanic = LogisticRegression(featuresCol="features", labelCol="Survived")

In [17]:
pipeline = Pipeline(stages=[gender_indexer, embarked_indexer, 
                            gender_encoder, embarked_encoder, 
                            assembler, log_reg_titanic])

In [18]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [19]:
train_data.describe().show()

+-------+------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|summary|          Survived|            Pclass|   Sex|               Age|             SibSp|              Parch|             Fare|Embarked|
+-------+------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|  count|               490|               490|   490|               490|               490|                490|              490|     490|
|   mean|0.4142857142857143| 2.236734693877551|  null|29.733673469387757|0.4857142857142857|0.43673469387755104| 34.4830183673469|    null|
| stddev|0.4931016976786622|0.8369517868293843|  null| 14.87664410585465|0.8773449443230658| 0.8393916088614432|51.73987916949096|    null|
|    min|                 0|                 1|female|              0.42|                 0|                  0|              0.0|       C|
|    max|           

In [20]:
test_data.describe().show()

+-------+-------------------+------------------+------+------------------+------------------+-------------------+------------------+--------+
|summary|           Survived|            Pclass|   Sex|               Age|             SibSp|              Parch|              Fare|Embarked|
+-------+-------------------+------------------+------+------------------+------------------+-------------------+------------------+--------+
|  count|                222|               222|   222|               222|               222|                222|               222|     222|
|   mean|0.38288288288288286|2.2477477477477477|  null|29.439954954954956|0.5765765765765766|0.42342342342342343| 34.75317117117118|    null|
| stddev|0.48718860552432597|0.8384802937751147|  null|13.638194719301435|1.0381810542238348| 0.8878215705694013|55.612846511394196|    null|
|    min|                  0|                 1|female|              0.75|                 0|                  0|               0.0|       C|
|    m

In [21]:
Log_Model = pipeline.fit(train_data)

In [22]:
results = Log_Model.transform(test_data)

In [23]:
results.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Sex_index: double (nullable = false)
 |-- Embarked_index: double (nullable = false)
 |-- SexVec: vector (nullable = true)
 |-- EmbarkedVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [24]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [28]:
results.select("Survived", "prediction").show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [29]:
AUC = my_eval.evaluate(results)

In [30]:
AUC

0.787247745813654

In [31]:
evaluator = MulticlassClassificationEvaluator(labelCol='Survived', metricName='accuracy')

In [32]:
accuracy = evaluator.evaluate(results)

In [33]:
accuracy

0.8063063063063063

PREDICTIONS ON NEW DATA

In [None]:
full_model = 