## MLlib PySpark Logistic Regression Titanic Dataset

In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('Survived').getOrCreate()

In [0]:
df = spark.read.csv('dbfs:/FileStore/titanic.csv', inferSchema=True, header=True)

In [0]:
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [0]:
print((df.count(), len(df.columns)))

(891, 12)


In [0]:
from pyspark.sql.functions import col, sum as _sum

In [0]:
# check missing values
missing_val = df.select([_sum(col(c).isNull().cast('int')).alias(c) for c in df.columns])
missing_val.show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [0]:
# check duplicated
duplicate = df.exceptAll(df.dropDuplicates())
duplicate.show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [0]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
my_cols = df.select('Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked')

In [0]:
# drop missing values
final_data = my_cols.na.drop()

In [0]:
# check missing values
final_missing = final_data.select([_sum(col(c).isNull().cast('int')).alias(c) for c in final_data.columns])
final_missing.show()

+--------+------+---+---+-----+-----+----+--------+
|Survived|Pclass|Sex|Age|SibSp|Parch|Fare|Embarked|
+--------+------+---+---+-----+-----+----+--------+
|       0|     0|  0|  0|    0|    0|   0|       0|
+--------+------+---+---+-----+-----+----+--------+



In [0]:
final_data.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       S|
|       0|     3|female|14.0|    0|    0| 7.8542|       

## Handling Categorical columns

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

In [0]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='Sex_index')
gender_encoder = OneHotEncoder(inputCol='Sex_index', outputCol='SexVec')

In [0]:
embarked_indexer = StringIndexer(inputCol='Embarked',outputCol='Embarked_index')
embarked_encoder = OneHotEncoder(inputCol='Embarked_index', outputCol='EmbarkedVec')

In [0]:
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch',
                                        'Fare', 'EmbarkedVec'], 
                            outputCol='features')

In [0]:
scaler = StandardScaler(inputCol='features', outputCol='scalerfeatures', withStd=True, withMean=False)

## Create Model

In [0]:
from pyspark.ml.classification import LogisticRegression

## Pipeline

In [0]:
from pyspark.ml import Pipeline

In [0]:
log_reg_Model = LogisticRegression(featuresCol='scalerfeatures', labelCol='Survived')

In [0]:
pipe = Pipeline(stages=[gender_indexer, gender_encoder, embarked_indexer, embarked_encoder, assembler, scaler, log_reg_Model])

## Train Test Split

In [0]:
train, test = final_data.randomSplit([0.7, 0.3])

In [0]:
model = pipe.fit(train)

In [0]:
predictions = model.transform(test)

## Evaluation

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [0]:
predictions.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [0]:
AUC = eval.evaluate(predictions)

In [0]:
print('AUC  : ' +str(AUC))

AUC  : 0.8169746919746919


## Good job..!!