In [1]:
import findspark

In [2]:
import os

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook runs on machines other than the author's; when it is not set, fall back
# to the original local installation path.
findspark.init(os.environ.get('SPARK_HOME', '/home/sushant/spark-2.1.0-bin-hadoop2.7'))

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) the Spark session that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName('logReg')
    .getOrCreate()
)

In [5]:
from pyspark.ml.classification import LogisticRegression

In [6]:
# Read the libsvm-formatted sample file into a DataFrame.
data = (
    spark.read
    .format('libsvm')
    .load('./Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/sample_libsvm_data.txt')
)

In [9]:
# Peek at the first five rows to confirm the file parsed into label/features columns.
data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
+-----+--------------------+
only showing top 5 rows



In [10]:
# Create a logistic regression estimator with default settings.
# Despite the variable name, nothing is trained until .fit() is called.
logRegModel = LogisticRegression()

In [11]:
# Train on the full dataset (no hold-out yet); the result is the fitted model.
fittedLogRegModel = logRegModel.fit(data)

In [12]:
# Grab the summary object attached to the fitted model; it exposes the
# predictions DataFrame inspected in the next cells.
logRegModelSummary = fittedLogRegModel.summary

In [17]:
# Inspect the first ten rows of the summary's predictions DataFrame.
logRegModelSummary.predictions.show(10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[19.8534775947478...|[0.99999999761359...|       0.0|
|  1.0|(692,[158,159,160...|[-20.377398194908...|[1.41321555111056...|       1.0|
|  1.0|(692,[124,125,126...|[-27.401459284891...|[1.25804865126979...|       1.0|
|  1.0|(692,[152,153,154...|[-18.862741612668...|[6.42710509170303...|       1.0|
|  1.0|(692,[151,152,153...|[-20.483011833009...|[1.27157209200604...|       1.0|
|  0.0|(692,[129,130,131...|[19.8506078990277...|[0.99999999760673...|       0.0|
|  1.0|(692,[158,159,160...|[-20.337256674833...|[1.47109814695581...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.595579753418...|[3.08850168102631...|       1.0|
|  0.0|(692,[154,155,156...|[19.2708803215613...|[0.99999999572670...|       0.0|
|  0.0|(692,[127

In [20]:
# Print the column names and types of the predictions DataFrame.
logRegModelSummary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [21]:
# Hold out roughly 30% of the rows for evaluation. A fixed seed keeps the split,
# and every metric computed from it, identical across "Restart & Run All" runs.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [22]:
# A fresh, untrained estimator for the train/test experiment.
model = LogisticRegression()

In [24]:
# Fit on the training split only, so the test split stays unseen.
fitModel = model.fit(train)

In [25]:
# Evaluate the fitted model on the held-out split.
# NOTE: despite its name, this variable holds a summary object; the actual
# predictions DataFrame is reached through its `.predictions` attribute.
predictionAndLabels = fitModel.evaluate(test)

In [28]:
# Look at the first ten predictions made on the held-out split.
predictionAndLabels.predictions.show(10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[124,125,126...|[31.0332571034844...|[0.99999999999996...|       0.0|
|  0.0|(692,[126,127,128...|[27.0510219452602...|[0.99999999999821...|       0.0|
|  0.0|(692,[129,130,131...|[15.7084490667890...|[0.99999984937148...|       0.0|
|  0.0|(692,[150,151,152...|[21.4585381998360...|[0.99999999952062...|       0.0|
|  0.0|(692,[152,153,154...|[14.4682882682260...|[0.99999947940257...|       0.0|
|  0.0|(692,[153,154,155...|[12.6199421180775...|[0.99999669457474...|       0.0|
|  0.0|(692,[234,235,237...|[6.46910976510908...|[0.99845179513320...|       0.0|
|  1.0|(692,[97,98,99,12...|[-18.236290579108...|[1.20248488445532...|       1.0|
|  1.0|(692,[99,100,101,...|[-8.5636914948915...|[1.90876795096115...|       1.0|
|  1.0|(692,[123

### Introducing evaluators

In [29]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [30]:
# Evaluator with default settings; it scores a predictions DataFrame with a single number.
testEval = BinaryClassificationEvaluator()

By default, BinaryClassificationEvaluator() outputs the area under the ROC curve. The MulticlassClassificationEvaluator can output other metrics such as F1 score, weighted precision, weighted recall, accuracy, etc.

In [34]:
# Score the held-out predictions with the evaluator's default metric.
testEval.evaluate(predictionAndLabels.predictions)
# A perfect score of 1.0 on this sample dataset is not what real-world data would give.

1.0

### Titanic time

In [36]:
# Load the Titanic passenger CSV: the first row supplies the column names and
# column types are inferred from the values.
titanicData = spark.read.csv(
    './Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/titanic.csv',
    header=True,
    inferSchema=True,
)

In [37]:
# Preview the first four passengers.
titanicData.show(4)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 4 rows



In [38]:
# Check the column types that were inferred while reading the CSV.
titanicData.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [39]:
# List the column names to choose which ones to keep.
titanicData.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [40]:
# Keep the label (Survived) plus the columns used as model inputs;
# PassengerId, Name, Ticket and Cabin are left out.
titanicSelectCols = titanicData.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
])

In [41]:
# Summary statistics per column; the count row reveals which columns contain nulls.
titanicSelectCols.describe().show()

+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|summary|           Survived|            Pclass|   Sex|               Age|             SibSp|              Parch|             Fare|Embarked|
+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|  count|                891|               891|   891|               714|               891|                891|              891|     889|
|   mean| 0.3838383838383838| 2.308641975308642|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824| 32.2042079685746|    null|
| stddev|0.48659245426485753|0.8360712409770491|  null|14.526497332334035|1.1027434322934315| 0.8060572211299488|49.69342859718089|    null|
|    min|                  0|                 1|female|              0.42|                 0|                  0|              0.0|       C|
|    max|    

For now, let's drop the null values. 

In [42]:
# Drop rows that contain nulls. Per the describe() output above, Age has 714
# non-null values and Embarked has 889 out of 891 rows, so rows are lost here.
titanicSelectnonNull = titanicSelectCols.na.drop()

In [44]:
from pyspark.ml.feature import (VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder)

In [45]:
# Convert the string-valued Sex column into a numeric index column, SexIndex.
genderIndexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')

After StringIndexer, we need to one-hot encode the indexed column.

Say we have three categories: A, B, C.

Say using StringIndexer: A -> 0, B -> 1, C -> 2. 

One hot encoder converts each example into arrays of 0 and 1.

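For instance, if our example is B (index 1), then one-hot encoding conceptually turns it into [0, 1, 0]. (Spark's OneHotEncoder drops the last category by default, so its actual output vector is one element shorter.)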

In [47]:
# One-hot encode the numeric gender index into the SexVec vector column.
genderEncoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
)

In [48]:
# Map each embarkation port string to a numeric index, then one-hot encode that
# index so the model does not read an ordering into the port codes.
embarkIndexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
# This must be a OneHotEncoder (mirroring genderEncoder): a second StringIndexer
# would only re-index the numeric column instead of producing a vector.
embarkEncoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

In [49]:
# Combine the encoded categorical vectors and the numeric columns into a single
# 'features' vector column for the classifier.
# Input names must match the DataFrame schema exactly: the passenger-class
# column is 'Pclass' (not 'PClass').
assembler = VectorAssembler(
    inputCols=['SexVec', 'EmbarkVec', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
    outputCol='features',
)