## Purpose of script:
#### Reviewing Spark's logistic regression implementation
#### Referencing Jose Portilla's "Spark and Python for Big Data with PySpark" course

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name 'log_reg'.
spark = (
    SparkSession.builder
    .appName('log_reg')
    .getOrCreate()
)

In [8]:
# Read the sample dataset with Spark's 'libsvm' data source.
# The path is relative to the notebook's working directory.
df = (
    spark.read
    .format('libsvm')
    .load('../Datasets/sample_libsvm_data.txt')
)

In [11]:
# Hold out roughly 30% of the rows for testing. The weights are targets, not
# exact proportions, so the two counts printed below are approximate.
# randomSplit samples randomly: pin the seed so that a fresh
# "Restart Kernel -> Run All" reproduces the same split, and therefore the
# same counts and the same evaluation metric further down.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

print(train_data.count())
print(test_data.count())

68
32


In [4]:
# Untrained logistic-regression estimator; no arguments are passed, so every
# setting is left at the library default.
log_reg = LogisticRegression()

In [12]:
# Fit on the training split only; test_data is kept aside for evaluation.
log_reg_model = log_reg.fit(train_data)

In [13]:
# Summary object attached to the fitted model; its `predictions` DataFrame is
# inspected in the next cell.
log_reg_summary = log_reg_model.summary

In [14]:
# Show which columns the summary's predictions DataFrame carries.
log_reg_summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [15]:
# Score the held-out test split with the fitted model.
# NOTE: despite its name, `pred_and_labels` is not itself a DataFrame of
# predictions and labels -- the scored rows live in its `.predictions`
# attribute, which is what the following cells read.
pred_and_labels = log_reg_model.evaluate(test_data)

In [17]:
# Preview the first 5 scored test rows.
pred_and_labels.predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[121,122,123...|[23.6245110333501...|[0.99999999994504...|       0.0|
|  0.0|(692,[123,124,125...|[28.0540784022597...|[0.99999999999934...|       0.0|
|  0.0|(692,[126,127,128...|[25.9871574643765...|[0.99999999999482...|       0.0|
|  0.0|(692,[152,153,154...|[9.00906597634986...|[0.99987771892441...|       0.0|
|  0.0|(692,[152,153,154...|[11.5399036668907...|[0.99999026627015...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [19]:
# Binary-classification evaluator. The metric is spelled out (it is also the
# library default) so the reader knows what number `evaluate` will return.
my_eval = BinaryClassificationEvaluator(metricName='areaUnderROC')

In [21]:
# Apply the evaluator to the scored test rows; the result is a single number.
my_roc = my_eval.evaluate(pred_and_labels.predictions)

In [22]:
# my_roc is the area under the ROC curve on the test split (the evaluator's
# default metric). 1.0 means every positive row was ranked above every
# negative row, i.e. the classes are perfectly separated on this data.
# NOTE(review): the test split is only a few dozen rows of a sample dataset,
# so a perfect score here says little about how the model would generalise.
my_roc

1.0