In [0]:
from pyspark.sql import SparkSession

In [0]:
# Get the active Spark session, or start one named 'logreg' if none exists yet.
spark = (
    SparkSession.builder
    .appName('logreg')
    .getOrCreate()
)

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
df = spark.read.format('libsvm').load('/FileStore/tables/sample_libsvm_data.txt')

In [0]:
df.show()

In [0]:
# Logistic-regression estimator, constructed with no arguments (library defaults).
logreg_model = LogisticRegression()

In [0]:
# Fit on the complete DataFrame; there is no hold-out set at this point
# (a train/test split is made in a later cell).
logreg_fit = logreg_model.fit(df)

In [0]:
# Summary object exposed by the fitted model; its `predictions` attribute is
# inspected in the following cells.
log_summary = logreg_fit.summary

In [0]:
log_summary.predictions.printSchema()

In [0]:
llog_summary.predictions.show()

In [0]:
log_train, log_test = df.randomSplit([0.7,0.3])

In [0]:
final_model = LogisticRegression()

In [0]:
fit_final = final_model.fit(log_train)

In [0]:
preds_and_labels = fit_final.evaluate(log_test)

In [0]:
preds_and_labels.predictions.show()

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [0]:
eval = BinaryClassificationEvaluator()

In [0]:
final_roc = eval.evaluate(preds_and_labels.predictions)

In [0]:
final_roc

# 43. Logistic Regression Code Along

In [0]:
# Build from the SparkSession class (the documented entry point) rather than
# through the existing `spark` variable, so this cell does not rely on `spark`
# already being defined by an earlier cell.
# NOTE(review): when a session is already active, getOrCreate() hands that one
# back, so the new app name may not take effect -- confirm if the name matters.
spark = SparkSession.builder.appName('logreg_titanic').getOrCreate()

In [0]:
df = spark.read.csv('/FileStore/tables/titanic.csv',inferSchema=True,header=True)

In [0]:
df.printSchema()

In [0]:
df.columns

In [0]:
features_cols = df.select([ 'Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
                 'Embarked'])

In [0]:
df_final = features_cols.na.drop()

In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [0]:
gender_indexer = StringIndexer(inputCol='Sex',outputCol='Sex_index')

In [0]:
gender_encoder = OneHotEncoder(inputCol='Sex_index',outputCol='Sex_vector')

In [0]:
embarked_indexer = StringIndexer(inputCol='Embarked',outputCol='Embarked_index')
embarked_encoder = OneHotEncoder(inputCol='Embarked_index',outputCol='Embarked_vector')

In [0]:
# Combine the numeric columns and the two encoded vectors into a single
# 'features' column, which is the input the classifier reads.
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'Sex_vector',
        'Embarked_vector',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
    ],
    outputCol='features',
)

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
from pyspark.ml import Pipeline

In [0]:
logreg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')

In [0]:
pipeline = Pipeline(stages = [gender_indexer
                             ,embarked_indexer
                             ,gender_encoder
                             ,embarked_encoder
                             ,assembler
                             ,logreg_titanic
                             ])

In [0]:
train_data, test_data = df_final.randomSplit([0.7, 0.3])

In [0]:
fit_model = pipeline.fit(train_data)

In [0]:
results = fit_model.transform(test_data)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')

In [0]:
results.select('Survived','prediction').show()

In [0]:
AUC = eval.evaluate(results)

In [0]:
AUC