In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName('logregconsult').getOrCreate()

In [None]:
data = spark.read.csv('customer_churn.csv', inferSchema = True, header = True)

In [None]:
data.printSchema()

In [None]:
data.describe().show()

In [None]:
data.columns

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
assembler = VectorAssembler(inputCols = ['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'], outputCol = 'features')

In [None]:
output = assembler.transform(data)

In [None]:
final_data = output.select('features', 'churn')

In [None]:
train_churn, test_churn = final_data.randomSplit([0.7, 0.3])

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
lr_churn = LogisticRegression(labelCol = 'churn')

In [None]:
fitted_churn_model = lr_churn.fit(train_churn)

In [None]:
training_sum = fitted_churn_model.summary

In [None]:
training_sum.predictions.describe().show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
pred_and_labels = fitted_churn_model.evaluate(test_churn)

In [None]:
pred_and_labels.predictions.show()

In [None]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'churn')

In [None]:
auc = churn_eval.evaluate(pred_and_labels.prediction)
auc

In [None]:
### Predict on new data: refit on all labelled data, then score new_customers.csv

In [None]:
final_lr_model = lr_churn.fit(final_data)

In [None]:
new_customers = spark.read.csv('new_customers.csv', inferSchema = True, header = True)

In [None]:
new_customers.printSchema()

In [None]:
test_new_customers = assembler.transform(new_customers)

In [None]:
test_new_customers.printSchema()

In [None]:
final_results = final_lr_model.transform(new_customers)
final_results

In [None]:
final_results.select('Company', 'prediction').show()

In [None]:
test_new_customers.describe().show()