In [0]:
from pyspark.sql import SparkSession

In [0]:
# Reuse the active Spark session if there is one; otherwise create one named 'Project'.
session_builder = SparkSession.builder.appName('Project')
spark = session_builder.getOrCreate()

In [0]:
# Load the churn CSV: the first row supplies column names and column types are inferred.
churn_csv_path = '/FileStore/tables/customer_churn.csv'
df = spark.read.csv(churn_csv_path, header=True, inferSchema=True)

In [0]:
# Print the column names and inferred types of the loaded churn DataFrame.
df.printSchema()

In [0]:
# Print every field of the first row, one value per line, as a quick sanity check.
first_row = df.head(1)[0]
for field_value in first_row:
    print(field_value)

In [0]:
# Show summary statistics for the DataFrame's columns.
df.summary().show()

In [0]:
from pyspark.sql import functions as F

from pyspark.sql.functions import isnan, when, count, col

# Count missing values per column.
# - Nulls are counted for every column.
# - NaN is only checked on float/double columns: isnan() is defined for floating-point
#   input and fails analysis on other types such as timestamps.
# - The when() branch yields a literal 1 rather than the column itself, because
#   count() skips nulls and would otherwise never count a null row.
float_columns = {name for name, dtype in df.dtypes if dtype in ('double', 'float')}

df.select([
    count(when(col(c).isNull() | isnan(c), F.lit(1))).alias(c) if c in float_columns
    else count(when(col(c).isNull(), F.lit(1))).alias(c)
    for c in df.columns
]).show()

In [0]:
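# Disabled template, kept for reference: fills each row of a chosen column with the
# most recent non-null value using a running window. Replace the placeholder column
# name before enabling.
# from pyspark.sql import functions as F
# from pyspark.sql import Window
# import sys

# w = Window.partitionBy(F.lit(1)).orderBy(F.lit(1)).rowsBetween(-sys.maxsize, 0)

# df.select("*",F.last('INSERT COLUMN NAME HERE',True).over(w).alias('newcol')).show()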

In [0]:
# List the column names (displayed as the cell output).
df.columns

In [0]:
# Derive numeric calendar features from Onboard_date. Each extractor adds one column
# named '<part>_Onboard_date', appended in the order listed here.
date_part_extractors = [
    ('year', F.year),
    ('month', F.month),
    ('dayofweek', F.dayofweek),
    ('dayofmonth', F.dayofmonth),
]

df_staging = df
for part_name, extract_part in date_part_extractors:
    df_staging = df_staging.withColumn(part_name + '_Onboard_date',
                                       extract_part('Onboard_date'))


In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [0]:
# Map each Company string to a numeric index.
# handleInvalid='keep' sends company names that were not present when the indexer was
# fitted (e.g. ones that only occur in the test split or in new data) to an extra
# "unknown" index; the default ('error') would make transform fail on such rows.
company_indexer = StringIndexer(inputCol='Company',outputCol='Company_index',handleInvalid='keep')

In [0]:
# One-hot encode the numeric company index into a vector column.
company_encoder = OneHotEncoder(
    inputCol='Company_index',
    outputCol='Company_vector',
)

In [0]:
# Columns combined into the single 'features' vector consumed by the classifier:
# raw numeric attributes, the derived onboarding-date parts, and the encoded company.
feature_columns = [
    'Age',
    'Total_Purchase',
    'Account_Manager',
    'Years',
    'Num_Sites',
    'year_Onboard_date',
    'month_Onboard_date',
    'dayofweek_Onboard_date',
    'dayofmonth_Onboard_date',
    'Company_vector',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [0]:
# Logistic-regression classifier: reads the assembled 'features' vector and
# learns to predict the 'Churn' label.
logreg_churn = LogisticRegression(
    featuresCol='features',
    labelCol='Churn',
)

In [0]:
# Stages run in order: index Company -> one-hot encode -> assemble features -> classify.
pipeline_stages = [
    company_indexer,
    company_encoder,
    assembler,
    logreg_churn,
]
pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# 70/30 train/test split. A fixed seed keeps the split, and therefore the fitted
# model and its evaluation metrics, the same from one run of the notebook to the next.
train_data, test_data = df_staging.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Fit every pipeline stage (indexer, encoder, assembler, logistic regression) on the training split.
fit_model = pipeline.fit(train_data)

In [0]:
# The last pipeline stage is the fitted logistic-regression model; its training
# summary carries the predictions made on the training data.
fit_model_analysis = fit_model.stages[-1]
fit_model_summary = fit_model_analysis.summary

# Describe the training-set predictions (count, mean, stddev, min, max).
training_predictions = fit_model_summary.predictions
training_predictions.describe().show()

In [0]:
# Apply the fitted pipeline to the held-out test split to obtain its predictions.
predictions_test = fit_model.transform(test_data)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Evaluator for predictions on the training data (by its name; not used elsewhere in view).
# Area under ROC needs a continuous score to rank rows, so the evaluator reads the
# model's 'rawPrediction' column. Feeding it the hard 0/1 'prediction' column collapses
# the ROC curve to a single threshold point and misstates the AUC.
eval_train = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='Churn')

In [0]:
# Evaluator used to score the held-out test predictions.
# Area under ROC needs a continuous score to rank rows, so the evaluator reads the
# model's 'rawPrediction' column. Feeding it the hard 0/1 'prediction' column collapses
# the ROC curve to a single threshold point and misstates the AUC.
eval_test = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='Churn')

In [0]:
# Score the test-split predictions with eval_test (areaUnderROC is the evaluator's default metric).
AUC = eval_test.evaluate(predictions_test)

In [0]:
# Display the test-set metric stored in AUC.
AUC

In [0]:
# Refit the same pipeline on the full staged dataset (train and test rows together).
logreg_model_final = pipeline.fit(df_staging)

In [0]:
# Load the customers to score.
# The file is read with the same options as the training CSV, so the header row becomes
# the column names (e.g. 'Company') and numeric columns get numeric types instead of
# the default untyped '_c0', '_c1', ... string columns.
# The onboarding-date parts are derived here as well, because the feature assembler
# reads the '*_Onboard_date' columns and they are not in the raw file.
# NOTE(review): assumes new_customers.csv has a header row and an Onboard_date column
# like customer_churn.csv — confirm.
new_customers = (spark.read.csv('/FileStore/tables/new_customers.csv',
                                inferSchema=True, header=True)
                 .withColumn('year_Onboard_date', F.year('Onboard_date'))
                 .withColumn('month_Onboard_date', F.month('Onboard_date'))
                 .withColumn('dayofweek_Onboard_date', F.dayofweek('Onboard_date'))
                 .withColumn('dayofmonth_Onboard_date', F.dayofmonth('Onboard_date'))
                )

In [0]:
# Print the schema of the new-customer data to check its column names and types.
new_customers.printSchema()

In [0]:
# Score the new customers with the pipeline that was fitted on the full dataset.
# NOTE(review): requires new_customers to contain every column the pipeline reads
# (Company plus all assembler inputs, including the *_Onboard_date columns) — confirm.
new_customers_model = logreg_model_final.transform(new_customers)

In [0]:
# Show each new customer's company next to the model's predicted label.
new_customers_model.select('Company','prediction').show()