In [29]:
from pyspark.sql import SparkSession, types
spark = SparkSession.builder.appName('logreconsult').getOrCreate()

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [41]:
schema_df = types.StructType([types.StructField('Names', types.StringType()),
                              types.StructField('Age', types.DoubleType()),
                              types.StructField('Total_Purchase', types.DoubleType()),
                              types.StructField('Account_Manager', types.IntegerType()), 
                              types.StructField('Years', types.DoubleType()),
                              types.StructField('Num_Sites', types.DoubleType()),
                              types.StructField('Onboard_date', types.TimestampType()),
                              types.StructField('Location', types.StringType()),
                              types.StructField('Company', types.StringType()),
                              types.StructField('Churn', types.IntegerType())])

df = spark.read.csv('../Arquivos/customer_churn.csv', schema_df, header=True)
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [18]:
df.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

In [15]:
df.toPandas().describe()

Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
count,900.0,900.0,900.0,900.0,900.0,900.0
mean,41.816667,10062.824033,0.481111,5.273156,8.587778,0.166667
std,6.12756,2408.644532,0.499921,1.274449,1.764836,0.372885
min,22.0,100.0,0.0,1.0,3.0,0.0
25%,38.0,8497.1225,0.0,4.45,7.0,0.0
50%,42.0,10045.87,0.0,5.215,8.0,0.0
75%,46.0,11760.105,1.0,6.11,10.0,0.0
max,65.0,18026.01,1.0,9.15,14.0,1.0


In [16]:
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [19]:
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
                            outputCol='features')

In [20]:
output = assembler.transform(df)

In [21]:
df = output.select('features', 'churn')

In [22]:
train_churn, test_churn = df.randomSplit([0.7, 0.3])

In [24]:
lr_churn = LogisticRegression(labelCol='churn')

In [25]:
fitted_churn_model = lr_churn.fit(train_churn)

In [26]:
training_sum = fitted_churn_model.summary

In [28]:
training_sum.predictions.describe().show();

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                627|                627|
|   mean|0.16267942583732056|0.11961722488038277|
| stddev| 0.3693676025615052|0.32477248889504956|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [30]:
pred_and_labels = fitted_churn_model.evaluate(test_churn)

In [32]:
pred_and_labels.predictions.show();

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|    0|[5.02870929600791...|[0.99349533203597...|       0.0|
|[26.0,8939.61,0.0...|    0|[7.25298295203407...|[0.99929244186386...|       0.0|
|[29.0,8688.17,1.0...|    1|[2.91288904978004...|[0.94847992320011...|       0.0|
|[29.0,9378.24,0.0...|    0|[5.46100148652585...|[0.99576868213210...|       0.0|
|[29.0,9617.59,0.0...|    0|[5.15082440178562...|[0.99423875857991...|       0.0|
|[29.0,11274.46,1....|    0|[4.80548587919856...|[0.99188172259261...|       0.0|
|[29.0,13240.01,1....|    0|[7.31929950461997...|[0.99933781256747...|       0.0|
|[29.0,13255.05,1....|    0|[4.46119569704069...|[0.98858329982652...|       0.0|
|[30.0,6744.87,0.0...|    0|[4.06309908609127...|[0.98309504593286...|       0.0|
|[30.0,7960.64,1

In [33]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')

In [35]:
auc = churn_eval.evaluate(pred_and_labels.predictions);

In [36]:
auc

0.7820833333333332

## Predict on new data

In [37]:
new_lr_model = lr_churn.fit(df)

In [42]:
schema_df_new_customers = types.StructType([types.StructField('Names', types.StringType()),
                                            types.StructField('Age', types.DoubleType()),
                                            types.StructField('Total_Purchase', types.DoubleType()),
                                            types.StructField('Account_Manager', types.IntegerType()), 
                                            types.StructField('Years', types.DoubleType()),
                                            types.StructField('Num_Sites', types.DoubleType()),
                                            types.StructField('Onboard_date', types.TimestampType()),
                                            types.StructField('Location', types.StringType()),
                                            types.StructField('Company', types.StringType())])


df_new_customers = spark.read.csv('../Arquivos/new_customers.csv', schema_df_new_customers, header=True)
df_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [43]:
test_new_customers = assembler.transform(df_new_customers)

In [44]:
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [45]:
new_results = new_lr_model.transform(test_new_customers)

In [47]:
new_results.toPandas()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,features,rawPrediction,probability,prediction
0,Andrew Mccall,37.0,9935.53,1,7.71,8.0,2011-08-29 18:37:54,"38612 Johnny Stravenue Nataliebury, WI 15717-8316",King Ltd,"[37.0, 9935.53, 1.0, 7.71, 8.0]","[2.2216868057254757, -2.2216868057254757]","[0.9021801592176494, 0.09781984078235062]",0.0
1,Michele Wright,23.0,7526.94,1,9.28,15.0,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332, Youngport, ME...",Cannon-Benson,"[23.0, 7526.94, 1.0, 9.28, 15.0]","[-6.22075399918457, 6.22075399918457]","[0.001983802597841992, 0.998016197402158]",1.0
2,Jeremy Chang,65.0,100.0,1,1.0,15.0,2006-12-11 07:48:13,"085 Austin Views Lake Julialand, WY 63726-4298",Barron-Robertson,"[65.0, 100.0, 1.0, 1.0, 15.0]","[-3.7691606662874264, 3.7691606662874264]","[0.022551133124331425, 0.9774488668756686]",1.0
3,Megan Ferguson,32.0,6487.5,0,9.4,14.0,2016-10-28 05:32:13,"922 Wright Branch North Cynthialand, NC 64721",Sexton-Golden,"[32.0, 6487.5, 0.0, 9.4, 14.0]","[-5.095623136273915, 5.095623136273915]","[0.006086220767149351, 0.9939137792328506]",1.0
4,Taylor Young,32.0,13147.71,1,10.0,8.0,2012-03-20 00:36:46,Unit 0789 Box 0734 DPO AP 39702,Wood LLC,"[32.0, 13147.71, 1.0, 10.0, 8.0]","[1.104758063833092, -1.104758063833092]","[0.7511505614489966, 0.2488494385510034]",0.0
5,Jessica Drake,22.0,8445.26,1,3.46,14.0,2011-02-04 19:29:27,1148 Tina Stravenue Apt. 978 South Carlos TX 2...,Parks-Robbins,"[22.0, 8445.26, 1.0, 3.46, 14.0]","[-1.6896020251072734, 1.6896020251072734]","[0.1558281848636939, 0.8441718151363061]",1.0


In [49]:
new_results.select('Company', 'prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+



In [51]:
test_new_customers.toPandas().describe()

Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites
count,6.0,6.0,6.0,6.0,6.0
mean,35.166667,7607.156667,0.833333,6.808333,12.333333
std,15.715173,4346.008233,0.408248,3.708738,3.386247
min,22.0,100.0,0.0,1.0,8.0
25%,25.25,6747.36,1.0,4.5225,9.5
50%,32.0,7986.1,1.0,8.495,14.0
75%,35.75,9562.9625,1.0,9.37,14.75
max,65.0,13147.71,1.0,10.0,15.0
