In [8]:
from pyspark.sql import SparkSession
import repackage
from configuration import make_engine
import pandas as pd
import numpy as np

In [10]:
engine = make_engine()
engine.execute('show tables;').fetchall() # db table 확인

[('appl_stock',),
 ('college',),
 ('containsnull',),
 ('cruise_ship_info',),
 ('customer_churn',),
 ('dog_food',),
 ('ecommerce_customers',),
 ('fake_customers',),
 ('hack_data',),
 ('hello',),
 ('meal_info',),
 ('movielens_ratings',),
 ('new_customers',),
 ('sales_info',),
 ('seeds_dataset',),
 ('test',),
 ('titanic',),
 ('walmart_stock',)]

### data load

In [11]:
customer_churn = pd.read_sql("SELECT * FROM customer_churn", con=engine)

### create spark session

In [12]:
spark = SparkSession.builder.appName("customer").getOrCreate()
data = spark.createDataFrame(customer_churn)

### check schema

In [15]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: long (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: long (nullable = true)



### check out the data

In [16]:
data.describe().show()

+-------+-------------+-----------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|    Total_Purchase|   Account_Manager|             Years|         Num_Sites|       Onboard_date|            Location|             Company|              Churn|
+-------+-------------+-----------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|               900|               900|               900|               900|                900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.824033333332|0.4811111111111111| 5.273155555555555| 8.587777777777777|               null|                null|                null|0.16666666666666666|


### fit the model

In [26]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager',
                                       'Years', 'Num_Sites'],
                            outputCol='features')

output = assembler.transform(data)
final_data = output.select('features', 'churn')

In [28]:
train_churn, test_churn = final_data.randomSplit([0.7, 0.3])

In [29]:
from pyspark.ml.classification import LogisticRegression

In [30]:
lr_churn = LogisticRegression(featuresCol='features',
                              labelCol='churn')

In [32]:
fitted_churn_model = lr_churn.fit(train_churn)
training_summary = fitted_churn_model.summary

In [34]:
training_summary.predictions.describe().show()

+-------+------------------+-------------------+
|summary|             churn|         prediction|
+-------+------------------+-------------------+
|  count|               634|                634|
|   mean|0.1640378548895899|0.12145110410094637|
| stddev|0.3706023087280618| 0.3269086982224587|
|    min|               0.0|                0.0|
|    max|               1.0|                1.0|
+-------+------------------+-------------------+



### evaluate results

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [36]:
pred_and_labels = fitted_churn_model.evaluate(test_churn)
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,1.0...|    1|[0.61126110062752...|[0.64822842248375...|       0.0|
|[29.0,8688.17,1.0...|    1|[2.76969480478950...|[0.94101604900235...|       0.0|
|[29.0,9378.24,0.0...|    0|[4.85936241023517...|[0.99230425668022...|       0.0|
|[29.0,9617.59,0.0...|    0|[4.50378220306607...|[0.98905407973509...|       0.0|
|[30.0,10744.14,1....|    1|[1.67753081298889...|[0.84257729315117...|       0.0|
|[30.0,12788.37,0....|    0|[2.42694319840579...|[0.91885891780370...|       0.0|
|[35.0,15571.26,0....|    0|[1.88618242962958...|[0.86831964054125...|       0.0|
|[36.0,10448.09,0....|    0|[4.85825337600409...|[0.99229578289374...|       0.0|
|[37.0,8284.89,1.0...|    1|[1.15336405718925...|[0.76012483919451...|       0.0|
|[37.0,10314.67,

### using AUC

In [37]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                           labelCol='churn')

In [39]:
auc = churn_eval.evaluate(pred_and_labels.predictions)
auc

0.7444664031620554

### predict on brand new unlabeled data

In [41]:
final_lr_model = lr_churn.fit(final_data)

In [42]:
new_customers = pd.read_sql("SELECT * FROM new_customers", con=engine)

In [43]:
new_customers = spark.createDataFrame(new_customers)

In [45]:
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: long (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: long (nullable = true)



In [46]:
test_new_customers = assembler.transform(new_customers)

In [47]:
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: long (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: long (nullable = true)
 |-- features: vector (nullable = true)



In [48]:
final_results = final_lr_model.transform(test_new_customers)

In [49]:
final_results.select('company', 'prediction').show()

+--------------------+----------+
|             company|prediction|
+--------------------+----------+
|       Castro-Hodges|       0.0|
|     Miller and Sons|       1.0|
|     Green-Gutierrez|       0.0|
|         Duncan-Bell|       0.0|
|        Mcdonald LLC|       1.0|
|          Stout-Reed|       0.0|
|Dominguez, Richar...|       0.0|
|          Reed-Davis|       0.0|
|        Briggs-Brown|       1.0|
|    Mullins and Sons|       0.0|
|Cox, Marshall and...|       0.0|
|Caldwell, Lane an...|       0.0|
|White, Jacobs and...|       0.0|
|Thompson, Thomas ...|       0.0|
|    Johnson-Copeland|       0.0|
|Robertson, Serran...|       0.0|
|      Lawson-Flowers|       0.0|
|          Travis Inc|       0.0|
|Mcgrath, Hebert a...|       0.0|
|        Cole-Hoffman|       1.0|
+--------------------+----------+
only showing top 20 rows

