### Importing Libraries

In [1]:
import warnings

# Import basic pandas and visualization libraries
import pandas as pd
import seaborn as sns

# Import classifier classes from the PySpark ML library
from pyspark.ml.classification import GBTClassifier, LogisticRegression

# Import evaluation classes from the PySpark ML library
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Import feature-engineering and hyperparameter-tuning classes
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, StandardScaler, Bucketizer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Import SparkSession, used below to create the Spark session
from pyspark.sql import SparkSession

In [2]:
# Hiding/Ignoring the warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Create (or reuse an already running) Spark session registered under the app name 'Churner'
spark = (
    SparkSession.builder
    .appName('Churner')
    .getOrCreate()
)

In [4]:
# Load the churn CSV: the first row supplies the column names and
# column types are inferred from the data rather than read as strings.
df_sprk = spark.read.csv(
    'churn2.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first 10 rows of the loaded DataFrame
df_sprk.show(n=10)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|

In [13]:
# Report the dataset dimensions as (row count, column count)
n_rows = df_sprk.count()
n_cols = len(df_sprk.columns)
print("Shape: ", (n_rows, n_cols))

Shape:  (10000, 14)


In [14]:
# Print each column's name, inferred type and nullability as a tree
df_sprk.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [20]:
# Descriptive statistics (count, mean, stddev, min, max) for the selected columns
stats_columns = ['Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'Exited']
df_sprk.describe(stats_columns).toPandas()

Unnamed: 0,summary,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Exited
0,count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
1,mean,38.9218,5.0128,76485.88928799961,1.5302,100090.2398809998,0.2037
2,stddev,10.487806451704587,2.892174377049684,62397.40520238599,0.5816543579989917,57510.49281769821,0.4027685839948606
3,min,18.0,0.0,0.0,1.0,11.58,0.0
4,max,92.0,10.0,250898.09,4.0,199992.48,1.0


In [9]:
# Number of rows for each Geography value
geography_counts = df_sprk.groupby('Geography').count()
geography_counts.toPandas()

Unnamed: 0,Geography,count
0,Germany,2509
1,France,5014
2,Spain,2477


In [10]:
# Number of rows for each value of the exited (churn) flag
exited_counts = df_sprk.groupby('exited').count()
exited_counts.toPandas()

Unnamed: 0,exited,count
0,1,2037
1,0,7963


In [12]:
## Mean tenure, split by the exited (churn) flag
tenure_by_exited = df_sprk.groupby('exited').agg({'tenure': 'mean'})
tenure_by_exited.toPandas()

Unnamed: 0,exited,avg(tenure)
0,1,4.932744
1,0,5.033279


In [13]:
## Mean account balance, split by the exited (churn) flag
balance_by_exited = df_sprk.groupby('exited').agg({'balance': 'mean'})
balance_by_exited.toPandas()

Unnamed: 0,exited,avg(balance)
0,1,91108.539337
1,0,72745.296779


In [14]:
## Mean credit score, split by the exited (churn) flag
creditscore_by_exited = df_sprk.groupby('exited').agg({'creditscore': 'mean'})
creditscore_by_exited.toPandas()

Unnamed: 0,exited,avg(creditscore)
0,1,645.351497
1,0,651.853196


In [16]:
## Mean estimated salary, split by the exited (churn) flag
salary_by_exited = df_sprk.groupby('exited').agg({'estimatedsalary': 'mean'})
salary_by_exited.toPandas()

Unnamed: 0,exited,avg(estimatedsalary)
0,1,101465.677531
1,0,99738.391772
