In [1]:
sc

In [2]:
spark

#### 1. Read the Dataset

In [4]:
churn_data = spark.read.csv("file:///home/hadoop/Downloads/Telco_Customer_Churn.csv",
                           header = True, inferSchema=True)
churn_data.head()

Row(customerID='7590-VHVEG', gender='Female', SeniorCitizen=0, Partner='Yes', Dependents='No', tenure=1, PhoneService='No', MultipleLines='No phone service', InternetService='DSL', OnlineSecurity='No', OnlineBackup='Yes', DeviceProtection='No', TechSupport='No', StreamingTV='No', StreamingMovies='No', Contract='Month-to-month', PaperlessBilling='Yes', PaymentMethod='Electronic check', MonthlyCharges=29.85, TotalCharges='29.85', Churn='No')

In [11]:
churn_data.show(5)

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|          No|No phone service|            DSL|            No|         Yes|              No|         No|    

#### 2. Data Exploration
    a. How many customer records are in the dataset ?

In [6]:
churn_data.count()

7043

In [7]:
len(churn_data.columns)

21

In [8]:
churn_data.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

    b. What is the distribution of gender among customers?

In [9]:
churn_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



In [10]:
churn_data.groupBy(['gender']).count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 3488|
|  Male| 3555|
+------+-----+



    c. What is the distribution of contract types among customers?

In [12]:
churn_data.groupBy(['Contract']).count().show()

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-month| 3875|
|      One year| 1473|
|      Two year| 1695|
+--------------+-----+



    d. What is percentage of Customers who churned ?

In [14]:
churn_data.select(['Churn']).where("Churn = 'Yes'").count()

1869

In [15]:
churn_data.select(['Churn']).where("Churn = 'Yes'").count()/churn_data.count() * 100

26.536987079369588

#### 3. Data Preprocessing
    * Check for Missing values and Handle missing data.

In [16]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [19]:
churn_data.select([count(when(isnull(col), col)).alias(col) for col in churn_data.columns ]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [22]:
churn_data = churn_data.withColumn('TotalCharges', when(col('TotalCharges') == " ", None)\
                      .otherwise(col("TotalCharges")))

In [23]:
churn_data.select([count(when(isnull(col), col)).alias(col) for col in churn_data.columns ]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [25]:
churn_data1 = churn_data.na.drop()

In [30]:
from pyspark.sql.types import FloatType

churn_data1 = churn_data1.withColumn('TotalCharges', col('TotalCharges').cast(FloatType()))

#### 4. Import Mllib
    
    f. Convert categorical variables into numerical format using one-hot encoding or label encoding.

In [58]:
churn_data1 = churn_data1.drop(col('customerID'))

In [33]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline

In [34]:
print(churn_data1.columns)

['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [59]:
categorical_cols = [field.name for field in churn_data1.schema.fields \
            if isinstance(field.dataType,StringType)]

In [75]:
print(categorical_cols[:-1])

['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [76]:
stages = []

for catcols in categorical_cols[:-1]:
    stringindexer = StringIndexer(inputCol= catcols, outputCol=catcols + "Index")
    onehotencoder = OneHotEncoderEstimator(inputCols=[stringindexer.getOutputCol()],
                                          outputCols=[catcols + "classVec"])
    stages += [stringindexer , onehotencoder]

In [77]:
[field.name for field in churn_data1.schema.fields \
            if not isinstance(field.dataType,StringType)]

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [78]:
numericalCols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [79]:
assemblerInputs = [c + "classVec" for c in categorical_cols[:-1]] + numericalCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [80]:
label_stringIdx = StringIndexer(inputCol='Churn', outputCol='label')

In [81]:
stages += [label_stringIdx]

In [82]:
pipeline = Pipeline(stages=stages)

In [83]:
preprocessing = pipeline.fit(churn_data1)

In [84]:
churn_df = preprocessing.transform(churn_data1)

In [87]:
churn_df.select(['Contract','ContractIndex','ContractclassVec']).show()

+--------------+-------------+----------------+
|      Contract|ContractIndex|ContractclassVec|
+--------------+-------------+----------------+
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      Two year|          1.0|   (2,[1],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      Two year|          1.0|   (2,[1],[1.0])|
|      One year|          2.0|       (2,[],[])|
|      Two year|          1.0|   (2,[1],

In [89]:
churn_df1 = churn_df.select(["features","label"])
churn_df1.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                        |label|
+------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(30,[2,7,8,11,12,14,16,18,20,22,23,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.85,29.850000381469727])                        |0.0  |
|(30,[0,1,2,3,4,7,9,10,13,14,16,18,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,34.0,56.95,1889.5])                         |0.0  |
|(30,[0,1,2,3,4,7,9,11,12,14,16,18,20,22,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,53.85,108.1500015258789]) |1.0  |
|(30,[0,1,2,7,9,10,13,15,16,18,25,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1

#### Split Dataset into Train and Test

In [133]:
train, test = churn_df1.randomSplit([0.8, 0.2], seed = 42)

In [134]:
train.select(['features','label'])

DataFrame[features: vector, label: double]

#### Build Decision Tree ML Model

In [135]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
tree = DecisionTreeClassifier(featuresCol='features', labelCol='label')
decision_model = tree.fit(train)

#### Evaluate the Model

In [136]:
predictions = decision_model.transform(test)

In [137]:
predictions.select(['label','probability','prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.11170212765957446,0.8882978723404256]|1.0       |
|1.0  |[0.11170212765957446,0.8882978723404256]|1.0       |
|1.0  |[0.11170212765957446,0.8882978723404256]|1.0       |
|0.0  |[0.11170212765957446,0.8882978723404256]|1.0       |
|0.0  |[0.35760517799352753,0.6423948220064725]|1.0       |
|0.0  |[0.35760517799352753,0.6423948220064725]|1.0       |
|0.0  |[0.35760517799352753,0.6423948220064725]|1.0       |
|0.0  |[0.35760517799352753,0.6423948220064725]|1.0       |
|1.0  |[0.6820388349514563,0.3179611650485437] |0.0       |
|1.0  |[0.35760517799352753,0.6423948220064725]|1.0       |
|1.0  |[0.11170212765957446,0.8882978723404256]|1.0       |
|1.0  |[0.11170212765957446,0.8882978723404256]|1.0       |
|1.0  |[0.11170212765957446,0.8882978723404256]|1.0       |
|0.0  |[0.11170212765957446,0.8882978723

In [138]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                             metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy

0.8041085840058694

#### Build Random Forest ML Model

In [139]:
from pyspark.ml.classification import RandomForestClassifier
randomForest = RandomForestClassifier(featuresCol='features', labelCol='label')
rf_model = randomForest.fit(train)

In [140]:
predictions = rf_model.transform(test)

In [141]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                             metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy

0.7945707997065297

#### Logistic Regression ML Model

In [142]:
from pyspark.ml.classification import LogisticRegression
logistic = LogisticRegression(featuresCol='features', labelCol='label')
logit_model = logistic.fit(train)

In [143]:
predictions = logit_model.transform(test)

In [144]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                             metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy

0.8129126925898753