In [1]:
sc

In [2]:
spark

#### 1. Read the dataset

In [5]:
churn_data = spark.read.csv(
    'file:///home/hadoop/Downloads/Telco_Customer_Churn.csv', 
    header=True,
    inferSchema=True)

In [6]:
churn_data.head()

Row(customerID='7590-VHVEG', gender='Female', SeniorCitizen=0, Partner='Yes', Dependents='No', tenure=1, PhoneService='No', MultipleLines='No phone service', InternetService='DSL', OnlineSecurity='No', OnlineBackup='Yes', DeviceProtection='No', TechSupport='No', StreamingTV='No', StreamingMovies='No', Contract='Month-to-month', PaperlessBilling='Yes', PaymentMethod='Electronic check', MonthlyCharges=29.85, TotalCharges='29.85', Churn='No')

#### 2. Data Exploration

a) how many customers records are in dataset

In [8]:
churn_data.count()

7043

In [9]:
len(churn_data.columns)

21

In [10]:
churn_data.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

b) what is distribution of gender among customers?

In [12]:
churn_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



In [14]:
churn_data.groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 3488|
|  Male| 3555|
+------+-----+



c) what is the distribution of contract type among customers?

In [15]:
churn_data.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|     OnlineSecurity|       OnlineBackup|   DeviceProtection|        TechSupport|        StreamingTV|    StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|  

In [16]:
churn_data.groupBy(['Contract']).count().show()

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-month| 3875|
|      One year| 1473|
|      Two year| 1695|
+--------------+-----+



d) What is the percentage of customers who go churned

In [23]:
churn_data.select(['churn']).where("churn = 'Yes'").count() / churn_data.count() * 100

26.536987079369588

#### 3. Data Preprocessing
- check for missing values and handle missing data.

In [27]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [39]:
churn_data.select([count(when(isnull(col), col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [36]:
churn_data = churn_data.withColumn('TotalCharges', when(col('TotalCharges') == " ", None)\
                      .otherwise(col('TotalCharges')))

In [38]:
churn_data.select([count(when(isnull(col), col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [40]:
churn_data1 = churn_data.na.drop()

In [43]:
from pyspark.sql.types import FloatType

In [45]:
churn_data1 = churn_data.withColumn('TotalCharges', col('TotalCharges').cast(FloatType()))

f) Convert categorical varibles into numerical format using one hot encoding or labe encoding

In [52]:
!pip3 install numpy pandas matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable


In [53]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline

convert categorical data to numeric
- StringIndexer: 
    - label categorical columns
    - string to numerical
- oneHotEncoder:
    - for each row, only one value will be true, rest all are false
    - for each category a column will be created, and for row with the category, the value will be set to 1, others to 0

In [55]:
churn_data.dtypes

[('customerID', 'string'),
 ('gender', 'string'),
 ('SeniorCitizen', 'int'),
 ('Partner', 'string'),
 ('Dependents', 'string'),
 ('tenure', 'int'),
 ('PhoneService', 'string'),
 ('MultipleLines', 'string'),
 ('InternetService', 'string'),
 ('OnlineSecurity', 'string'),
 ('OnlineBackup', 'string'),
 ('DeviceProtection', 'string'),
 ('TechSupport', 'string'),
 ('StreamingTV', 'string'),
 ('StreamingMovies', 'string'),
 ('Contract', 'string'),
 ('PaperlessBilling', 'string'),
 ('PaymentMethod', 'string'),
 ('MonthlyCharges', 'double'),
 ('TotalCharges', 'string'),
 ('Churn', 'string')]

In [56]:
[item[0] for item in churn_data.dtypes if item[1] == 'string']

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'TotalCharges',
 'Churn']

In [66]:
categorical_cols = [i.name for i in churn_data.schema.fields if isinstance(i.dataType, StringType)]

In [67]:
print(categorical_cols)

['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']


In [69]:
stages = []

for catcols in categorical_cols[1:-1]:
    stringindexer = StringIndexer(
        inputCol=catcols, 
        outputCol=catcols + 'Index')
    onehotencoder = OneHotEncoderEstimator(
        inputCols=[stringindexer.getOutputCol()], 
        outputCols=[catcols + 'classVec'])
    stages += [stringindexer, onehotencoder]

[StringIndexer_7560f691a577, OneHotEncoderEstimator_20c4beb21dc5, StringIndexer_833cef9b884d, OneHotEncoderEstimator_10c63aabcfc5, StringIndexer_997904cd9fd5, OneHotEncoderEstimator_e642aeb6c6e4, StringIndexer_6dc25a44ab09, OneHotEncoderEstimator_1b4b62bdb3da, StringIndexer_f106e874baad, OneHotEncoderEstimator_c584c423db2e, StringIndexer_4cb9b7a33846, OneHotEncoderEstimator_e3300eb822cf, StringIndexer_8f162ada26b9, OneHotEncoderEstimator_cf85a3104527, StringIndexer_51b42f7182c2, OneHotEncoderEstimator_f927d2b3d206, StringIndexer_af8c1bca63b1, OneHotEncoderEstimator_b86a9b79b417, StringIndexer_9a6c33c1e419, OneHotEncoderEstimator_3b50a9250af3, StringIndexer_8c4b403ec919, OneHotEncoderEstimator_e889cc823bb7, StringIndexer_c61f205890dd, OneHotEncoderEstimator_a3d88f695424, StringIndexer_e1a7b5d15380, OneHotEncoderEstimator_bd6b7fcb78aa, StringIndexer_160c35da2c13, OneHotEncoderEstimator_6a20be54f459, StringIndexer_ddde21355978, OneHotEncoderEstimator_e6954556fe4a, StringIndexer_5dbf4a844f

#### 4. Import Mllib

In [None]:
from pyspark.ml