In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("ChurnRateApp")
    .getOrCreate()
)

In [271]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, MinMaxScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import col, countDistinct
from pyspark.sql.types import *
from pyspark.ml.feature import PCA

# Explicit schema for the churn CSV, as (column name, Spark type) pairs in
# declaration order. Every field is declared nullable.
_churn_fields = [
    ("customerID", StringType()),
    ("gender", StringType()),
    ("SeniorCitizen", IntegerType()),
    ("Partner", StringType()),
    ("Dependents", StringType()),
    ("tenure", IntegerType()),
    ("PhoneService", StringType()),
    ("MultipleLines", StringType()),
    ("InternetService", StringType()),
    ("OnlineSecurity", StringType()),
    ("OnlineBackup", StringType()),
    ("DeviceProtection", StringType()),
    ("TechSupport", StringType()),
    ("StreamingTV", StringType()),
    ("StreamingMovies", StringType()),
    ("Contract", StringType()),
    ("PaperlessBilling", StringType()),
    ("PaymentMethod", StringType()),
    ("MonthlyCharges", FloatType()),
    ("TotalCharges", FloatType()),
    ("Churn", StringType()),
]

churnSchema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _churn_fields]
)


# Read the churn CSV with the explicit schema defined above.
churn_df = spark.read.csv(
    "../data_sets/churn_rate_data.csv",
    header=True,
    schema=churnSchema,
)

# Only a handful of rows contain nulls (11 in the recorded run). Considering
# the small number, those records are simply dropped. The row count is printed
# before and after the drop so the loss is visible.
print(churn_df.count())
churn_df = churn_df.dropna()
print(churn_df.count())

# Rename the target column to "label".
churn_df = churn_df.withColumnRenamed('Churn', 'label')

# Columns that are string-indexed and then one-hot encoded.
col_list = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]

# Columns fed to the feature vector as-is.
numerical_cols = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
]

# Every raw input column that ends up in the feature vector
# (encoded columns first, then the numeric ones).
featurized_col_list = [*col_list, *numerical_cols]



7043
7032


In [272]:

# String indexers: one per column in `col_list` (output "<col>_indexed"),
# followed by the indexer that maps "label" to "labelIndex".
label_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")

indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_indexed")
    for name in col_list
]
indexers += [label_indexer]

# One-hot encode the indexed columns: "<col>_indexed" -> "<col>_vector".
# dropLast is disabled, so the last category is not dropped from the encoding.
encoder = OneHotEncoder(
    inputCols=[name + "_indexed" for name in col_list],
    outputCols=[name + "_vector" for name in col_list],
    dropLast=False,
)

# Imports needed by the assembly and pipeline steps of this cell.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Assemble a single "features" vector: the numeric columns come first,
# followed by the one-hot vectors. Rows with invalid values are skipped.
vector_assembler = VectorAssembler(
    inputCols=numerical_cols + [f'{c}_vector' for c in col_list],
    outputCol="features",
    handleInvalid="skip",
)

# Pipeline order: every indexer, then the encoder, then the assembler.
pipeline = Pipeline(stages=[*indexers, encoder, vector_assembler])

# Fit the pipeline on the cleaned frame and apply it to that same frame.
# NOTE: the name `tranformed_df` (sic) is kept because code further down
# in the notebook refers to it.
pipeline_model = pipeline.fit(churn_df)
tranformed_df = pipeline_model.transform(churn_df)




# Drop the intermediate indexed/encoded columns, the raw input columns and
# the customer id, leaving only label, labelIndex and features.
cols_drop = (
    [f'{c}_indexed' for c in col_list]
    + [f'{c}_vector' for c in col_list]
    + list(featurized_col_list)
    + ['customerID']
)
transformed_df = tranformed_df.drop(*cols_drop)



Unnamed: 0,label,labelIndex,features
0,No,0.0,"(1.0, 29.850000381469727, 29.850000381469727, ..."
1,No,0.0,"(34.0, 56.95000076293945, 1889.5, 1.0, 0.0, 1...."
2,Yes,1.0,"(2.0, 53.849998474121094, 108.1500015258789, 1..."
3,No,0.0,"(45.0, 42.29999923706055, 1840.75, 1.0, 0.0, 1..."
4,Yes,1.0,"(2.0, 70.69999694824219, 151.64999389648438, 0..."
5,Yes,1.0,"(8.0, 99.6500015258789, 820.5, 0.0, 1.0, 1.0, ..."
6,No,0.0,"(22.0, 89.0999984741211, 1949.4000244140625, 1..."
7,No,0.0,"(10.0, 29.75, 301.8999938964844, 0.0, 1.0, 1.0..."
8,Yes,1.0,"(28.0, 104.80000305175781, 3046.050048828125, ..."
9,No,0.0,"(62.0, 56.150001525878906, 3487.949951171875, ..."


In [273]:
# Feature scaling: min-max rescale the "features" vector into "scaledFeatures".
#
# The scaler is fit on `transformed_df`, the same frame it is applied to.
# It was previously fit on the similarly named pre-drop frame `tranformed_df`.
# Both frames carry the same "features" column, so the fitted statistics are
# unchanged, but fitting and transforming one frame removes the reliance on
# the misspelt variable.
#
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split performed later in the notebook, so the scaling statistics include the
# test rows. Consider fitting on the training split only.
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(transformed_df)

scaled_df = scalerModel.transform(transformed_df)

In [274]:
# Dimensionality reduction (work in progress, still needs more work).
# Both PCA passes keep the same number of principal components.
n_components = 45

# Pass 1: PCA on the unscaled "features" vector.
pca = PCA(inputCol="features", outputCol="pcaFeatures", k=n_components)
model = pca.fit(scaled_df)
reduced_df = model.transform(scaled_df)

# Min-max scale the components produced by pass 1.
scaler = MinMaxScaler(outputCol="scaledPcaFeatures", inputCol="pcaFeatures")
scalerModel = scaler.fit(reduced_df)
scaled_pca_df = scalerModel.transform(reduced_df)

# Pass 2: PCA on the already-scaled "scaledFeatures" vector.
# `pca` and `model` are rebound here; the resulting frame carries the
# columns added by every step above.
pca = PCA(inputCol="scaledFeatures", outputCol="pcaScaledFeatures", k=n_components)
model = pca.fit(scaled_pca_df)
pca_scaled_df = model.transform(scaled_pca_df)


In [275]:

# Seeded 80/20 train/test split of the fully featurized frame.
seed = 11
training, testing = pca_scaled_df.randomSplit([0.8, 0.2], seed=seed)

In [276]:
from pyspark.ml.classification import DecisionTreeClassifier

# Evaluator import kept inside this cell, as in the original flow.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fit a decision tree on the unscaled "features" vector.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="labelIndex")
model = dt.fit(training)

# Score the held-out rows.
predictions = model.transform(testing)

# Report accuracy (and its complement) on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))
print("Accuracy = %g " % accuracy)



Test Error = 0.208453 
Accuracy = 0.791547 


In [277]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the unscaled "features" vector, with maxDepth set to 30.
rd = RandomForestClassifier(
    featuresCol="features",
    labelCol="labelIndex",
    maxDepth=30,
)
model = rd.fit(training)
predictions = model.transform(testing)

# Report accuracy (and its complement) on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))
print("Accuracy = %g " % accuracy)



Test Error = 0.199857 
Accuracy = 0.800143 


In [278]:
from pyspark.ml.classification import LogisticRegression


# Elastic-net regularised logistic regression on the min-max scaled features.
lr = LogisticRegression(
    featuresCol="scaledFeatures",
    labelCol="labelIndex",
    maxIter=30,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Train on the training split, then score the held-out rows.
lrModel = lr.fit(training)
predictions = lrModel.transform(testing)

# Report accuracy (and its complement) on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))
print("Accuracy = %g " % accuracy)



Test Error = 0.270057 
Accuracy = 0.729943 


In [279]:
from pyspark.ml.classification import GBTClassifier

# NOTE(review): this cell re-splits the data 70/30 (same seed), replacing the
# 80/20 split made earlier in the notebook. Its metrics are therefore computed
# on a different test set from the cells that ran before it.
training, testing = pca_scaled_df.randomSplit([0.7, 0.3], seed=seed)

# Gradient-boosted trees on the min-max scaled features.
gbt = GBTClassifier(
    featuresCol="scaledFeatures",
    labelCol="labelIndex",
    maxIter=30,
)
model = gbt.fit(training)

# Score the held-out rows.
predictions = model.transform(testing)

# Compare predictions with the true label index and report accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g " % accuracy)




Test Error = 0.198272
Accuracy = 0.801728 


In [280]:
from pyspark.ml.classification import LinearSVC

# NOTE(review): 70/30 split with the shared seed. This differs from the 80/20
# split made earlier in the notebook, so the test set is not the same one the
# 80/20 cells were evaluated on.
training, testing = pca_scaled_df.randomSplit([0.7, 0.3], seed=seed)

# Linear SVM on the min-max scaled features.
lsvc = LinearSVC(
    featuresCol="scaledFeatures",
    labelCol="labelIndex",
    maxIter=10,
    regParam=0.1,
)

# Train on the training split, then score the held-out rows.
lsvcModel = lsvc.fit(training)
predictions = lsvcModel.transform(testing)

# Report accuracy (and its complement) on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g " % accuracy)



Test Error = 0.235718
Accuracy = 0.764282 
