In [1]:
!pip install pyspark



In [10]:
import warnings
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, round
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier,GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Filter out warnings
warnings.filterwarnings("ignore")

# Create a Spark session
spark = SparkSession.builder \
    .appName("ubo") \
    .config("spark.logConf", "false")\
    .getOrCreate()

# Set the log level to ERROR or FATAL. This is to suppress the warnings.
spark.sparkContext.setLogLevel("ERROR")

# Load the dataset
data = spark.read.csv("telecom_dataset.csv", header= True, inferSchema=True)

#Take care of the duplicates in case they are there
data.drop_duplicates()

# Select relevant columns for churn prediction
selected_columns = ["Gender", "Age", "Contract", "MonthlyCharges", "TotalCharges","Churn"]
data = data.select(selected_columns)

# Convert categorical columns to numeric using StringIndexer
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in ["Gender", "Contract","Churn"]
]
indexers.append(StringIndexer(inputCol="Churn", outputCol="label"))  # Convert churn to label column
indexer_pipeline = Pipeline(stages=indexers)
data = indexer_pipeline.fit(data).transform(data)

#Feature Engineering
# Feature Engineering.Add a new column which calculates average charges by dividing total charges by 12
data = data.withColumn("AverageCharges", col("TotalCharges")/12)
data = data.withColumn("AverageCharges", round(col("AverageCharges"), 2))

# Select the feature and label columns
feature_columns = ["Age", "Gender_index", "Age", "Contract_index","AverageCharges"]
label_column = "label"

# Convert the features columns to a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

# Split the data into training and test sets
train_data, test_data = data.randomSplit([0.7, 0.3])

# Create a Random Forest classifier
rf = RandomForestClassifier(labelCol=label_column, featuresCol="features")

# Define the parameter grid
param_grid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [3,4,5]) \
    .addGrid(rf.numTrees, [i for i in range(8,11)]) \
    .build()

# Create the evaluator
evaluator = BinaryClassificationEvaluator(labelCol=label_column, rawPredictionCol="rawPrediction")

# Create the cross-validator
cv = CrossValidator(estimator=rf, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

# Run cross-validation to find the best model
cv_model = cv.fit(train_data)

# Get the best model
best_model = cv_model.bestModel

# Make predictions on the test data using the best model
predictions = best_model.transform(test_data)

#Create Gradient Boost Model
gbt = GBTClassifier(labelCol=label_column, featuresCol="features")

#Train the gradient boost model
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)

# Evaluate the models

accuracy = evaluator.evaluate(predictions) 
print("Accuracy: for Random Forests", accuracy)

print("Best Max Depth:", best_model.getMaxDepth())
print("Best Num Trees:", best_model.getNumTrees)

area_under_curve = evaluator.evaluate(gbt_predictions)

accuracy = evaluator.evaluate(gbt_predictions) 
print("Accuracy for Gradient Boosting: ", accuracy)



# Stop the Spark session
spark.stop()


Accuracy: for Random Forests 0.8666666666666667
Best Max Depth: 3
Best Num Trees: 9
Accuracy for Gradient Boosting:  0.8


Observations: Two models were trained and evaluated i.e Random Forests and Gradient Boosting. The best model was Random forests and after Hyper parameter tuning, the best hyper parameters were 3 for maximum depth and 9 for number of trees. Random forest achieved Accuracy of 0.86 while the Gradient Boosting model achieved an accuracy of 0.8