In [1]:
from pyspark.ml.linalg import Vectors

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import col,isnan,when,count
#Libraries and class from StaticModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import when

# import pickle
import time

#Script Class
from PreProcess import *
from PostProcess import *

from ClassificationModel import *

In [2]:
spark = SparkSession.builder \
        .appName("Churn EDA PySpark") \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "4g") \
        .getOrCreate()

In [71]:
# Load the data stored in e-commerce folder format as a DataFrame.
df = spark.read.format("csv") \
    .option("header", True) \
    .option("inferSchema", True)\
    .load("/home/jovyan/code/churn/credit-card/BankChurners.csv")


In [4]:

df2 = df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns])
df2.show()

+---------+--------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
|CLIENTNUM|Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|Naive_Bayes_Classifier_Attrition_Fl

In [72]:
pre_process_instance = PreProcess(df)

In [74]:
columns_to_string_index = [("Gender","GenderIndex"),
                           ("Education_Level","Education_LevelIndex"),
                           ("Marital_Status","Marital_StatusIndex"),
                           ("Income_Category","Income_CategoryIndex"),
("Card_Category","Card_CategoryIndex")]
columns_to_encode =[("GenderIndex","GenderIndexEncoded"),
                           ("Education_LevelIndex","Education_LevelIndexEncoded"),
                           ("Marital_StatusIndex","Marital_StatusIndexEncoded"),
                           ("Income_CategoryIndex","Income_CategoryIndexEncoded"),
("Card_CategoryIndex","Card_CategoryIndexEncoded")]

feature_cols = ["GenderIndexEncoded",
                           "Education_LevelIndexEncoded",
                           "Marital_StatusIndexEncoded",
                      "Income_CategoryIndexEncoded",
"Card_CategoryIndexEncoded" ,
                'CLIENTNUM',
  'Customer_Age',
  'Dependent_count',
  'Months_on_book',
  'Total_Relationship_Count',
  'Months_Inactive_12_mon',
  'Contacts_Count_12_mon',
  'Credit_Limit',
  'Total_Revolving_Bal',
  'Avg_Open_To_Buy',
  'Total_Amt_Chng_Q4_Q1',
  'Total_Trans_Amt',
  'Total_Trans_Ct',
  'Total_Ct_Chng_Q4_Q1',
  'Avg_Utilization_Ratio']

label_col = "Attrition_Flag"

feature_col = "features"

weight_col = "class_weights"

In [75]:
def process_data_to_model(df, columns_to_string_index: list[tuple[str]],columns_to_encode: list[tuple[str]] ,feature_cols : list, label_col:str):
    """
      Preprocess data to use it in RandomForestCassifier

       param df: Spark DataFrame to process
       param columns_to_string_index: list of tuples whit the names of cols to be indexed and the expected names [("ColName","ColNameIndex")]
       param columns_to_encode: list of tuples whit the names of cols to be encoded and the expected names [("ColName","ColNameEncoded")]
       param feature_cols: list with the names of cols that will be used as features, 
       param label_col: name of the DataFrame column that holds the label values
       return: Spark DataFrame with additional indexed, encoded and vectorized cols, and class_weights column that holds the class weights associated with each row
    """
    df = df.drop("Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1")
    df = df.drop("Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2")
    df= pre_process_instance.string_index_columns(df, [(label_col,"Churn_Index")])
    df = df.drop(label_col)
    df= pre_process_instance.string_index_columns(df, columns_to_string_index)
    df = pre_process_instance.encoded_index_columns(df, columns_to_encode)
    df = pre_process_instance.compute_class_weights_column( df, "Churn_Index")
    df = pre_process_instance.vector_feature_column(df, feature_cols) 

    return df



In [76]:
df = process_data_to_model(df, columns_to_string_index,columns_to_encode ,feature_cols, label_col)

In [77]:
total = df.filter(df.Churn_Index == 0).count()+df.filter(df.Churn_Index  == 1).count()

In [21]:
print("Not churn",df.filter(df.Churn_Index== 0).count(), "Churn",df.filter(df.Churn_Index == 1).count())

Not churn 8500 Churn 1627


In [None]:
df_transformed.select(F.countDistinct("CustomerId")).show()
categorical_cols = ['PreferredLoginDevice',
  'PreferredPaymentMode',
  'Gender',
  'PreferedOrderCat',
  'MaritalStatus']
for c in categorical_cols:
    df.select(F.countDistinct(c)).show()

In [78]:
classification_model_instance = ClassificationModel(df,feature_cols,"churn_Index")

In [79]:
(trainingData, testData) = classification_model_instance.split_data()

In [80]:
bestModel = classification_model_instance.train_classification_model("Churn_Index","features",weight_col)

In [81]:
ruta_modelo = "/home/jovyan/code/churn/modelo"
bestModel.write().overwrite().save(ruta_modelo)
#randomforest.save("/home/jovyan/code/churn/e-commerce/modelo")

#rf2.getNumTrees()

In [27]:
#from pyspark.ml.classification import RandomForestClassificationModel

In [28]:
#rf2 = RandomForestClassificationModel.load(ruta_modelo)

In [82]:
# Step 6: Make predictions
predictions = bestModel.transform(testData)

# Show predictions
predictions.select("Churn_Index", "prediction").show()


+-----------+----------+
|Churn_Index|prediction|
+-----------+----------+
|        0.0|       0.0|
|        0.0|       0.0|
|        1.0|       1.0|
|        1.0|       1.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        1.0|       1.0|
|        0.0|       1.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
|        0.0|       0.0|
+-----------+----------+
only showing top 20 rows



In [83]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="Churn_Index", predictionCol="prediction", metricName="weightedRecall"
)

accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy}")

# Show detailed evaluation metrics
evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(predictions)
print(f"Weighted Precision = {precision}")

evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(predictions)
print(f"Weighted Recall = {recall}")

evaluator.setMetricName("f1")
f1 = evaluator.evaluate(predictions)
print(f"F1 Score = {f1}")

Test Accuracy = 0.9714144898965007
Weighted Precision = 0.9731996809147653
Weighted Recall = 0.9714144898965007
F1 Score = 0.9719462416150377


In [84]:
df.show()

+---------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+-----------+-----------+--------------------+-------------------+--------------------+------------------+------------------+---------------------------+--------------------------+---------------------------+-------------------------+-------------+--------------------+
|CLIENTNUM|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|Churn_Index|GenderIndex|Education_LevelIndex|Marital_Status

In [85]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [63]:
predictions_and_labels = predictions.select(col("prediction").cast("float"), col("Churn_Index").cast("float"))

metrics = MulticlassMetrics(predictions_and_labels.rdd.map(tuple))

# Print confusion matrix
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())



Confusion Matrix:
[[1592.   36.]
 [  13.  278.]]


In [64]:
rmse = evaluator.evaluate(predictions)
print("Test RMSE = %g" % rmse)

Test RMSE = 0.97486


In [113]:
feature_importances = bestModel.featureImportances.toArray().tolist()

In [114]:
feature_names = df.columns

In [115]:
# Create a list of (name, importance) tuples
feature_data = list(zip(feature_names, feature_importances))

# Define the schema
schema = StructType([
    StructField("feature", StringType(), True),
    StructField("importance", FloatType(), True)
])

# Create a DataFrame from the list of tuples with the specified schema
feature_importances_df = spark.createDataFrame(feature_data, schema)

In [119]:
from pyspark.sql.functions import desc, asc

In [156]:
filter_condition = (feature_importances_df.feature == 'GenderIndexEncoded') | \
(feature_importances_df.feature == 'Education_LevelIndexEncoded') | \
(feature_importances_df.feature == 'Marital_StatusIndexEncoded') | \
(feature_importances_df.feature == 'Income_CategoryIndexEncoded') | \
(feature_importances_df.feature == 'Card_CategoryIndexEncoded') | \
(feature_importances_df.feature == 'CLIENTNUM') | \
(feature_importances_df.feature == 'Customer_Age') | \
(feature_importances_df.feature == 'Dependent_count') | \
(feature_importances_df.feature == 'Months_on_book') | \
(feature_importances_df.feature == 'Total_Relationship_Count') | \
(feature_importances_df.feature == 'Months_Inactive_12_mon') | \
(feature_importances_df.feature == 'Contacts_Count_12_mon') | \
(feature_importances_df.feature == 'Credit_Limit') | \
(feature_importances_df.feature == 'Total_Revolving_Bal') | \
(feature_importances_df.feature == 'Avg_Open_To_Buy') | \
(feature_importances_df.feature == 'Total_Amt_Chng_Q4_Q1') | \
(feature_importances_df.feature == 'Total_Trans_Amt') | \
(feature_importances_df.feature == 'Total_Trans_Ct') | \
(feature_importances_df.feature == 'Total_Ct_Chng_Q4_Q1') | \
(feature_importances_df.feature == 'Avg_Utilization_Ratio')

In [157]:
feature_importances_df.filter(filter_condition).orderBy(desc("importance")).show()

+--------------------+------------+
|             feature|  importance|
+--------------------+------------+
|Card_CategoryInde...|  0.23377897|
|Income_CategoryIn...|  0.18829289|
|  GenderIndexEncoded|  0.09279693|
|Marital_StatusInd...|  0.05734704|
|Education_LevelIn...| 0.025543414|
|Avg_Utilization_R...|  0.02387299|
| Total_Ct_Chng_Q4_Q1|  0.01757825|
|           CLIENTNUM|  0.00807443|
|      Months_on_book| 0.002485119|
|Months_Inactive_1...|0.0019926066|
|        Credit_Limit|0.0019610028|
| Total_Revolving_Bal|0.0019062097|
|        Customer_Age|0.0016227629|
|Contacts_Count_12...|0.0015276278|
|Total_Amt_Chng_Q4_Q1|0.0013650599|
|      Total_Trans_Ct|0.0011894779|
|     Dependent_count| 0.001136427|
|Total_Relationshi...|0.0010880386|
|     Total_Trans_Amt|0.0010273004|
|     Avg_Open_To_Buy|0.0010094133|
+--------------------+------------+



In [166]:
feature_importances_df.count()

33

In [38]:
sum(feature_importances)

1.0

In [163]:
from pyspark.sql.functions import sum
df.select(sum(df.fee)).show()

In [165]:
feature_importances_df.select(sum(feature_importances_df.filter(filter_condition).orderBy(desc("importance")).importance)).show()

+------------------+
|   sum(importance)|
+------------------+
|1.0000000015133992|
+------------------+

