In [1]:
import pyspark
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count, isnull, sum, median, mean, first, max, min, mode
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

from pyspark.ml.feature import Bucketizer, QuantileDiscretizer
from pyspark.sql.window import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer,
    OneHotEncoder,
    VectorAssembler,
    StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Attach Google Drive to the Colab filesystem at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Create the Spark session (or reuse one that already exists); the app is named "test".
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

# Load data

In [4]:
# Read the customer dataset (with engineered features) from the mounted Drive.
# The path is anchored at the '/content/drive' mount point instead of being
# relative, so the read does not depend on the kernel's working directory.
# header=True takes column names from the first row; inferSchema=True lets
# Spark infer column types from the data.
df = spark.read.csv(
    '/content/drive/MyDrive/Colab Notebooks/cust_data_with_feature_engineering.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the loaded data. These are show()'s defaults, spelled out:
# the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+---+------+----+-------------------+---------+------------------------------+--------+--------+-------------------+-------+-----+----+--------+-------------+--------------+--------------+----------------+---------------------+-------------------+
|_c0|Gender| Age|Has_Mobile_Contract|Area_Code|Currently_Holds_Second_Product|  Tenure|App_User|Acquisition_Channel|Revenue|Label|bins|age_bins|Revenue_Group|Gender_Numeric|Tenure_Numeric|App_User_Numeric|Area_Code_Avg_Revenue|Channel_Avg_Revenue|
+---+------+----+-------------------+---------+------------------------------+--------+--------+-------------------+-------+-----+----+--------+-------------+--------------+--------------+----------------+---------------------+-------------------+
|  0|Female|48.0|                  1|        1|                             1|1-2 Year|      No|                 12|   2.63|    0| 3.0|   46-55|          Low|             1|             1|               1|   2.7112896825397494|  33.57289910600262|
|  1|  M

# Train-test split

In [None]:
# Randomly partition the rows with weights 0.8 (train) and 0.2 (test);
# a fixed seed is passed so the split can be repeated.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Input columns for the model, grouped by the preprocessing they receive.
# Categorical columns: string-indexed, then one-hot encoded.
categorical_features = [
    'Area_Code',
    'Tenure',
    'Acquisition_Channel',
]
# Numerical columns: assembled into one vector, then standardized.
numerical_features = [
    'Age',
    'Revenue',
]

# Categorical preprocessing: one StringIndexer and one OneHotEncoder per
# categorical column.
#   - StringIndexer writes "<column>_indexed"; handleInvalid="keep" is set so
#     values that were not seen when the indexer was fitted are kept rather
#     than raising an error.
#   - OneHotEncoder reads "<column>_indexed" and writes "<column>_encoded".
# The comprehension variable is called `feature` so it is not mistaken for
# the `col` function imported from pyspark.sql.functions.
string_indexers = [
    StringIndexer(
        inputCol=feature,
        outputCol=feature + "_indexed",
        handleInvalid="keep",
    )
    for feature in categorical_features
]

encoders = [
    OneHotEncoder(
        inputCol=feature + "_indexed",
        outputCol=feature + "_encoded",
    )
    for feature in categorical_features
]

# Numerical preprocessing: pack the numerical columns into a single vector
# column, then standardize that vector (withMean centers it, withStd scales
# it to unit standard deviation).
numerical_assembler = VectorAssembler(
    inputCols=numerical_features,
    outputCol="numerical_features",
)

numerical_scaler = StandardScaler(
    withMean=True,
    withStd=True,
    # Read exactly the column the assembler above writes.
    inputCol=numerical_assembler.getOutputCol(),
    outputCol="scaled_numerical_features",
)

# Names of the one-hot encoded columns, one per categorical feature.
encoded_categorical_features = [
    feature + "_encoded" for feature in categorical_features
]

# Final model input: all encoded categorical columns followed by the
# standardized numerical vector, merged into one "features" vector column.
feature_assembler = VectorAssembler(
    inputCols=[*encoded_categorical_features, "scaled_numerical_features"],
    outputCol="features",
)

# Random forest classifier trained on the assembled "features" vector.
# labelCol is spelled exactly like the dataset's target column, "Label"
# (capital L): Spark ML looks columns up in the schema by name, and a case
# mismatch such as "label" can fail with a "field does not exist" error.
# A fixed seed is passed so the forest's random sampling can be repeated
# from run to run, in line with the seeded train/test split.
rf = RandomForestClassifier(
    labelCol="Label",
    featuresCol="features",
    numTrees=100,
    seed=42,
)

# Chain every stage into one pipeline. Order matters: the indexers run
# before the encoders that consume their output, the numerical assembler
# before its scaler, and the final feature assembler before the classifier.
pipeline = Pipeline(
    stages=string_indexers
    + encoders
    + [numerical_assembler, numerical_scaler, feature_assembler, rf]
)

# Fit all pipeline stages on the training split.
model = pipeline.fit(dataset=train_df)

# Run the fitted pipeline over the test split to obtain the model's predictions.
predictions = model.transform(dataset=test_df)