#**INDEX**:
    
*1. Labelling the data.*

*2. Refining and Creating features with the available dataframe.*

*3. Creating a Machine Learning pipeline to preprocess the data; Training and Tuning the model.*

*4. Model selection and performing a performance projection using the test set: what is the expected uplift in conversion if we show different home versions to each customer instead of selecting the best one base on the average?*

In [0]:
#Setting the Default Spark Loggin Level:

spark.sparkContext.setLogLevel("WARN")

In [0]:
#Importing Functions:

from pyspark.sql import functions as f
from pyspark.sql.types import StringType, ArrayType, LongType, DateType, BooleanType, StructType, StructField
from pyspark.sql.window import Window
from pyspark.sql import types as tp
from pyspark.sql.functions import col, udf, explode, collect_list, element_at, to_timestamp, mean, array_contains, when, date_format
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, VectorAssembler, OneHotEncoder, StringIndexer, RobustScaler
from pyspark.sql.functions import countDistinct, avg, stddev

In [0]:
#Downloading the DataSet needed:

In [0]:
%sh

wget https://www.dropbox.com/s/y7o6ay0ozwnsfd0/bdm_data.zip --quiet

unzip -d ./bdm_data/ bdm_data.zip

Archive:  bdm_data.zip
   creating: ./bdm_data/bdm_data/cust_df/
  inflating: ./bdm_data/bdm_data/cust_df/_committed_3520508812338357534  
  inflating: ./bdm_data/bdm_data/cust_df/_committed_5076822134895256271  
  inflating: ./bdm_data/bdm_data/cust_df/_committed_7878371389005906564  
 extracting: ./bdm_data/bdm_data/cust_df/_started_5076822134895256271  
 extracting: ./bdm_data/bdm_data/cust_df/_SUCCESS  
  inflating: ./bdm_data/bdm_data/cust_df/part-00000-tid-5076822134895256271-fa5ebda2-8174-4bce-b5ef-fec6a0376257-23668-1-c000.csv  
   creating: ./bdm_data/bdm_data/orders_df/
  inflating: ./bdm_data/bdm_data/orders_df/_committed_1102646042990821830  
  inflating: ./bdm_data/bdm_data/orders_df/_committed_1749678841354862380  
  inflating: ./bdm_data/bdm_data/orders_df/_committed_4173638812266093034  
  inflating: ./bdm_data/bdm_data/orders_df/_committed_4196442019492113282  
  inflating: ./bdm_data/bdm_data/orders_df/_committed_568900438245366187  
 extracting: ./bdm_data/bdm_data/o

In [0]:
#Moving the files from the local fs to the dbfs:

dbutils.fs.mv("file:/databricks/driver/bdm_data/", "dbfs:/FileStore/bdm_data/", True)

Out[4]: True

In [0]:
display(dbutils.fs.ls("/FileStore/bdm_data/"))

path,name,size,modificationTime
dbfs:/FileStore/bdm_data/bdm_data/,bdm_data/,0,0
dbfs:/FileStore/bdm_data/cust_df/,cust_df/,0,0
dbfs:/FileStore/bdm_data/orders_df/,orders_df/,0,0
dbfs:/FileStore/bdm_data/sessions_df/,sessions_df/,0,0


In [0]:
#Reading the Datasets:

orders_df = \
    spark.read.format("parquet") \
    .option("inferSchema", "true") \
    .load("dbfs:/FileStore/bdm_data/orders_df")

cust_df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("dbfs:/FileStore/bdm_data/cust_df/")
)

# For faster reading time, lets define the schema of Sessions_df

sessions_df_schema = StructType([
    StructField("customer_id",StringType(),True),
    StructField(
        "session_events",
        ArrayType(
            StructType([
                StructField("datetime", StringType(),True),
                StructField("event", StringType(),True)
            ]), True)
    ),
    StructField("session_id", StringType() ,True),
    StructField("session_rank", LongType(), True)
])

sessions_df = \
    spark.read.format("json") \
    .schema(sessions_df_schema) \
    .load("dbfs:/FileStore/bdm_data/sessions_df")

In [0]:
sessions_df.display(5)

customer_id,session_events,session_id,session_rank
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-04-17 13:37:15.957171, OpenApp), List(2022-04-17 13:37:27.126802, ViewHome), List(2022-04-17 13:38:47.358932, ViewSearch), List(2022-04-17 13:38:40.752893, CloseApp))",c6be7a50-cb0c-4c0c-81ca-3ed39c093718,0
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-03-31 18:32:15.958422, OpenApp), List(2022-03-31 18:32:24.135571, ViewHomeVariant), List(2022-03-31 18:33:44.611742, ViewList3), List(2022-03-31 18:33:50.718991, CloseApp))",5c3a3b8b-4f4b-4cbd-a7e5-5a23e54fb31b,1
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-03-21 14:42:15.959261, OpenApp), List(2022-03-21 14:42:19.472056, ViewHomeVariant), List(2022-03-21 14:44:34.538973, ViewCartCheckout), List(2022-03-21 14:43:54.681148, CloseApp))",a7547d25-f265-4457-b915-234c8732f642,2
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2021-10-30 23:59:15.960155, OpenApp), List(2021-10-30 23:59:22.620350, ViewHome), List(2021-10-31 00:01:54.519225, ViewSearch), List(2021-10-31 00:01:32.534300, CloseApp))",5ce162e4-2243-4f08-88a7-841fd89e860a,3
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2021-12-06 22:26:15.961028, OpenApp), List(2021-12-06 22:26:21.990505, ViewHome), List(2021-12-06 22:27:36.865840, ViewList2), List(2021-12-06 22:27:57.372898, CloseApp))",9af3c534-ed1b-4462-ba58-a478bbf9647f,4
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-01-04 20:19:15.961890, OpenApp), List(2022-01-04 20:19:23.117717, ViewHomeVariant), List(2022-01-04 20:21:10.343633, ViewList1), List(2022-01-04 20:20:15.479881, CloseApp))",e61e17e7-4a1e-4fa1-a0fa-8e2fb78fd552,5
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-02-23 23:40:15.962851, OpenApp), List(2022-02-23 23:40:22.790663, ViewHomeVariant), List(2022-02-23 23:42:48.297926, ViewList3), List(2022-02-23 23:42:04.507958, CloseApp))",f4e021ba-dea6-44e6-b88b-38c61204ecc3,6
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-02-23 22:41:15.963841, OpenApp), List(2022-02-23 22:41:24.333183, ViewHome), List(2022-02-23 22:42:22.714738, CloseApp))",097d748a-4a7d-41d9-b30f-da81e7d8369e,7
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2021-12-31 13:12:15.964501, OpenApp), List(2021-12-31 13:12:21.477333, ViewHome), List(2021-12-31 13:13:45.695942, CloseApp))",9e46bff4-c850-4c58-adad-29be95e2af3f,8
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-04-02 22:10:15.965135, OpenApp), List(2022-04-02 22:10:27.726779, ViewHome), List(2022-04-02 22:12:29.964809, ViewList2), List(2022-04-02 22:12:19.465288, CloseApp))",74a59265-d116-4a17-8500-6f9515691551,9


**Question 1: Labelling the data.**

In [0]:
#In order to get the time from the events and sessions, we created some new columns:

Sessions_Dataframe = (
    sessions_df
        .withColumn("DateTime", f.col('session_events.datetime'))
        .withColumn('Session_TimeStamp', f.col('session_events')[0].getItem('datetime'))
        .withColumn('Month', f.date_trunc('Month', f.col('Session_TimeStamp')).astype('date') )
        .withColumn('Session_Started', f.to_timestamp(f.col('datetime')[0]))
        .withColumn('Session_Ended', f.to_timestamp(f.array_max(f.col('datetime'))))
        .withColumn('Time of Session', f.col('Session_Ended').cast('short') - f.col('Session_Started').cast('short')))

In [0]:
Sessions_Dataframe.display(5)

customer_id,session_events,session_id,session_rank,DateTime,Session_TimeStamp,Month,Session_Started,Session_Ended,Time of Session
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-04-17 13:37:15.957171, OpenApp), List(2022-04-17 13:37:27.126802, ViewHome), List(2022-04-17 13:38:47.358932, ViewSearch), List(2022-04-17 13:38:40.752893, CloseApp))",c6be7a50-cb0c-4c0c-81ca-3ed39c093718,0,"List(2022-04-17 13:37:15.957171, 2022-04-17 13:37:27.126802, 2022-04-17 13:38:47.358932, 2022-04-17 13:38:40.752893)",2022-04-17 13:37:15.957171,2022-04-01,2022-04-17T13:37:15.957+0000,2022-04-17T13:38:47.358+0000,92
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-03-31 18:32:15.958422, OpenApp), List(2022-03-31 18:32:24.135571, ViewHomeVariant), List(2022-03-31 18:33:44.611742, ViewList3), List(2022-03-31 18:33:50.718991, CloseApp))",5c3a3b8b-4f4b-4cbd-a7e5-5a23e54fb31b,1,"List(2022-03-31 18:32:15.958422, 2022-03-31 18:32:24.135571, 2022-03-31 18:33:44.611742, 2022-03-31 18:33:50.718991)",2022-03-31 18:32:15.958422,2022-03-01,2022-03-31T18:32:15.958+0000,2022-03-31T18:33:50.718+0000,95
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-03-21 14:42:15.959261, OpenApp), List(2022-03-21 14:42:19.472056, ViewHomeVariant), List(2022-03-21 14:44:34.538973, ViewCartCheckout), List(2022-03-21 14:43:54.681148, CloseApp))",a7547d25-f265-4457-b915-234c8732f642,2,"List(2022-03-21 14:42:15.959261, 2022-03-21 14:42:19.472056, 2022-03-21 14:44:34.538973, 2022-03-21 14:43:54.681148)",2022-03-21 14:42:15.959261,2022-03-01,2022-03-21T14:42:15.959+0000,2022-03-21T14:44:34.538+0000,139
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2021-10-30 23:59:15.960155, OpenApp), List(2021-10-30 23:59:22.620350, ViewHome), List(2021-10-31 00:01:54.519225, ViewSearch), List(2021-10-31 00:01:32.534300, CloseApp))",5ce162e4-2243-4f08-88a7-841fd89e860a,3,"List(2021-10-30 23:59:15.960155, 2021-10-30 23:59:22.620350, 2021-10-31 00:01:54.519225, 2021-10-31 00:01:32.534300)",2021-10-30 23:59:15.960155,2021-10-01,2021-10-30T23:59:15.960+0000,2021-10-31T00:01:54.519+0000,159
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2021-12-06 22:26:15.961028, OpenApp), List(2021-12-06 22:26:21.990505, ViewHome), List(2021-12-06 22:27:36.865840, ViewList2), List(2021-12-06 22:27:57.372898, CloseApp))",9af3c534-ed1b-4462-ba58-a478bbf9647f,4,"List(2021-12-06 22:26:15.961028, 2021-12-06 22:26:21.990505, 2021-12-06 22:27:36.865840, 2021-12-06 22:27:57.372898)",2021-12-06 22:26:15.961028,2021-12-01,2021-12-06T22:26:15.961+0000,2021-12-06T22:27:57.372+0000,102
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-01-04 20:19:15.961890, OpenApp), List(2022-01-04 20:19:23.117717, ViewHomeVariant), List(2022-01-04 20:21:10.343633, ViewList1), List(2022-01-04 20:20:15.479881, CloseApp))",e61e17e7-4a1e-4fa1-a0fa-8e2fb78fd552,5,"List(2022-01-04 20:19:15.961890, 2022-01-04 20:19:23.117717, 2022-01-04 20:21:10.343633, 2022-01-04 20:20:15.479881)",2022-01-04 20:19:15.961890,2022-01-01,2022-01-04T20:19:15.961+0000,2022-01-04T20:21:10.343+0000,115
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-02-23 23:40:15.962851, OpenApp), List(2022-02-23 23:40:22.790663, ViewHomeVariant), List(2022-02-23 23:42:48.297926, ViewList3), List(2022-02-23 23:42:04.507958, CloseApp))",f4e021ba-dea6-44e6-b88b-38c61204ecc3,6,"List(2022-02-23 23:40:15.962851, 2022-02-23 23:40:22.790663, 2022-02-23 23:42:48.297926, 2022-02-23 23:42:04.507958)",2022-02-23 23:40:15.962851,2022-02-01,2022-02-23T23:40:15.962+0000,2022-02-23T23:42:48.297+0000,153
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-02-23 22:41:15.963841, OpenApp), List(2022-02-23 22:41:24.333183, ViewHome), List(2022-02-23 22:42:22.714738, CloseApp))",097d748a-4a7d-41d9-b30f-da81e7d8369e,7,"List(2022-02-23 22:41:15.963841, 2022-02-23 22:41:24.333183, 2022-02-23 22:42:22.714738)",2022-02-23 22:41:15.963841,2022-02-01,2022-02-23T22:41:15.963+0000,2022-02-23T22:42:22.714+0000,67
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2021-12-31 13:12:15.964501, OpenApp), List(2021-12-31 13:12:21.477333, ViewHome), List(2021-12-31 13:13:45.695942, CloseApp))",9e46bff4-c850-4c58-adad-29be95e2af3f,8,"List(2021-12-31 13:12:15.964501, 2021-12-31 13:12:21.477333, 2021-12-31 13:13:45.695942)",2021-12-31 13:12:15.964501,2021-12-01,2021-12-31T13:12:15.964+0000,2021-12-31T13:13:45.695+0000,90
828d7bdf-96eb-4e61-af45-69dcc8ec73be,"List(List(2022-04-02 22:10:15.965135, OpenApp), List(2022-04-02 22:10:27.726779, ViewHome), List(2022-04-02 22:12:29.964809, ViewList2), List(2022-04-02 22:12:19.465288, CloseApp))",74a59265-d116-4a17-8500-6f9515691551,9,"List(2022-04-02 22:10:15.965135, 2022-04-02 22:10:27.726779, 2022-04-02 22:12:29.964809, 2022-04-02 22:12:19.465288)",2022-04-02 22:10:15.965135,2022-04-01,2022-04-02T22:10:15.965+0000,2022-04-02T22:12:29.964+0000,134


In [0]:
Sessions_Dataframe.count()

Out[11]: 5344922

In [0]:
# First, and due to the project's specifications, we needed to filter the last three months of our dataset.
# We also created some new variables that will be used in our prediction models. These are: 
#  - View_List3_&_Bought
#  - Convertion_rate_List3
#  - Average_Session_Time
#  - Target

Sessions_DataFrame = (
      Sessions_Dataframe
     .withColumn("Event", f.col('session_events.event'))
     .withColumn('Month', f.date_trunc('Month', f.col('Session_TimeStamp')).astype('date').astype('string'))
     .filter(f.col('Month')>='2022-02-01')
     .filter(f.col('Month')<='2022-04-01')
     .withColumn('View_Home_Variant', f.when(f.array_contains(f.col('Event'), 'ViewHomeVariant'), f.lit(1)).otherwise(f.lit(0)))
     .filter(f.col('View_Home_Variant')==1)
     .withColumn('ViewList3', f.when(f.array_contains(f.col('event'), 'ViewList3'), f.lit(1)).otherwise(f.lit(0)))
     .filter(f.col('ViewList3')==1)
     .withColumn('Converted', f.when(f.array_contains(f.col('Event'), 'CallbackPurchase'), f.lit(1)).otherwise(f.lit(0)))
     .groupBy(f.col('Month'), 'customer_id')
     .agg(
        f.sum(f.when( (f.col('ViewList3')==1), f.lit(1)).otherwise(f.lit(0)) ).alias('View_list3'),
        f.sum(f.when( (f.col('ViewList3')==1) & (f.col('Converted')==1), f.lit(1)).otherwise(f.lit(0)) ).alias('View_List3_&_Bought'),
        f.avg(f.col('Time of Session')).alias('Average_Session_Time'))
     .withColumn('Target', f.when(f.col('View_List3_&_Bought')>0, f.lit(1)).otherwise(f.lit(0)))
     .withColumn('Convertion_rate_List3', f.col('View_List3_&_Bought') / f.col('View_List3') * 100)
     .drop('View_Home_Variant')
     .drop('Session_TimeStamp')
     .drop('ViewList3')
     .drop('View_List3')
    )

In [0]:
Sessions_DataFrame.display(5)

Month,customer_id,View_List3_&_Bought,Average_Session_Time,Target,Convertion_rate_List3
2022-03-01,e7f524b3-c4a7-4d17-acc9-f147377d5187,0,107.0,0,0.0
2022-04-01,a0a3d963-96b6-4206-80fc-eae77f774abb,0,126.66666666666669,0,0.0
2022-04-01,4a05be7d-3081-4392-bd19-8c6d85f67781,0,110.8,0,0.0
2022-04-01,5ad8e54e-1f62-47ef-ab26-fb9240b78f32,0,152.0,0,0.0
2022-03-01,10bb2f5f-26c8-43b2-8ce5-5cbd91640754,0,111.83333333333331,0,0.0
2022-04-01,05ab8edc-157a-4a6e-a4e8-018c512dee11,0,99.0,0,0.0
2022-04-01,a1429c7f-b228-4f7c-bec9-0e372d4db568,0,95.0,0,0.0
2022-03-01,c0123e58-37f4-4e5b-91ea-d77c6bd3b1c8,0,153.0,0,0.0
2022-04-01,636e9244-9156-4084-b07d-91cdb943fe8d,0,111.33333333333331,0,0.0
2022-04-01,7189d2fc-493d-4b48-893f-56906c2ee459,0,122.0,0,0.0


In [0]:
# To see how many observations we were left with after filtering, we did a count. After filtering, about 5139 542 observations were removed.

Sessions_DataFrame.count()

Out[14]: 205380

**Question 2: Refining and Creating features with the available dataframe.**

In [0]:
# The main objetive in this phase is to create and refine variables using the different datasets available. After, we need to join the different datasets, so we can procede to the modeling phase.

Orders_Dataframe= (
    orders_df
    .toDF('Order_ID', 'Session_ID', 'Order_Timestamp', 'Customer_ID', 'Total_Value', 'Discount_Value', 'Order_Category')
    .withColumn('Order_TimeStamp', f.col('Order_Timestamp').astype('timestamp'))
    .withColumn('Month', f.date_trunc('Month', f.col('Order_Timestamp')).astype('date') )
)

In [0]:
Orders_Dataframe.display(5)

Order_ID,Session_ID,Order_TimeStamp,Customer_ID,Total_Value,Discount_Value,Order_Category,Month
79b00fa4-becd-4ec0-81dc-f31f13b66de6,5da07198-2b4c-4110-90b4-6aa6af8d3320,2022-04-05T13:31:43.197+0000,8d8bbd13-93a0-471e-a531-1853352555d0,55.3,0.0,Pizza,2022-04-01
1b90581a-2b81-486c-ad24-ea8c30b3357d,6d6d7281-7885-45bb-b67d-f4503c3d16d6,2022-01-21T20:16:05.580+0000,8d8bbd13-93a0-471e-a531-1853352555d0,67.0,0.0,Alc Beverages,2022-01-01
54b076d9-b625-46dc-9736-b36678fa7621,f8cf2f0c-c306-41b9-b90b-eb7505267da4,2022-01-15T17:08:08.965+0000,a58fa5da-c1c7-4698-99a2-9ff402ac7bc5,96.4,0.0,Burger,2022-01-01
04c9e979-dac6-4115-bfb9-c31e2996ec46,490fda88-fc4a-4427-87dd-7ea257a48df3,2022-03-18T17:19:37.670+0000,a58fa5da-c1c7-4698-99a2-9ff402ac7bc5,100.5,0.0,Burger,2022-03-01
4d909377-d214-4093-b973-35275f7b659e,31875064-7d5c-4751-98cc-8fa3469fd9d9,2022-04-03T21:13:07.433+0000,a58fa5da-c1c7-4698-99a2-9ff402ac7bc5,86.8,0.0,Alc Beverages,2022-04-01
2c859285-7949-4c07-983c-cc3abf284902,d3a64874-58d5-4697-ae8d-fe58a166b8cd,2021-06-11T20:03:35.160+0000,a58fa5da-c1c7-4698-99a2-9ff402ac7bc5,158.9,0.0,Alc Beverages,2021-06-01
e2070954-39a2-435b-ae84-e2a74f427b5e,3f896436-7e0f-451f-8d3e-927df548d3c9,2022-03-02T12:24:19.527+0000,a58fa5da-c1c7-4698-99a2-9ff402ac7bc5,119.8,0.0,Alc Beverages,2022-03-01
0ca541b6-65d9-477e-9bcd-2d6cfe33a1e2,ecd84a5c-a436-487c-b793-818d5183d775,2022-03-25T11:09:08.483+0000,a58fa5da-c1c7-4698-99a2-9ff402ac7bc5,95.5,0.0,Burger,2022-03-01
101bdc15-aa53-42a8-a574-12eae99bef42,7e361a74-9ba2-45e5-a199-0f8b22abccf6,2022-03-18T15:50:51.138+0000,a58fa5da-c1c7-4698-99a2-9ff402ac7bc5,103.0,0.0,Burger,2022-03-01
e5b620d8-c916-4bf3-a503-7e49cf16ff85,6651d4c8-d600-488d-8e76-e2a2275a4ffc,2022-04-20T23:03:07.706+0000,a58fa5da-c1c7-4698-99a2-9ff402ac7bc5,76.0,0.0,Pizza,2022-04-01


In [0]:
#Using the Customer Dataframe, we decided to create two new columns (Install_(With Investment and Install_(Without Investment). The idea here, and assuming that the column "install_origin" is related to the way on how the users decided to install the app, is to differenciate the paid sources (Meta and SMS) from the free ones (Email and Organic).

Customer_Dataframe = (
    cust_df
    .withColumn('Install_(With Investment)', f.when((f.col('install_origin') == "Meta") | (f.col('install_origin') == "SMS" ), True).astype('int'))
    .withColumn('Install_(Without Investment)', f.when((f.col('install_origin') == "Organic") | (f.col('install_origin') == "Email" ), True).astype('int'))
    .drop("city")
    .drop("install_origin")
)

In [0]:
Customer_Dataframe.display(5)

customer_id,is_referee,device_type,Install_(With Investment),Install_(Without Investment)
828d7bdf-96eb-4e61-af45-69dcc8ec73be,False,Low-End,,1.0
db024faf-8d37-4519-b4fe-38e90f7868b3,False,High-End,1.0,
877b2d99-ae5d-482f-8f18-5ed92ac9d2ee,True,High-End,1.0,
4778c29e-9176-48d6-ac8f-6a4f8c596cdb,False,High-End,1.0,
2f43a584-3897-4e2b-8152-fff6acdd3c2c,False,High-End,,1.0
3876600f-493b-453d-9c56-9fdf06c3a097,False,Low-End,1.0,
62939050-7340-46a7-93c3-9e30425fc9da,False,High-End,1.0,
74aa4cda-8dfb-4639-8a57-051e26215113,False,High-End,,1.0
808513ca-d8be-4416-82f5-66cf56cf9c92,False,High-End,1.0,
73ddae0c-3c6d-4755-8190-32a7c24a2aee,False,High-End,,1.0


In [0]:
#Since we have observations with null values on both of the new columns, we need to fill them with 0. 

Customer_DataFrame = (
    Customer_Dataframe
    .fillna({"Install_(Without Investment)": '0'})
    .fillna({"Install_(With Investment)": '0'})   
)

In [0]:
Customer_DataFrame.display(5)

customer_id,is_referee,device_type,Install_(With Investment),Install_(Without Investment)
828d7bdf-96eb-4e61-af45-69dcc8ec73be,False,Low-End,0,1
db024faf-8d37-4519-b4fe-38e90f7868b3,False,High-End,1,0
877b2d99-ae5d-482f-8f18-5ed92ac9d2ee,True,High-End,1,0
4778c29e-9176-48d6-ac8f-6a4f8c596cdb,False,High-End,1,0
2f43a584-3897-4e2b-8152-fff6acdd3c2c,False,High-End,0,1
3876600f-493b-453d-9c56-9fdf06c3a097,False,Low-End,1,0
62939050-7340-46a7-93c3-9e30425fc9da,False,High-End,1,0
74aa4cda-8dfb-4639-8a57-051e26215113,False,High-End,0,1
808513ca-d8be-4416-82f5-66cf56cf9c92,False,High-End,1,0
73ddae0c-3c6d-4755-8190-32a7c24a2aee,False,High-End,0,1


In [0]:
# Since the Orders dataset is the most complete one, we need to have all variables present in this dataset. 

w_lag = Window.partitionBy(f.col("customer_id")).orderBy(f.col("order_timestamp"))
time_window = Window.partitionBy('customer_id')

Orders_Features = (
    Orders_Dataframe
     .filter(f.col('Month')>='2022-02-01')
     .filter(f.col('Month')<='2022-04-01')
    .withColumn('Monetary', f.col('Total_Value') - f.col('Discount_Value'))
    .withColumn("Last_Order", f.max("Order_Timestamp").over(w_lag))
    .withColumn("Recency", f.datediff(f.current_date(), f.col('Last_Order')))
    .withColumn("Order_Date_Lag", f.lag("Order_Timestamp", offset=1, default=None).over(w_lag))
    .withColumn("Days_Since_Last_Order", f.datediff(f.col('Order_Timestamp'), f.col('Order_Date_Lag')))
    .withColumn('Order_Hour', f.hour(col('Order_Timestamp')) )
    .withColumn('Shift', f.when(f.col('Order_Hour') <= 10, 'BreakFast').when(f.col('Order_Hour')<=17, 'Lunch' ).otherwise('Dinner'))
    .withColumn('Discount_Percentage', f.round(f.col('Discount_Value') / f.col('Total_Value'), 2) )
    .withColumn('Discount_Range', f.when(f.col('Discount_Percentage') <= 0.10, '0-10%').when(f.col('Discount_Percentage') <= 0.20, '10-20%').otherwise('30%+'))
    .withColumn('Japanese', f.when(f.col('Order_Category') == 'Japanese', f.col('Monetary')).otherwise(0))
    .withColumn('Pizza', f.when(f.col('Order_Category') == 'Pizza', f.col('Monetary')).otherwise(0))
    .withColumn('Burger', f.when(f.col('Order_Category') == 'Burger', f.col('Monetary')).otherwise(0))
    .withColumn('Vegetarian', f.when(f.col('Order_Category') == 'Vegetarian', f.col('Monetary')).otherwise(0))
    .withColumn('Alc Beverages', f.when(f.col('Order_Category') == 'Alc Beverages', f.col('Monetary')).otherwise(0))
    .withColumn('First_Order_TimeStamp', f.min(f.col('Order_Timestamp')).over(time_window))
    .withColumn('Days_Since_First_Order', f.datediff(f.col('Order_Timestamp'), f.col('First_Order_TimeStamp')))
)

In [0]:
Input_Data = (
    Orders_Features
    .groupBy('Month', 'Customer_ID')
    .agg(
        f.countDistinct('Order_ID').alias('Frequency'),
        f.round(f.avg('Days_Since_Last_Order'),1).alias('Average_Time_to_Reorder'),
        f.round(f.sum('Monetary'),1).alias('Monetary'),
        f.max('Recency').alias('Recency'),
        f.round(f.avg('Order_Hour'),0).alias('Average_Order_Hour'),
        f.round(f.avg('Discount_Percentage'),3).alias('Average_Discount_Percentage'),
        f.max('Discount_Percentage').alias('Best_Discount_per_Month'),     
        f.round(f.sum("Japanese")/ f.sum('total_value'),2).alias('Japanese_Value_Ratio'),
        f.round(f.sum("Pizza")/ f.sum('Monetary'),2).alias('Pizza_Value_Ratio'),
        f.round(f.sum("Burger")/ f.sum('Monetary'),2).alias('Burger_Value_Ratio'),
        f.round(f.sum("Vegetarian")/ f.sum('Monetary'),2).alias('Vegetarian_Value_Ratio'),
        f.round(f.sum("Alc Beverages")/ f.sum('Monetary'),2).alias('Alc Beverages_Value_Ratio'),
        f.sum(f.when( (f.col('Shift') == "BreakFast"), f.lit(1)).otherwise(f.lit(0)) ).alias('BreakFast'),
        f.sum(f.when( (f.col('Shift') == "Lunch"), f.lit(1)).otherwise(f.lit(0)) ).alias('Lunch'),
        f.sum(f.when( (f.col('Shift') == "Dinner"), f.lit(1)).otherwise(f.lit(0)) ).alias('Dinner'))
     .withColumn("BreakFast_Ratio",  f.round(f.col("BreakFast")/ f.col('frequency'), 2))
     .withColumn("Lunch_Ratio",  f.round(f.col("Lunch")/ f.col('Frequency'), 2))
     .withColumn("Dinner_Ratio",  f.round(f.col("Dinner")/ f.col('Frequency'), 2))
     .withColumn("Average_Money_Spent", f.round(f.col('Monetary') / f.col('Frequency'), 2))
    .drop('BreakFast')
    .drop('Dinner')
    .drop('Lunch')
)

In [0]:
input_data1 = (
    Input_DaTa
    .fillna({"Average_Time_to_Reorder": '0'})
)

In [0]:
Input_DaTa = (
    Input_Data
     .join(Sessions_DataFrame, ['Customer_ID', 'Month'], 'inner')
)

In [0]:
Predictions_Data = (
    Input_DaTa
    .join(Customer_DataFrame, ['customer_id'], 'inner')
    .drop()
)

In [0]:
Predictions_Data.display(5)

Customer_ID,Month,Frequency,Average_Time_to_Reorder,Monetary,Recency,Average_Order_Hour,Average_Discount_Percentage,Best_Discount_per_Month,Japanese_Value_Ratio,Pizza_Value_Ratio,Burger_Value_Ratio,Vegetarian_Value_Ratio,Alc Beverages_Value_Ratio,BreakFast_Ratio,Lunch_Ratio,Dinner_Ratio,Average_Money_Spent,View_List3_&_Bought,Average_Session_Time,Target,Convertion_rate_List3,is_referee,device_type,Install_(With Investment),Install_(Without Investment)
cb8edd68-6a59-4b71-87f3-4ec547866a16,2022-03-01,1,8.0,60.9,106,14.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,60.9,0,125.75,0,0.0,True,Low-End,0,1
cc9d9d2b-d367-4c8d-a5d7-dd10ffcdeb6a,2022-04-01,1,15.0,53.5,66,16.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,53.5,0,117.5,0,0.0,True,Low-End,1,0
cdecd3af-be67-4ac2-ae47-7b420774053c,2022-03-01,5,5.0,478.6,99,17.0,0.0,0.0,0.0,0.21,0.0,0.79,0.0,0.0,0.4,0.6,95.72,0,139.0,0,0.0,False,High-End,0,1
cf029240-b265-4cd3-a56e-66cd31e5a038,2022-04-01,4,3.5,232.6,79,16.0,0.0,0.0,0.0,0.0,0.28,0.0,0.72,0.0,0.5,0.5,58.15,0,119.0,0,0.0,False,Low-End,1,0
cf15a938-856a-4835-b365-276520297daa,2022-03-01,2,15.0,191.8,87,15.0,0.0,0.0,0.0,0.0,0.0,0.49,0.51,0.0,0.5,0.5,95.9,0,130.66666666666666,0,0.0,False,High-End,0,1
d47fb717-2793-43a2-9384-8a7737ee42e4,2022-04-01,4,5.0,206.8,79,19.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.25,0.75,51.7,0,113.0,0,0.0,True,Low-End,1,0
d4ccb467-32b5-4c7f-b7e2-b4df69a64854,2022-03-01,2,21.5,112.8,101,19.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.5,56.4,0,105.75,0,0.0,False,Low-End,0,1
d5d33db9-6de9-43b1-9f9a-867fdf8825e9,2022-02-01,2,5.0,107.0,138,21.0,0.0,0.0,0.0,0.0,0.0,0.6,0.4,0.0,0.0,1.0,53.5,0,70.0,0,0.0,False,High-End,1,0
d5eba3c0-3084-489a-b39b-7de63b42f3e6,2022-03-01,20,1.5,6477.7,108,15.0,0.0,0.0,0.0,0.59,0.39,0.02,0.0,0.0,0.75,0.25,323.89,0,116.14285714285714,0,0.0,False,High-End,1,0
dce09d0f-2737-43ba-a5d5-dabba4779464,2022-02-01,7,3.5,470.6,133,16.0,0.0,0.0,0.0,0.15,0.14,0.0,0.7,0.0,0.43,0.57,67.23,1,133.25,1,25.0,True,Low-End,0,1


In [0]:
# Spliting the DataFrame in two: Training (the first two months) and Test (the last month)

train_data = (
    Predictions_Data
     .filter(f.col('month')>='2022-02-01')
     .filter(f.col('month')<'2022-04-01')
     .drop('Month')
)

In [0]:
test_data = (
    Predictions_Data
     .filter(f.col('Month')>='2022-04-01')
     .filter(f.col('Month')<'2022-05-01')
     .drop('Month')
)

**Question 3: Creating a Machine Learning pipeline to preprocess the data; Training and Tuning the model.**

**Logistic Regression:**

In [0]:
IDENTIFIERS = ["customer_id"]
 
CONTINUOUS_COLUMNS = [
    'Frequency',
    'Average_Time_to_Reorder',
    'Monetary',
    'Recency',
    'Convertion_rate_List3',
    'Average_Session_Time',
    'Average_Money_Spent',
    'Dinner_Ratio',
    'Lunch_Ratio',
    'BreakFast_Ratio',
    'Alc Beverages_Value_Ratio',
    'Vegetarian_Value_Ratio',
    'Burger_Value_Ratio',
    'Pizza_Value_Ratio',
    'Japanese_Value_Ratio',
    'Best_Discount_per_Month',
    'Average_Discount_Percentage',
    'Average_Order_Hour'
]
 
TARGET_COLUMN = ['target']

BINARY_COLUMNS = ['is_referee', 'Paid_Install']

CATEGORICAL_COLUMNS = ['device_type']

In [0]:
from pyspark.ml.feature import Imputer, VectorAssembler, StringIndexer, OneHotEncoder, MinMaxScaler, RobustScaler, PCA
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

impute_cor = Imputer(inputCols=CONTINUOUS_COLUMNS, outputCols=CONTINUOUS_COLUMNS)
assemble_cor = VectorAssembler(inputCols=CONTINUOUS_COLUMNS, outputCol='continuous_features_cor')

##### Correlation Matrix

In [0]:
train_data_transformed = assemble_cor.transform(train_data)
 
#train_data_transformed.display()

In [0]:
from pyspark.ml.stat import Correlation
import pandas as pd

correlation = Correlation.corr(
    train_data_transformed, "continuous_features_cor"
)

correlation_array = correlation.head()[0].toArray()

correlation_pd = pd.DataFrame(
    correlation_array,
    index=CONTINUOUS_COLUMNS,
    columns=CONTINUOUS_COLUMNS,
)

correlation.pd.display()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-1397928146843179>[0m in [0;36m<module>[0;34m[0m
[1;32m      2[0m [0;32mimport[0m [0mpandas[0m [0;32mas[0m [0mpd[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0;34m[0m[0m
[0;32m----> 4[0;31m correlation = Correlation.corr(
[0m[1;32m      5[0m     [0mtrain_data_transformed[0m[0;34m,[0m [0;34m"continuous_features_cor"[0m[0;34m[0m[0;34m[0m[0m
[1;32m      6[0m )

[0;32m/databricks/spark/python/pyspark/ml/stat.py[0m in [0;36mcorr[0;34m(dataset, column, method)[0m
[1;32m    163[0m         [0mjavaCorrObj[0m [0;34m=[0m [0m_jvm[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0morg[0m[0;34m.[0m[0mapache[0m[0;34m.[0m[0mspark[0m[0;34m.[0m[0mml[0m[0;34m.[0m[0mstat[0m[0;34m.[0m[0mCorrelation[0m[0;34m[0m[0;34m[0m[0m
[1;32m    164[0m         [0m

##### Features with correlation - completar

##### PCA

In [0]:
IDENTIFIERS = ["customer_id"]
 
CONTINUOUS_COLUMNS = [
    'Frequency',
    'Average_Time_to_Reorder',
    'Recency',
    'Convertion_rate_List3',
    'Average_Session_Time',
    'BreakFast_Ratio',
    'Alc Beverages_Value_Ratio',
    'Vegetarian_Value_Ratio',
    'Burger_Value_Ratio',
    'Pizza_Value_Ratio',
    'Japanese_Value_Ratio',
    'Best_Discount_per_Month',
    'Average_Discount_Percentage',
]
 
PCA_COLUMNS = [
    'Monetary',
    'Average_Money_Spent',
    'Dinner_Ratio',
    'Lunch_Ratio',
    'Average_Order_Hour'
]    
    
TARGET_COLUMN = ['target']

BINARY_COLUMNS = ['is_referee', 'Paid_Install']

CATEGORICAL_COLUMNS = ['device_type']

In [0]:
impute_PCA = Imputer(inputCols=PCA_COLUMNS, outputCols=PCA_COLUMNS)
assemble_PCA = VectorAssembler(inputCols=PCA_COLUMNS, outputCol='pca_features')

train_data_transformed = assemble_PCA.transform(train_data)

In [0]:
pca = PCA(k=1, inputCol='pca_features', outputCol='pcaFeatures')
PCA_model = pca.fit(train_data_transformed)

PCA_model.explainedVariance
#PCA_model_transformed.show(truncate=False)

Out[56]: DenseVector([0.9985])

In [0]:
Impute_Null = Imputer(inputCols=['Frequency', 'Average_Time_to_Reorder'], outputCols=['Frequency', 'Average_Time_to_Reorder'])
Vector_Assembler = VectorAssembler(inputCols=['Frequency', 'Average_Time_to_Reorder', 'Monetary', 'Recency','Average_Money_Spent'], outputCol='Continuous_Features')
Index = StringIndexer(inputCols=['device_type'], outputCols=['Device_Type_IDX']) 
One_Hot_Enconder = OneHotEncoder(inputCol='Device_Type_IDX', outputCol='Device_Type_Vector')
Scaler = MinMaxScaler(inputCol='Continuous_Features', outputCol='Scaled_Continuous_Features')
Final_Vector_Assembler = VectorAssembler(inputCols=['Scaled_Continuous_Features', 'Device_Type_Vector', 'is_referee'], outputCol='Features')
Log_Regression = LogisticRegression(featuresCol="Features", labelCol="Target", predictionCol="prediction")

pipe_Regression = Pipeline()
pipe_Regression.setStages(
    [
        Impute_Null,
        Vector_Assembler,
        Index,
        One_Hot_Enconder,
        Scaler,
        Final_Vector_Assembler,
        Log_Regression
    ]
)

Out[57]: Pipeline_8fca826d7da4

In [0]:
#To train the model without any tuning, we do the fit. It will generate a model, to which we will apply a transform on the train data and generate the columns that will be generated by our pipeline. 

pipe_Regression_model = pipe_Regression.fit(train_data)
fitted_data_Regression = pipe_Regression_model.transform(train_data)
fitted_data_Regression.display()

In [0]:
#Evaluating the Model: 

from pyspark.ml.evaluation import BinaryClassificationEvaluator
 
evaluator = BinaryClassificationEvaluator(
    labelCol="Target",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
 
metric = evaluator.evaluate(fitted_data_Regression)
print(f"Area under ROC = {metric} ")

In [0]:
# Tuning the Model (HyperParameter Tuning).

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = (
    ParamGridBuilder()
    .addGrid(Log_Regression.regParam, [0.5, 0.4, 0.03])
    .addGrid(Log_Regression.elasticNetParam, [0.0, 0.6, 1.0])
    .addGrid(Log_Regression.aggregationDepth, [2.0, 4.0, 7.0])
    .build()
)

In [0]:
!pip install mlflow --quiet

In [0]:
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
dbutils.fs.put("file:///root/.databrickscfg","[DEFAULT]\nhost=https://community.cloud.databricks.com\ntoken = "+token,overwrite=True)

In [0]:
from pyspark.ml.tuning import CrossValidator 
import mlflow
from mlflow import spark

mlflow.pyspark.ml.autolog()
 
mlflow.start_run()
CrossValidator = CrossValidator(
    estimator=pipe_Regression,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)
 
CrossValidator_Model = CrossValidator.fit(train_data)

In [0]:
import mlflow
from mlflow import spark

mlflow.pyspark.ml.autolog()
 
mlflow.start_run()
CrossValidator = CrossValidator(
    estimator=pipe_Regression,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=4
)
 
CrossValidator_Model = CrossValidator.fit(train_data)

In [0]:
mlflow.spark.log_model(CrossValidator_Model.bestModel, "model-file")# logs model as artifacts
mlflow.end_run()

In [0]:
The_Best_Model_Log_Regression = CrossValidator_Model.bestModel

In [0]:
Fitted_Test_Data = The_Best_Model_Log_Regression.transform(test_data)

In [0]:
Train_metric = evaluator.evaluate(fitted_data_Regression)
Test_metric = evaluator.evaluate(Fitted_Test_Data)

print(f"Area under ROC on TRAIN= {train_metric}")
print(f"Area under ROC on TEST= {test_metric}")

**Decision Tree:**

In [0]:
from pyspark.ml.feature import Imputer, VectorAssembler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline

Impute_Null = Imputer(inputCols=['Frequency', 'Average_Time_to_Reorder'], outputCols=['Frequency', 'Average_Time_to_Reorder'])
Vector_Assembler = VectorAssembler(inputCols=['Frequency', 'Average_Time_to_Reorder', 'Monetary', 'Recency','Average_Money_Spent'], outputCol='Continuous_Features')
Index = StringIndexer(inputCols=['device_type'], outputCols=['Device_Type_IDX']) 
One_Hot_Enconder = OneHotEncoder(inputCol='Device_Type_IDX', outputCol='Device_Type_Vector')
Scaler = MinMaxScaler(inputCol='Continuous_Features', outputCol='Scaled_Continuous_Features')
Final_Vector_Assembler = VectorAssembler(inputCols=['Scaled_Continuous_Features', 'Device_Type_Vector', 'is_referee'], outputCol='Features')
Decision_Tree = DecisionTreeClassifier(featuresCol="Features", labelCol="Target", predictionCol="prediction")

pipe_DecisionTree = Pipeline()
pipe_DecisionTree.setStages(
    [
        Impute_Null,
        Vector_Assembler,
        Index,
        One_Hot_Enconder,
        Scaler,
        Final_Vector_Assembler,
        Decision_Tree
    ]
)

In [0]:
pipe_DecisionTree_model = pipe_DecisionTree.fit(train_data)
fitted_data_DecisionTree = pipe_DecisionTree_model.transform(train_data)
fitted_data_DecisionTree.display()

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
 
evaluator = BinaryClassificationEvaluator(
    labelCol="Target",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
 
metric = evaluator.evaluate(fitted_data_DecisionTree)
print(f"Area under ROC = {metric} ")

In [0]:
paramGrid = (
    ParamGridBuilder()
    .addGrid(Decision_Tree.minInstancesPerNode, [2 , 4 , 6])
    .addGrid(Decision_Tree.maxBins, [25])
    .addGrid(Decision_Tree.maxDepth, [4, 5 , 10 ])
    .addGrid(Decision_Tree.minWeightFractionPerNode, [0.2 , 0.5])
    .build()
)

In [0]:
mlflow.pyspark.ml.autolog()
 
mlflow.start_run()
CrossValidator_DecisionTree = CrossValidator(
    estimator=pipe_DecisionTree,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)
 
CrossValidator_Model_DecisionTree = CrossValidator_DecisionTree.fit(train_data)

In [0]:
mlflow.spark.log_model(CrossValidator_Model_DecisionTree.bestModel, "model-file")# logs model as artifacts
mlflow.end_run()

In [0]:
The_Best_Model_DecisionTree = CrossValidator_Model_DecisionTree.bestModel

In [0]:
Fitted_Test_Data_DecisionTree = The_Best_Model_DecisionTree.transform(test_data)

In [0]:
Train_metric_DecisionTree = evaluator.evaluate(fitted_data_DecisionTree)
Test_metric_DecisionTree = evaluator.evaluate(Fitted_Test_Data_DecisionTree)

print(f"Area under ROC on TRAIN= {train_metric}")
print(f"Area under ROC on TEST= {test_metric}")

**Random Forest:**

In [0]:
from pyspark.ml.feature import Imputer, VectorAssembler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

Impute_Null = Imputer(inputCols=['Frequency', 'Average_Time_to_Reorder'], outputCols=['Frequency', 'Average_Time_to_Reorder'])
Vector_Assembler = VectorAssembler(inputCols=['Frequency', 'Average_Time_to_Reorder', 'Monetary', 'Recency','Average_Money_Spent'], outputCol='Continuous_Features')
Index = StringIndexer(inputCols=['device_type'], outputCols=['Device_Type_IDX']) 
One_Hot_Enconder = OneHotEncoder(inputCol='Device_Type_IDX', outputCol='Device_Type_Vector')
Scaler = MinMaxScaler(inputCol='Continuous_Features', outputCol='Scaled_Continuous_Features')
Final_Vector_Assembler = VectorAssembler(inputCols=['Scaled_Continuous_Features', 'Device_Type_Vector', 'is_referee'], outputCol='Features')
Random_Forest = RandomForestClassifier(featuresCol="Features", labelCol="Target", predictionCol="prediction")

pipe_RandomForest = Pipeline()
pipe_RandomForest.setStages(
    [
        Impute_Null,
        Vector_Assembler,
        Index,
        One_Hot_Enconder,
        Scaler,
        Final_Vector_Assembler,
        Random_Forest
    ]
)

In [0]:
pipe_RandomForest_model = pipe_RandomForest.fit(train_data)
fitted_data_RandomForest = pipe_RandomForest_model.transform(train_data)
fitted_data_RandomForest.display()

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
 
evaluator = BinaryClassificationEvaluator(
    labelCol="Target",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
 
metric = evaluator.evaluate(fitted_data_RandomForest)
print(f"Area under ROC = {metric} ")

In [0]:
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxBins, [2, 10, 25])
    .addGrid(rf.maxDepth, [5, 10 , 20 ])
    .addGrid(rf.subsamplingRate, [0.0, 0.5 , 1])
    .build()
)

In [0]:
mlflow.pyspark.ml.autolog()
 
mlflow.start_run()
CrossValidator_RandomForest = CrossValidator(
    estimator=pipe_RandomForest,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)
 
CrossValidator_Model_RandomForest = CrossValidator_RandomForest.fit(train_data)

In [0]:
mlflow.spark.log_model(CrossValidator_Model_RandomForest.bestModel, "model-file")# logs model as artifacts
mlflow.end_run()

In [0]:
The_Best_Model_RandomForest = CrossValidator_Model_RandomForest.bestModel

In [0]:
Fitted_Test_Data_RandomForest = The_Best_Model_RandomForest.transform(test_data)

In [0]:
Train_metric_DecisionTree = evaluator.evaluate(fitted_data_RandomForest)
Test_metric_DecisionTree = evaluator.evaluate(Fitted_Test_Data_RandomForest)

print(f"Area under ROC on TRAIN= {train_metric}")
print(f"Area under ROC on TEST= {test_metric}")