##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [1]:
#%pip install pandas 
#%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

In [2]:
# !pip uninstall scikit-learn --yes
# !pip uninstall imblearn --yes
# !pip install scikit-learn==1.2.2
# !pip install imblearn

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [3]:
import seaborn as sns
import statistics
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Can have as many cells as you want for code
import pandas as pd
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

In [None]:
df = pd.read_parquet(filepath)
df.head()

In [6]:
#replacing all na values to 0 since there is initially 2 distinct values in this column, na values and 1.
df["f_purchase_lh"]=df["f_purchase_lh"].fillna(0)

We drop these columns because these columns either have too many missing values or all of the values are None

In [7]:
df = df.drop(columns=pd.Index(["flg_affconnect_lapse_ever", "hlthclaim_cnt_success","giclaim_cnt_success","recency_cancel", "recency_lapse"]))

we convert these two columns that has values to 1 and those without values to be 0, those with numbers will be considered recent those without will be cosnidered not recent or never visited (caution run once only if not code will be executed twice and the 0 will now be changed to 1)

In [8]:
#We decided to drop the following columns with `ape_`, `sumins_`, `prempaid_*` because the customer would not know these information. 
#Hence, it will not affect their decision-making of whether they would buy the insurance (target variable)
spike_cols = [col for col in df.columns if 'ape_' in col[:4]]
df = df.drop(columns = pd.Index(spike_cols))

sumins_cols = [col for col in df.columns if 'sumins_' in col[:len("sumins_")]]
df = df.drop(columns = pd.Index(sumins_cols))

prempaid_cols = [col for col in df.columns if 'prempaid_' in col[:len("prempaid_")]]
df = df.drop(columns = pd.Index(prempaid_cols))

### The following columns has nothing to do with the target variable, hence it will be deleted


In [9]:
#drop these agents specific parameters as it has no influence on customer purchasing decision

for names in ["clmcon_visit_days", "recency_clmcon", "recency_clm_regis", "flg_hlthclaim_", "flg_gi_claim_" , "f_ever_bought_", "n_months_last_bought" , "lapse_ape_", "n_months_since_lapse_", "cltsex_fix"]:
    spike_cols = [col for col in df.columns if names in col[:len(names)]]
    df = df.drop(columns = pd.Index(spike_cols))

df = df.drop(columns = pd.Index(["clttype", "stat_flag", "min_occ_date", "recency_giclaim_success", "giclaim_cnt_unsuccess", "recency_giclaim_unsuccess"]), axis = 1)

In [10]:
#converting nan values into 0 while converting those postiive values into 1
# 1 to indicate recency and 0 to indicate non-recent or never claim
df["recency_giclaim"] = df["recency_giclaim"].apply(lambda x: 1 if pd.notna(x) else 0)
df["recency_hlthclaim"] = df["recency_hlthclaim"].apply(lambda x: 1 if pd.notna(x) else 0)

#### For the two columns that are in the code below, we converted them to float since they are in Object datatype initially and filled the missing values with the median

In [11]:
df["giclaim_amt"] = df["giclaim_amt"].astype("float64")
gi_claim_median = df["giclaim_amt"].median()
df["giclaim_amt"] = df["giclaim_amt"].fillna(gi_claim_median)

df["hlthclaim_amt"] = df["hlthclaim_amt"].astype("float64")
hlthclaim_median = df["hlthclaim_amt"].median()
df["hlthclaim_amt"] = df["hlthclaim_amt"].fillna(hlthclaim_median)

In [12]:
# the following column refers to the Total number of in-force and canceled policies.
# the unique values are [nan  1.  3.  2.  4.  6.]
# nan would mean 0 cancelled policies
df["tot_cancel_pols"]=df["tot_cancel_pols"].fillna(0)

In [13]:
non_numeric_cols = df.select_dtypes(include=["string", "object"]).columns
df_numeric = df.drop(columns=non_numeric_cols)

In [None]:
df.shape

In [None]:
# Checking the proportion of NA values in each columns
df.isna().sum()/(df.shape[0])

In [None]:
# find the distinct/unique values in each numerical columns of the dataset
id = 1
for col in df_numeric.columns:
    print(id, col, ":  ", df[col].unique())
    id += 1

#### "is_dependent_in_at_least_1_policy", "f_ever_declined_la", "flg_affconnect_show_interest_ever", "flg_affconnect_ready_to_buy_ever", "affcon_visit_days" are columns that have either [nan 1] or [0 nan], or in a similar form. Hence, nan values will be converted to either 0 or 1 depending on the counterpart

In [17]:
df["flg_affconnect_ready_to_buy_ever"] = df["flg_affconnect_ready_to_buy_ever"].fillna(0)
df["flg_affconnect_show_interest_ever"] = df["flg_affconnect_show_interest_ever"].fillna(0)
df["f_ever_declined_la"] = df["f_ever_declined_la"].fillna(0)
df["is_dependent_in_at_least_1_policy"] = df["is_dependent_in_at_least_1_policy"].fillna(1)

In [18]:

df= df.drop(["clntnum", "race_desc"], axis=1)

## Majority of the clients are Singaporean as shown below. Hence, we are going to focus on Singaporean clients

In [None]:
df["ctrycode_desc"]

In [None]:
df["ctrycode_desc"].value_counts()

In [21]:
df = df[df["ctrycode_desc"] == "Singapore"]
df = df.drop("ctrycode_desc", axis = 1)

In [None]:
id = 1
for col in df.columns:
    print(id, col, ":  ", df[col].unique())
    id += 1

#### Editing column feature: DOB -> Age
Filling missing values with the median age

In [23]:
# converting DOB to age
age_list = list()
for x in df["cltdob_fix"]:
    if x.lower() != "none":
        year = int(x[:4])
        age = 2024 - year
        age_list.append(age)

    
df["cltdob_fix"] = pd.Series(age_list)
median_value = df["cltdob_fix"].median()
df["cltdob_fix"] = df["cltdob_fix"].replace({None: np.nan})
df["cltdob_fix"] = df["cltdob_fix"].fillna(median_value)

### Using hh_20 column and pop_20 columns as the X_train data for KNN imputation of hh_size_est since there are links between these 3 columns. This is because we checked that there are the same number of missing values for the three columns

In [24]:
print(sum(df["hh_20"].isna()))

2610


In [None]:
print(sum(df["pop_20"].isna()))

In [None]:
print(df["hh_20"].value_counts().sum())
print(df.shape[0] - df["hh_20"].value_counts().sum())

In [27]:
df["hh_20"] = df["hh_20"].fillna(-1)
df["pop_20"] = df["pop_20"].fillna(-1)

In [28]:
df["hh_20"] = df["hh_20"].astype(int)
df["pop_20"] = df["pop_20"].astype(int)

In [29]:
#calculating median value for hh_20 column
hh_20_lst = list()
for i in df["hh_20"]:
    if i != -1:
        hh_20_lst.append(i)
hh_20_median = statistics.median(hh_20_lst)
print(hh_20_median)


88


In [None]:
#calculating median value for pop_20 column
pop_20_lst = list()
for i in df["pop_20"]:
    if i != -1:
        pop_20_lst.append(i)
pop_20_median = statistics.median(pop_20_lst)
print(pop_20_median)

In [31]:
df["hh_20"] = df["hh_20"].replace(-1, hh_20_median)

In [32]:
sum(df["hh_20"] == -1)

0

In [33]:
df["pop_20"] = df["pop_20"].replace(-1, hh_20_median)

In [34]:
sum(df["pop_20"] == -1)

0

In [None]:
print(sum(df["hh_size_est"].isna()))

In [36]:
df["hh_size_est"] = df["hh_size_est"].replace(">4", "5")

In [None]:
df["hh_size_est"].value_counts()

In [None]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=15)
imputed_data = imputer.fit_transform(df[["pop_20","hh_20","hh_size_est"]]).round()
imputed_data

In [39]:
df["hh_size_est"] = pd.DataFrame(imputed_data[:,2])

## Since there are still NA values after performing imputation using KNN, we decided to use DecisionTreeClassifier to impute the remaining missing values

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc_x_train = df.dropna(subset = ["hh_size_est"])[["hh_20", "pop_20"]]
dtc_y_train = df.dropna(subset=["hh_size_est"])["hh_size_est"]

dtc_x_test = df[df["hh_size_est"].isna()][["hh_20", "pop_20"]]

clf = DecisionTreeClassifier()
clf.fit(dtc_x_train, dtc_y_train)

dtc_y_predicted = clf.predict(dtc_x_test)
print(dtc_y_predicted)

In [None]:
print(set(dtc_y_predicted))

In [42]:
df.loc[df["hh_size_est"].isna(), "hh_size_est"] = dtc_y_predicted

In [43]:
print(sum(df["hh_size_est"].isna()))

0


#### Now, all the missing values of the hh_size_est column are fully imputed

In [None]:
from sklearn.preprocessing import OneHotEncoder
hh_size_est_lst_numpy_array = np.array(df["hh_size_est"]).reshape(-1,1)
label_encoding = OneHotEncoder()
encoded = label_encoding.fit(hh_size_est_lst_numpy_array)
print(encoded.transform(hh_size_est_lst_numpy_array).toarray())

In [45]:
one_hot_encoding_hh_size_est = encoded.transform(hh_size_est_lst_numpy_array).toarray()
type(one_hot_encoding_hh_size_est)

numpy.ndarray

In [None]:
one_hot_encoding_hh_size_est_T = one_hot_encoding_hh_size_est.T
print(one_hot_encoding_hh_size_est_T)

In [47]:
some_id = 0
for i in ["0", "1", "2", "3", "4", ">4"]:
    name = "hh_size_est_" + i
    df[name] = one_hot_encoding_hh_size_est_T[some_id]
    some_id += 1

In [None]:
encoded.transform(np.array([0,1,2,3,4,5]).reshape(-1,1)).toarray()

In [None]:
one_hot_encoding_hh_size_est_T.shape

In [None]:
df.shape

In [None]:
df

## Working on annual_income_est column

In [None]:
dic_annual_income_est = {'C.60K-100K':3, 'D.30K-60K':4, 'A.ABOVE200K':1, 'B.100K-200K':2, 'E.BELOW30K':5}
annual_income_est_lst = list()
for i in df["annual_income_est"]:
    if dic_annual_income_est.get(i):
        annual_income_est_lst.append(dic_annual_income_est.get(i))
    else:
        annual_income_est_lst.append(i)
print(annual_income_est_lst)

In [53]:
df["annual_income_est_label_encode"] = annual_income_est_lst

In [None]:
df["annual_income_est_label_encode"].isna().sum()

##### annual_income_est_label_encode also have the same number of missing data as the 3 other columns (before these 3 columns are imputed): hh_20, pp_20 and hh_size_est. Hence, this is a good basis for the assumption that there is correlation between these 4 columns. hh_20, pp_20 and hh_size_est will be used as X_train for imputation using DecisionTreeClassifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc_x_train_aie = df.dropna(subset = ["annual_income_est"])[["hh_20", "pop_20","hh_size_est"]]
dtc_y_train_aie = df.dropna(subset=["annual_income_est"])["annual_income_est"]

dtc_x_test_aie = df[df["annual_income_est"].isna()][["hh_20", "pop_20", "hh_size_est"]]

clf2 = DecisionTreeClassifier()
clf2.fit(dtc_x_train_aie, dtc_y_train_aie)

dtc_y_predicted_aie = clf2.predict(dtc_x_test_aie)
print(dtc_y_predicted_aie)

In [56]:
df.loc[df["annual_income_est"].isna(), "annual_income_est"] = dtc_y_predicted_aie

##### one hot encoding for annual_income_est

In [None]:
annual_income_est_np_array = np.array(df["annual_income_est"]).reshape(-1,1)
label_encoding2 = OneHotEncoder()
encoded2 = label_encoding2.fit(annual_income_est_np_array)
print(encoded2.transform(annual_income_est_np_array).toarray())

In [None]:
encoded2.transform(np.array(['C.60K-100K','D.30K-60K','A.ABOVE200K','B.100K-200K','E.BELOW30K']).reshape(-1,1)).toarray()

In [None]:
df["annual_income_est"].value_counts()

In [None]:
annual_income_est_T = encoded2.transform(annual_income_est_np_array).toarray().T
annual_income_est_T

In [61]:
some_index = 0
for i in ['A.ABOVE200K','B.100K-200K','C.60K-100K','D.30K-60K','E.BELOW30K']:
    name = "annual_income_est_" + i
    df[name] = annual_income_est_T[some_index]
    some_index += 1
df = df.drop("annual_income_est", axis = 1)

Fill missing values of the numeric columns with the median

In [62]:
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))

In [None]:
X = df.drop(columns=["f_purchase_lh"])
y = df["f_purchase_lh"]

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Before:', Counter(y_train))
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
print('After:', Counter(y_train))


from sklearn.ensemble import RandomForestClassifier 
 
model = RandomForestClassifier() 
model.fit(X_train, y_train) 
 
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns) 
top_features = feature_importances.nlargest(20).index 
X_train_selected = X_train[top_features] 
print(top_features)


# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

# # instantiate the model (using the default parameters)
# logreg3 = LogisticRegression(random_state=42)

# # fit the model with data
# logreg3.fit(X_train_selected, y_train)

# y_pred = logreg3.predict(X_test[top_features])


from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay 
from sklearn.tree import DecisionTreeClassifier 
 
dt_clf = DecisionTreeClassifier() 
dt_clf.fit(X_train, y_train) 
 
y_test_pred = dt_clf.predict(X_test) 
print(classification_report(y_test, y_test_pred))

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [64]:
def testing_hidden_data(df: pd.DataFrame) -> list:


    df = df.drop(columns=pd.Index(["flg_affconnect_lapse_ever", "hlthclaim_cnt_success","giclaim_cnt_success","recency_cancel", "recency_lapse"]))
    spike_cols = [col for col in df.columns if 'ape_' in col[:4]]
    df = df.drop(columns = pd.Index(spike_cols))

    sumins_cols = [col for col in df.columns if 'sumins_' in col[:len("sumins_")]]
    df = df.drop(columns = pd.Index(sumins_cols))

    prempaid_cols = [col for col in df.columns if 'prempaid_' in col[:len("prempaid_")]]
    df = df.drop(columns = pd.Index(prempaid_cols))

    for names in ["clmcon_visit_days", "recency_clmcon", "recency_clm_regis", "flg_hlthclaim_", "flg_gi_claim_" , "f_ever_bought_", "n_months_last_bought" , "lapse_ape_", "n_months_since_lapse_", "cltsex_fix"]:
        spike_cols = [col for col in df.columns if names in col[:len(names)]]
        df = df.drop(columns = pd.Index(spike_cols))

    df = df.drop(columns = pd.Index(["clttype", "stat_flag", "min_occ_date", "recency_giclaim_success", "giclaim_cnt_unsuccess", "recency_giclaim_unsuccess"]), axis = 1)
    df["recency_giclaim"] = df["recency_giclaim"].apply(lambda x: 1 if pd.notna(x) else 0)
    df["recency_hlthclaim"] = df["recency_hlthclaim"].apply(lambda x: 1 if pd.notna(x) else 0)
    df["giclaim_amt"] = df["giclaim_amt"].astype("float64")
    gi_claim_median = df["giclaim_amt"].median()
    df["giclaim_amt"] = df["giclaim_amt"].fillna(gi_claim_median)
    df["hlthclaim_amt"] = df["hlthclaim_amt"].astype("float64")
    hlthclaim_median = df["hlthclaim_amt"].median()
    df["hlthclaim_amt"] = df["hlthclaim_amt"].fillna(hlthclaim_median)
    df["tot_cancel_pols"]=df["tot_cancel_pols"].fillna(0)
    non_numeric_cols = df.select_dtypes(include=["string", "object"]).columns
    df_numeric = df.drop(columns=non_numeric_cols)
    id = 1
    
    df["flg_affconnect_ready_to_buy_ever"] = df["flg_affconnect_ready_to_buy_ever"].fillna(0)
    df["flg_affconnect_show_interest_ever"] = df["flg_affconnect_show_interest_ever"].fillna(0)
    df["f_ever_declined_la"] = df["f_ever_declined_la"].fillna(0)
    df["is_dependent_in_at_least_1_policy"] = df["is_dependent_in_at_least_1_policy"].fillna(1)

    df= df.drop(["clntnum", "race_desc"], axis=1)
    ## Majority of the clients are Singaporean as shown below. Hence, we are going to focus on Singaporean clients
    df["ctrycode_desc"]
    df["ctrycode_desc"].value_counts()
    df = df[df["ctrycode_desc"] == "Singapore"]
    df = df.drop("ctrycode_desc", axis = 1)
    id = 1
    
    #### Editing column feature: DOB -> Age
  
    # converting DOB to age
    age_list = list()
    for x in df["cltdob_fix"]:
        if x.lower() != "none":
            year = int(x[:4])
            age = 2024 - year
            age_list.append(age)

        
    df["cltdob_fix"] = pd.Series(age_list)
    median_value = df["cltdob_fix"].median()
    df["cltdob_fix"] = df["cltdob_fix"].replace({None: np.nan})
    df["cltdob_fix"] = df["cltdob_fix"].fillna(median_value)
    ### Using hh_20 column and pop_20 columns as the X_train data for KNN imputation of hh_size_est since there are links between these 3 columns. This is because we checked that there are the same number of missing values for the three columns
    
    df["hh_20"] = df["hh_20"].fillna(-1)
    df["pop_20"] = df["pop_20"].fillna(-1)
    df["hh_20"] = df["hh_20"].astype(int)
    df["pop_20"] = df["pop_20"].astype(int)
    hh_20_lst = list()
    for i in df["hh_20"]:
        if i != -1:
            hh_20_lst.append(i)
    hh_20_median = statistics.median(hh_20_lst)
  
    pop_20_lst = list()
    for i in df["pop_20"]:
        if i != -1:
            pop_20_lst.append(i)
    pop_20_median = statistics.median(pop_20_lst)

    df["hh_20"] = df["hh_20"].replace(-1, hh_20_median)
    sum(df["hh_20"] == -1)
    df["pop_20"] = df["pop_20"].replace(-1, hh_20_median)
    sum(df["pop_20"] == -1)
    
    df["hh_size_est"] = df["hh_size_est"].replace(">4", "5")
    df["hh_size_est"].value_counts()
    from sklearn.impute import KNNImputer

    imputer = KNNImputer(n_neighbors=15)
    imputed_data = imputer.fit_transform(df[["pop_20","hh_20","hh_size_est"]]).round()
    imputed_data
    df["hh_size_est"] = pd.DataFrame(imputed_data[:,2])
    from sklearn.tree import DecisionTreeClassifier
    dtc_x_train = df.dropna(subset = ["hh_size_est"])[["hh_20", "pop_20"]]
    dtc_y_train = df.dropna(subset=["hh_size_est"])["hh_size_est"]

    dtc_x_test = df[df["hh_size_est"].isna()][["hh_20", "pop_20"]]

    clf = DecisionTreeClassifier()
    clf.fit(dtc_x_train, dtc_y_train)

    dtc_y_predicted = clf.predict(dtc_x_test)
    
    df.loc[df["hh_size_est"].isna(), "hh_size_est"] = dtc_y_predicted

    #### Now, all the missing values of the hh_size_est column are fully imputed
    from sklearn.preprocessing import OneHotEncoder
    hh_size_est_lst_numpy_array = np.array(df["hh_size_est"]).reshape(-1,1)
    label_encoding = OneHotEncoder()
    encoded = label_encoding.fit(hh_size_est_lst_numpy_array)
    
    one_hot_encoding_hh_size_est = encoded.transform(hh_size_est_lst_numpy_array).toarray()
    type(one_hot_encoding_hh_size_est)
    one_hot_encoding_hh_size_est_T = one_hot_encoding_hh_size_est.T
    some_id = 0
    for i in ["0", "1", "2", "3", "4", ">4"]:
        name = "hh_size_est_" + i
        df[name] = one_hot_encoding_hh_size_est_T[some_id]
        some_id += 1
    encoded.transform(np.array([0,1,2,3,4,5]).reshape(-1,1)).toarray()
    ## Working on annual_income_est column
    dic_annual_income_est = {'C.60K-100K':3, 'D.30K-60K':4, 'A.ABOVE200K':1, 'B.100K-200K':2, 'E.BELOW30K':5}
    annual_income_est_lst = list()
    for i in df["annual_income_est"]:
        if dic_annual_income_est.get(i):
            annual_income_est_lst.append(dic_annual_income_est.get(i))
        else:
            annual_income_est_lst.append(i)
    
    df["annual_income_est_label_encode"] = annual_income_est_lst
    df["annual_income_est_label_encode"].isna().sum()
    ##### annual_income_est_label_encode also have the same number of missing data as the 3 other columns (before these 3 columns are imputed): hh_20, pp_20 and hh_size_est. Hence, this is a good basis for the assumption that there is correlation between these 4 columns. hh_20, pp_20 and hh_size_est will be used as X_train for imputation using DecisionTreeClassifier 
    from sklearn.tree import DecisionTreeClassifier
    dtc_x_train_aie = df.dropna(subset = ["annual_income_est"])[["hh_20", "pop_20","hh_size_est"]]
    dtc_y_train_aie = df.dropna(subset=["annual_income_est"])["annual_income_est"]

    dtc_x_test_aie = df[df["annual_income_est"].isna()][["hh_20", "pop_20", "hh_size_est"]]

    clf2 = DecisionTreeClassifier()
    clf2.fit(dtc_x_train_aie, dtc_y_train_aie)

    dtc_y_predicted_aie = clf2.predict(dtc_x_test_aie)
    
    df.loc[df["annual_income_est"].isna(), "annual_income_est"] = dtc_y_predicted_aie
    ##### one hot encoding for annual_income_est
    annual_income_est_np_array = np.array(df["annual_income_est"]).reshape(-1,1)
    label_encoding2 = OneHotEncoder()
    encoded2 = label_encoding2.fit(annual_income_est_np_array)
    
    encoded2.transform(np.array(['C.60K-100K','D.30K-60K','A.ABOVE200K','B.100K-200K','E.BELOW30K']).reshape(-1,1)).toarray()
    df["annual_income_est"].value_counts()
    annual_income_est_T = encoded2.transform(annual_income_est_np_array).toarray().T
    annual_income_est_T
    some_index = 0
    for i in ['A.ABOVE200K','B.100K-200K','C.60K-100K','D.30K-60K','E.BELOW30K']:
        name = "annual_income_est_" + i
        df[name] = annual_income_est_T[some_index]
        some_index += 1
    df = df.drop("annual_income_est", axis = 1)

    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
    df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))
    
    y_test_pred = dt_clf.predict(df) 





    result = list(y_test_pred) 
    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath)
test_df = test_df.drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

In [66]:
y_pred = testing_hidden_data(test_df)
print(classification_report(df["f_purchase_lh"], y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     17079
         1.0       0.80      0.86      0.83       710

    accuracy                           0.99     17789
   macro avg       0.90      0.93      0.91     17789
weighted avg       0.99      0.99      0.99     17789



### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!