##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [1]:
%pip install pandas 
%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [2]:
# Can have as many cells as you want for code
import pandas as pd
import numpy as np
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [164]:
# Load the training data from the relative parquet path held in `filepath`.
df = pd.read_parquet(filepath)

In [167]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,clntnum,race_desc,ctrycode_desc,clttype,stat_flag,min_occ_date,cltdob_fix,cltsex_fix,flg_substandard,flg_is_borderline_standard,...,recency_giclaim,giclaim_cnt_success,recency_giclaim_success,giclaim_cnt_unsuccess,recency_giclaim_unsuccess,flg_gi_claim_29d435_ever,flg_gi_claim_058815_ever,flg_gi_claim_42e115_ever,flg_gi_claim_856320_ever,f_purchase_lh
19550,91b546e924,Chinese,Singapore,P,ACTIVE,2017-10-31,1974-05-09,Female,0.0,0.0,...,,,,,,,,,,
4600,896bae548c,Chinese,Singapore,P,ACTIVE,2007-05-23,1979-11-11,Male,0.0,0.0,...,,,,,,,,,,
13337,f364439ae6,Others,Singapore,P,ACTIVE,2019-08-31,1976-01-28,Male,0.0,0.0,...,,,,,,,,,,
15074,70f319cfe1,Chinese,Singapore,P,ACTIVE,2021-10-18,1976-03-19,Female,0.0,0.0,...,,,,,,,,,,
19724,2647a81328,Chinese,Singapore,P,ACTIVE,2018-07-20,1995-07-31,Female,0.0,0.0,...,,,,,,,,,,


# General Client Information

- Almost all columns contain missing values (`None`).
- Open question: do the `clttype` codes P, G and C stand for personal, group and corporate? If so, this could be correlated with household information, e.g. whether customers purchase as a family.

In [5]:
def clean_general_info(df, reference_date=None):
    """Clean the general client-information columns and derive tenure and age.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``race_desc``, ``ctrycode_desc``,
        ``min_occ_date``, ``cltdob_fix`` and ``cltsex_fix``. It is not modified.
    reference_date : str or datetime-like, optional
        Date against which ``years_with_company`` and ``age`` are measured.
        When omitted, the current time is used (the original behaviour), which
        makes both features depend on the day the notebook is run. Pass a fixed
        date to get identical features at training and prediction time.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which
        * missing ``race_desc`` is ``"Others"``,
        * missing ``ctrycode_desc`` is ``"Not Applicable"``,
        * ``min_occ_date`` is converted to datetime (unparseable values -> NaT),
        * ``years_with_company`` holds whole years since ``min_occ_date``
          (missing values filled with the median),
        * ``age`` holds whole years since ``cltdob_fix`` (missing values filled
          with the mean) and ``cltdob_fix`` itself is dropped,
        * missing ``cltsex_fix`` is ``"Male"``.
    """
    days_per_year = 365.25
    # A single reference timestamp keeps both derived features consistent.
    as_of = pd.to_datetime('now') if reference_date is None else pd.Timestamp(reference_date)

    cleaned = df.copy()

    # race: missing values are folded into the existing "Others" bucket.
    cleaned['race_desc'] = cleaned['race_desc'].fillna("Others")

    # country: missing values become "Not Applicable".
    # Question: need to keep both "Not Applicable" and an unknown country code?
    cleaned['ctrycode_desc'] = cleaned['ctrycode_desc'].fillna("Not Applicable")

    # min_occ_date: how long the customer has been with the company, in years.
    cleaned['min_occ_date'] = pd.to_datetime(cleaned['min_occ_date'], errors='coerce')
    tenure_years = (as_of - cleaned['min_occ_date']).dt.days / days_per_year
    median_tenure = tenure_years.median()  # median rather than mean
    cleaned['years_with_company'] = tenure_years.fillna(median_tenure).astype(int)

    # cltdob_fix: derive age in whole years, then drop the raw date of birth.
    birth_dates = pd.to_datetime(cleaned['cltdob_fix'], errors='coerce')
    age_years = (as_of - birth_dates).dt.days / days_per_year
    cleaned['age'] = age_years.fillna(age_years.mean()).astype(int)
    cleaned = cleaned.drop(columns='cltdob_fix')

    # gender: missing values are imputed as 'Male' (the alternative would be dropping the rows).
    cleaned['cltsex_fix'] = cleaned['cltsex_fix'].fillna('Male')

    return cleaned



In [331]:
# Build the working frame `df1` from the raw data; the cells below add features to it.
df1 = clean_general_info(df)
# Preview the cleaned frame, including the new `years_with_company` and `age` columns.
df1.head()

Unnamed: 0,clntnum,race_desc,ctrycode_desc,clttype,stat_flag,min_occ_date,cltsex_fix,flg_substandard,flg_is_borderline_standard,flg_is_revised_term,...,recency_giclaim_success,giclaim_cnt_unsuccess,recency_giclaim_unsuccess,flg_gi_claim_29d435_ever,flg_gi_claim_058815_ever,flg_gi_claim_42e115_ever,flg_gi_claim_856320_ever,f_purchase_lh,years_with_company,age
19550,91b546e924,Chinese,Singapore,P,ACTIVE,2017-10-31,Female,0.0,0.0,0.0,...,,,,,,,,,6,49
4600,896bae548c,Chinese,Singapore,P,ACTIVE,2007-05-23,Male,0.0,0.0,0.0,...,,,,,,,,,16,44
13337,f364439ae6,Others,Singapore,P,ACTIVE,2019-08-31,Male,0.0,0.0,0.0,...,,,,,,,,,4,48
15074,70f319cfe1,Chinese,Singapore,P,ACTIVE,2021-10-18,Female,0.0,0.0,0.0,...,,,,,,,,,2,47
19724,2647a81328,Chinese,Singapore,P,ACTIVE,2018-07-20,Female,0.0,0.0,0.0,...,,,,,,,,,5,28


# Purchase History
- Average purchase period / frequency

### Months since last purchase (`n_months_last_bought_*`)

In [332]:
# All columns whose name contains "n_months_last_bought_"; the pattern also
# matches the aggregate column `n_months_last_bought_products`.
last_bought_months = df1.filter(regex='n_months_last_bought_.*', axis=1).columns

# Per column: missing -> 0, cast to integer (via float), then map 9999 to 0.
# NOTE(review): 9999 is presumably a "never bought" sentinel — confirm.
for column in last_bought_months:
    months_as_int = df1[column].fillna(0).astype(float).astype(int)
    df1[column] = months_as_int.replace(9999, 0)

df1['n_months_last_bought_products'] = df1['n_months_last_bought_products'].fillna(0)

In [333]:
def _sorted_nonzero_months(row):
    """Return the non-zero values of one row, sorted ascending."""
    return sorted(value for value in row.tolist() if value != 0)


def _successive_differences(ordered):
    """Return the differences between neighbouring entries of a sorted list."""
    return [later - earlier for earlier, later in zip(ordered, ordered[1:])]


def _mean_or_zero(gaps):
    """Return the mean of `gaps`, or 0 when the list is empty."""
    return np.mean(gaps) if gaps else 0


# Per customer: sorted non-zero "months since last bought" values, the gaps
# between consecutive values, and the average of those gaps (0 if fewer than two values).
df1['Combined_List_Bought'] = df1[last_bought_months].apply(_sorted_nonzero_months, axis=1)
df1['Bought_Gap_List'] = df1['Combined_List_Bought'].apply(_successive_differences)
df1['Average_Bought_Gap'] = df1['Bought_Gap_List'].apply(_mean_or_zero)


In [334]:
# Customers for whom an average purchase gap could be computed (non-zero).
df1.loc[df1['Average_Bought_Gap'] != 0, 'Average_Bought_Gap']

19550     1.250000
4600     47.500000
11816    10.666667
14245    13.500000
10520     7.000000
           ...    
14423     7.750000
4426     17.250000
6265     21.666667
11284    12.500000
860      37.000000
Name: Average_Bought_Gap, Length: 6606, dtype: float64

In [335]:
# True when the months since the last purchase are at least the average gap.
df1['Exceed Average Gap'] = df1['n_months_last_bought_products'] >= df1['Average_Bought_Gap']
# Number of customers flagged.
int(df1['Exceed Average Gap'].sum())

15495

In [336]:
# Same flag as above, restricted to customers that actually have a non-zero average gap.
has_average_gap = df1['Average_Bought_Gap'] != 0
gap_reached = df1['Average_Bought_Gap'] <= df1['n_months_last_bought_products']
df1['Exceed Average Gap with Purchase'] = gap_reached & has_average_gap
# Number of customers flagged.
int(df1['Exceed Average Gap with Purchase'].sum())

4109

In [337]:
# Customers whose average gap between purchases is 36 months or more.
df1.loc[df1['Average_Bought_Gap'] >= 36, 'Average_Bought_Gap']

4600      47.500000
4461      46.500000
14569     44.666667
19042    112.500000
7946      37.000000
            ...    
19315     44.750000
15707     65.500000
5056      40.000000
12185    129.000000
860       37.000000
Name: Average_Bought_Gap, Length: 893, dtype: float64

In [338]:
# Right-closed bin edges in months, with one label per interval.
bins = [-np.inf, 3, 6, 12, 24, 36, 60, 120, np.inf]
labels = [
    '<3 months',
    '3-6 months',
    '6 months - 1 year',
    '1-2 years',
    '2-3 years',
    '3-5 years',
    '5-10 years',
    '> 10 years',
]

# NOTE(review): the earlier cleaning maps missing values and 9999 to 0, so those
# rows fall into '<3 months' here — confirm this is intended.
df1['Months_Last_Bought_Category'] = pd.cut(
    x=df1['n_months_last_bought_products'],
    bins=bins,
    labels=labels,
)

In [339]:
# Inspect the binned "months since last purchase" category.
df1['Months_Last_Bought_Category']

19550     <3 months
4600      3-5 years
13337     3-5 years
15074     1-2 years
19724    5-10 years
            ...    
11284     2-3 years
11964    > 10 years
5390     5-10 years
860       3-5 years
15795     1-2 years
Name: Months_Last_Bought_Category, Length: 17992, dtype: category
Categories (8, object): ['<3 months' < '3-6 months' < '6 months - 1 year' < '1-2 years' < '2-3 years' < '3-5 years' < '5-10 years' < '> 10 years']

In [340]:
# Columns whose name starts with one of these prefixes.
# NOTE(review): the prefixes look like premium / sum-insured amounts rather than
# claims — confirm that the name "Claimed_Large_Amount" is what is meant.
column_patterns = ['ape_', 'sumins_', 'prempaid_']
regex_pattern = f"^({'|'.join(column_patterns)})"
claimed_amount = df1.filter(regex=regex_pattern, axis=1).columns

# Flag customers with at least one of these amounts >= 50000.
df1['Claimed_Large_Amount'] = (df1[claimed_amount] >= 50000).any(axis=1)
int(df1['Claimed_Large_Amount'].sum())

7990

In [342]:
# Columns whose name starts with "lapse_".
column_patterns = ['lapse_']
regex_pattern = f"^({'|'.join(column_patterns)})"
lapsed_amount = df1.filter(regex=regex_pattern, axis=1).columns

# Flag customers with at least one lapse amount >= 200.
df1['Lapsed_Large_Amount'] = (df1[lapsed_amount] >= 200).any(axis=1)
int(df1['Lapsed_Large_Amount'].sum())

2461

In [343]:
# All columns whose name contains "n_months_since_lapse".
last_lapse_months = df1.filter(regex='n_months_since_lapse.*', axis=1).columns

# Per column: missing -> 0, cast to integer (via float), then map 9999 to 0.
# NOTE(review): 9999 is presumably a "never lapsed" sentinel — confirm.
for column in last_lapse_months:
    lapse_months_as_int = df1[column].fillna(0).astype(float).astype(int)
    df1[column] = lapse_months_as_int.replace(9999, 0)


In [344]:
# Per customer: sorted non-zero "months since lapse" values.
df1['Combined_List_Lapse'] = df1[last_lapse_months].apply(
    lambda row: sorted(filter(lambda months: months != 0, row.tolist())),
    axis=1,
)
# Differences between consecutive values of each sorted list.
df1['Lapse_Gap_List'] = df1['Combined_List_Lapse'].apply(
    lambda ordered: [nxt - prev for prev, nxt in zip(ordered, ordered[1:])]
)
# Average of those differences; 0 when there are fewer than two values.
df1['Average_Lapse_Gap'] = df1['Lapse_Gap_List'].apply(
    lambda gaps: 0 if not gaps else np.mean(gaps)
)

In [345]:
# Columns whose name starts with "f_ever_bought".
column_patterns = ['f_ever_bought']
regex_pattern = '^(' + '|'.join(column_patterns) + ')'
ever_bought = df1.filter(regex=regex_pattern, axis=1).columns

# Raw columns that the engineered features above were derived from, plus the
# intermediate list-valued helper columns.
columns_to_drop = [
    *last_bought_months,
    *claimed_amount,
    *last_lapse_months,
    *lapsed_amount,
    *ever_bought,
    'Combined_List_Lapse',
    'Lapse_Gap_List',
    'Combined_List_Bought',
    'Bought_Gap_List',
]

# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError. Rebinding instead of
# inplace=True avoids mutating the frame behind earlier cell outputs.
df1 = df1.drop(columns=columns_to_drop, errors='ignore')

In [346]:
# Inspect the columns from position 70 onward, which include the engineered features.
df1.iloc[:,70:]

Unnamed: 0,recency_giclaim_success,giclaim_cnt_unsuccess,recency_giclaim_unsuccess,flg_gi_claim_29d435_ever,flg_gi_claim_058815_ever,flg_gi_claim_42e115_ever,flg_gi_claim_856320_ever,f_purchase_lh,years_with_company,age,Average_Bought_Gap,Exceed Average Gap,Exceed Average Gap with Purchase,Months_Last_Bought_Category,Claimed_Large_Amount,Lapsed_Large_Amount,Average_Lapse_Gap
19550,,,,,,,,,6,49,1.25,False,False,<3 months,False,False,0.0
4600,,,,,,,,,16,44,47.50,False,False,3-5 years,False,True,0.0
13337,,,,,,,,,4,48,0.00,True,False,3-5 years,False,False,0.0
15074,,,,,,,,,2,47,0.00,True,False,1-2 years,False,False,0.0
19724,,,,,,,,,5,28,0.00,True,False,5-10 years,True,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,,,,,,,,,7,29,12.50,True,True,2-3 years,False,False,0.0
11964,,,,,,,,,33,75,0.00,True,False,> 10 years,True,False,0.0
5390,,,,,,,,,16,56,0.00,True,False,5-10 years,False,False,0.0
860,,,,,,,,,16,37,37.00,True,True,3-5 years,True,False,0.0


## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [14]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    result = [] 
    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
# The training file is reloaded without its target column so that it has the
# same layout as the hidden data.
test_df = pd.read_parquet(filepath).drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!