In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
link_to_train = "https://raw.githubusercontent.com/thimmie52/data_org_financial_pred/refs/heads/main/data/Train.csv"
link_to_test = "https://raw.githubusercontent.com/thimmie52/data_org_financial_pred/refs/heads/main/data/Test.csv"

In [3]:
df_train = pd.read_csv(link_to_train)
df_test = pd.read_csv(link_to_test)

In [4]:
df_train['where'] = 'train'
df_test['where'] = 'test'

In [5]:
df = pd.concat([df_train, df_test], axis=0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12023 entries, 0 to 2404
Data columns (total 40 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   ID                                                                12023 non-null  object 
 1   country                                                           12023 non-null  object 
 2   owner_age                                                         12022 non-null  float64
 3   attitude_stable_business_environment                              12021 non-null  object 
 4   attitude_worried_shutdown                                         12021 non-null  object 
 5   compliance_income_tax                                             12019 non-null  object 
 6   perception_insurance_doesnt_cover_losses                          12016 non-null  object 
 7   perception_cannot_afford_insurance   

In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2405 entries, 0 to 2404
Data columns (total 39 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   ID                                                                2405 non-null   object 
 1   country                                                           2405 non-null   object 
 2   owner_age                                                         2404 non-null   float64
 3   attitude_stable_business_environment                              2405 non-null   object 
 4   attitude_worried_shutdown                                         2405 non-null   object 
 5   compliance_income_tax                                             2405 non-null   object 
 6   perception_insurance_doesnt_cover_losses                          2403 non-null   object 
 7   perception_cannot_afford_insuranc

In [8]:
def calculate_null_percentage(df):
  null_data = []
  for col in df.columns:
    null_count = df[col].isnull().sum()
    null_data.append({'column': col, 'null_count': null_count, 'length': df.shape[0], 'percentage_null': null_count/df.shape[0]*100})

  null_perception = pd.DataFrame(null_data)
  return null_perception.sort_values(by='percentage_null', ascending=False)

In [60]:
null_perception_df = calculate_null_percentage(df)
null_perception_train = calculate_null_percentage(df_train)
null_perception_test = calculate_null_percentage(df_test)

In [10]:
null_perception_df.head(10)

Unnamed: 0,column,null_count,length,percentage_null
37,uses_informal_lender,5620,12023,46.743741
36,uses_friends_family_savings,5619,12023,46.735424
35,motivation_make_more_money,5375,12023,44.70598
33,medical_insurance,5230,12023,43.499958
34,funeral_insurance,5230,12023,43.499958
32,business_age_months,5144,12023,42.784663
31,future_risk_theft_stock,5117,12023,42.560093
29,has_internet_banking,4995,12023,41.545371
30,has_debit_card,4995,12023,41.545371
28,has_loan_account,4991,12023,41.512102


In [11]:
null_perception_train.head(10)

Unnamed: 0,column,null_count,length,percentage_null
37,uses_informal_lender,4489,9618,46.672905
36,uses_friends_family_savings,4488,9618,46.662508
35,motivation_make_more_money,4291,9618,44.614265
33,medical_insurance,4188,9618,43.543356
34,funeral_insurance,4188,9618,43.543356
32,business_age_months,4111,9618,42.742774
31,future_risk_theft_stock,4100,9618,42.628405
29,has_internet_banking,4003,9618,41.619879
30,has_debit_card,4003,9618,41.619879
28,has_loan_account,3999,9618,41.578291


In [12]:
df['Target'].head()

Unnamed: 0,Target
0,Low
1,Medium
2,Low
3,Low
4,Low


In [13]:
null_columns = null_perception_df[null_perception_df['percentage_null'] > 30].column.to_list()

In [14]:
df_train['Target'].value_counts()

Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
Low,6280
Medium,2868
High,470


In [15]:
for i in null_columns:
  print(f"-----------{i}-----------------")
  print(pd.crosstab(df[i], df['Target'], normalize=False))
  print('\n')

-----------uses_informal_lender-----------------
Target                           High   Low  Medium
uses_informal_lender                               
Don't know                          1     1       0
Don’t know (Do not show)            0     2       2
Have now                           18    81     115
Never had                         256  1643     965
Used to have but don't have now     4  1087     647
Used to have but don’t have now    63   124     120


-----------uses_friends_family_savings-----------------
Target                           High   Low  Medium
uses_friends_family_savings                        
Don't know                          1     1       0
Don’t know (Do not show)            0     1       2
Have now                           41   147     341
Never had                         250  1669     976
Used to have but don't have now     1   974     421
Used to have but don’t have now    49   146     110


-----------motivation_make_more_money-----------------
Targ

In [16]:
missing_ratio = df.isna().mean().sort_values(ascending=False)
missing_ratio

Unnamed: 0,0
uses_informal_lender,0.467437
uses_friends_family_savings,0.467354
motivation_make_more_money,0.44706
medical_insurance,0.435
funeral_insurance,0.435
business_age_months,0.427847
future_risk_theft_stock,0.425601
has_internet_banking,0.415454
has_debit_card,0.415454
has_loan_account,0.415121


In [17]:
df

Unnamed: 0,ID,country,owner_age,attitude_stable_business_environment,attitude_worried_shutdown,compliance_income_tax,perception_insurance_doesnt_cover_losses,perception_cannot_afford_insurance,personal_income,business_expenses,...,has_debit_card,future_risk_theft_stock,business_age_months,medical_insurance,funeral_insurance,motivation_make_more_money,uses_friends_family_savings,uses_informal_lender,Target,where
0,ID_3CFL0U,eswatini,63.0,Yes,No,No,No,Yes,3000.0,6000.0,...,Never had,,6.0,Never had,Used to have but don’t have now,,Never had,Never had,Low,train
1,ID_XWI7G3,zimbabwe,39.0,No,Yes,Yes,No,Yes,,,...,,No,3.0,Never had,Never had,,,,Medium,train
2,ID_TY93LV,malawi,34.0,Don’t know or N/A,No,No,Don't know,Yes,30000.0,6000.0,...,Never had,Yes,,,,Yes,,,Low,train
3,ID_9OP2C8,malawi,28.0,Yes,No,No,No,No,180000.0,60000.0,...,Never had,No,,,,Yes,Never had,Have now,Low,train
4,ID_13REYS,zimbabwe,43.0,Yes,No,No,Yes,Yes,50.0,2400.0,...,,No,0.0,Never had,Never had,Yes,,,Low,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2400,ID_FX7XJZ,eswatini,29.0,Yes,Yes,No,No,Yes,600.0,1700.0,...,Never had,,11.0,Never had,Never had,,Never had,Never had,,test
2401,ID_XAL1LX,malawi,20.0,Don’t know or N/A,Don’t know or N/A,No,Don't know,Don't know,30000.0,20000.0,...,Never had,No,4.0,,,Yes,,,,test
2402,ID_UHBP0F,zimbabwe,26.0,Yes,Yes,No,Yes,Yes,3888.0,,...,,No,0.0,Have now,Have now,,,,,test
2403,ID_GKIKR2,eswatini,63.0,No,No,Yes,Yes,Yes,3500.0,1700.0,...,Never had,,4.0,Never had,Have now,,Never had,Never had,,test


In [49]:
X = df_train.drop("Target", axis=1)
y = df_train["Target"]

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

In [50]:
from sklearn.preprocessing import LabelEncoder

X_encoded = X.copy()
y_encoded = y.copy()

for col in cat_cols:
    le = LabelEncoder()
    X_encoded[col] = X_encoded[col].astype(str)
    X_encoded[col] = le.fit_transform(X_encoded[col])

le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y_encoded)

In [51]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split



X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)



In [52]:
import lightgbm as lgb

X_lgb = X_encoded.copy()   # contains NaNs
y_lgb = y_encoded.copy()

In [53]:
train_data = lgb.Dataset(X_lgb, label=y_lgb)

In [54]:
num_classes = len(np.unique(y_encoded))

In [55]:
params = {
    "objective": "multiclass",
    "num_class": num_classes,# use "regression" if regression
    "metric": "multi_logloss",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "verbosity": -1,
    "seed": 42
}

model = lgb.train(
    params,
    train_data,
    num_boost_round=300
)

In [56]:
importance = pd.Series(
    model.feature_importance(importance_type="gain"),
    index=X_lgb.columns
).sort_values(ascending=False)

importance.head(20)

Unnamed: 0,0
funeral_insurance,41643.653568
has_credit_card,11486.11307
has_loan_account,11236.450628
has_insurance,9534.031054
uses_friends_family_savings,5353.747177
ID,5152.324158
business_expenses,4922.716257
business_turnover,4814.581132
has_debit_card,4270.444323
personal_income,4230.318759


In [62]:
feature_importance = pd.DataFrame({ 'Column_name': importance.index, 'feature_importance': importance.values})

In [63]:
feature_importance["null_percentage"] = feature_importance["Column_name"].apply(lambda x: df_train[x].isnull().sum()/df_train.shape[0]*100)

In [65]:
feature_importance

Unnamed: 0,Column_name,feature_importance,null_percentage
0,funeral_insurance,41643.653568,43.543356
1,has_credit_card,11486.11307,20.243294
2,has_loan_account,11236.450628,41.578291
3,has_insurance,9534.031054,20.336868
4,uses_friends_family_savings,5353.747177,46.662508
5,ID,5152.324158,0.0
6,business_expenses,4922.716257,2.380952
7,business_turnover,4814.581132,2.245789
8,has_debit_card,4270.444323,41.619879
9,personal_income,4230.318759,1.133292


Important nuance:

High null + LOW importance → drop

High null + VERY HIGH importance → keep

So:

❌ `uses_informal_lender`

❌ `motivation_make_more_money`

❌ `future_risk_theft_stock`

❌ `has_internet_banking`

❌ `business_age_months` (redundant)