# First Model

In this notebook, we create a simple model using LightGBM. The features included in this model are:
- all float (or int but not category) variables as it is:
    - `RevLineCr', 'NoEmp', 'CreateJob', `RetainedJob`, `ApprovalFY`. `DisbursementGross`, `GrAppv`, `SBA_Appv`
- some categorical variables as it is:
    - `NewExist`, `RevLineCr`, `LowDoc`, `UrbanRural`
- Some date objects as daystamp:
    - `DisbursementDate_daystamp`, `ApprovalDate_daystamp`
- Some categorical varibles with coarse labeling:
    - `FranchiseCode`(0,1,or others)
- Some categorical variables with holdout target encoding:
    - `Sector`, `State`, `BankState`

Note that `City` is not used in this model

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import  KFold
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import lightgbm as lgb

In [2]:
# read data
data = pd.read_csv(f"/Users/yuke/Documents/Codes/DSC/DSC_STUDY/FinDataChallenge/takazawa/data/train.csv", index_col=0)

In [3]:
# convert data as in eda.ipynb
# import addfips
# af = addfips.AddFIPS()

categorical_cols = ['FranchiseCode','RevLineCr', 'LowDoc', 'Sector', 'UrbanRural', 'NewExist']
date_cols = ["DisbursementDate", "ApprovalDate"]
dollar_cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
for col in categorical_cols:
    data[col] = data[col].astype('category')
    if data[col].isnull().sum():
        data[col] = data[col].cat.add_categories("NAN").fillna("NAN")
for col in date_cols:
    data[col] = pd.to_datetime(data[col], format="%d-%b-%y")
    # add date cols
    data[col + "_year"] = pd.DatetimeIndex(data[col]).year
    data[col + "_month"] = pd.DatetimeIndex(data[col]).month
    data[col + "_day"] = pd.DatetimeIndex(data[col]).day
    data[col + "_daystamp"] = (data[col] - data[col].min()).dt.days
for col in dollar_cols:
    data[col] = data[col].str.replace("[$,]", "", regex=True)
    data[col] = data[col].astype(float)

## I want to run Codes below but currently not possible due to access limit??
# all_states = data['State'].to_numpy()
# all_state_fips = [af.get_state_fips(item) for item in all_states]
# data['State_FIPS'] = all_state_fips
# county_fips = [county_FIPS(item['City'], item['State'], item['State_FIPS']) for i, item in data.iterrows()]
# data['County_FIPS'] = county_fips

In [4]:
data.dtypes

Term                                  int64
NoEmp                                 int64
NewExist                           category
CreateJob                             int64
RetainedJob                           int64
FranchiseCode                      category
RevLineCr                          category
LowDoc                             category
DisbursementDate             datetime64[ns]
MIS_Status                            int64
Sector                             category
ApprovalDate                 datetime64[ns]
ApprovalFY                            int64
City                                 object
State                                object
BankState                            object
DisbursementGross                   float64
GrAppv                              float64
SBA_Appv                            float64
UrbanRural                         category
DisbursementDate_year               float64
DisbursementDate_month              float64
DisbursementDate_day            

In [5]:
num_cols = ['NoEmp', 'CreateJob', 'RetainedJob', 'ApprovalFY', 'DisbursementGross', 'GrAppv', 'SBA_Appv']
retained_cat_cols = ['NewExist', 'RevLineCr', 'LowDoc', 'UrbanRural']
timestamp_cols = ['DisbursementDate_daystamp', 'ApprovalDate_daystamp']

# coarse franchise col
data['FranchiseCode1'] = (data['FranchiseCode']==1).astype("category")
data['FranchiseCode0'] = (data['FranchiseCode']==1).astype("category")
franchise_cols = ['FranchiseCode1', 'FranchiseCode0']

In [6]:
# devide into training data and test data
X = data.drop("MIS_Status", axis=1)
y = data["MIS_Status"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# target encoding: function
def Holdout_target_encoding(X,y, column,folds):
  df = X; df['target']=y
  df[column + "_target"] = 0.9
  tmp = df[[column, column + "_target"]]
  for idx1, idx2 in folds:
    train = df.iloc[idx1]
    #val = df.iloc[idx2]
    mean = train.groupby(column)['target'].mean()
    for ind, v in tmp.iloc[idx2].iterrows():
      try:
        tmp.loc[ind,column+"_target"] = mean.loc[tmp.loc[ind, column]]
      except:
        continue
  df[column+ "_target"] = tmp[column + "_target"]
  return df

In [8]:
def target_encode_test(train_X, train_y, test_X, column):
    df = train_X; df['target'] = train_y
    mean = train_X.groupby(column)['target'].mean()
    test_X[column + "_target"] = 0.9
    for ind in mean.index:
        test_X.loc[test_X[column] == ind, column + "_target"] = mean[ind]
    return test_X

In [9]:
target_encode_cols = ['Sector', 'State', 'BankState']
kf = KFold(n_splits=3, shuffle=True, random_state=1000)
# for train
kf_iter_train = kf.split(X_train)
folds_train = []
for train_idx, test_idx in kf.split(X_train):
    folds_train.append((train_idx, test_idx))
for col in target_encode_cols:
    X_train = Holdout_target_encoding(X_train, y_train, col, folds_train)
# for validation, we use target encoding of train data
for col in target_encode_cols:
    X_test = target_encode_test(X_train, y_train, X_test, col)
target_encoded_cols = [item + "_target" for item in target_encode_cols]

In [10]:
all_cols = num_cols + retained_cat_cols + timestamp_cols + franchise_cols + target_encoded_cols
X_train[all_cols].head()

Unnamed: 0,NoEmp,CreateJob,RetainedJob,ApprovalFY,DisbursementGross,GrAppv,SBA_Appv,NewExist,RevLineCr,LowDoc,UrbanRural,DisbursementDate_daystamp,ApprovalDate_daystamp,FranchiseCode1,FranchiseCode0,Sector_target,State_target,BankState_target
6863,2,0,0,2000,75000.0,75000.0,63750.0,1.0,N,N,1,11713.0,9505,False,False,0.83703,0.900193,0.873469
30454,1,0,0,1998,286000.0,286000.0,286000.0,2.0,0,N,0,9765.0,8829,False,False,0.882296,0.902727,0.920869
8111,1,0,10,1998,50000.0,50000.0,25000.0,1.0,Y,N,1,13391.0,9112,True,True,0.912308,0.942085,0.916484
22811,5,0,0,1995,4000.0,4000.0,3400.0,1.0,0,A,0,7907.0,7947,False,False,0.914838,0.885714,0.895833
4428,3,0,0,2006,40000.0,40000.0,32000.0,1.0,N,N,0,10951.0,11990,False,False,0.83703,0.914692,0.928109


In [12]:
def Macrof1(preds, eval_dataset):
    y_true = eval_dataset.get_label()
    max_score =0
    for th in np.linspace(0.2,0.8,100):
        y_pred = (preds>th).astype(int)
        score = f1_score(y_true, y_pred, average='macro')
        if score > max_score:
            max_score = score
    return 'Macrof1', max_score, True

## LightGBM

In [16]:
params = {
    'objective': 'binary',
    'metric': 'custom',  # Use custom to use the custom metric for evaluation
    'verbose': 1,
}
dataset = lgb.Dataset(X_train[all_cols], label=y_train)

# Define CV parameters
cv_results = lgb.cv(
    params,
    dataset,
    num_boost_round=100,
    nfold=5,
    feval=Macrof1,  # Custom evaluation function
    stratified=False,
    seed=42,
)

[LightGBM] [Info] Number of positive: 24180, number of negative: 2896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001801 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1979
[LightGBM] [Info] Number of data points in the train set: 27076, number of used features: 18
[LightGBM] [Info] Number of positive: 24128, number of negative: 2948
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1979
[LightGBM] [Info] Number of data points in the train set: 27076, number of used features: 18
[LightGBM] [Info] Number of positive: 24157, number of negative: 2919
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing wa

In [17]:
cv_results['valid Macrof1-mean']

[0.48340676471921273,
 0.6347922857667522,
 0.657907361365361,
 0.663279370177643,
 0.6678094320684936,
 0.6703029499443361,
 0.6740964130539522,
 0.6752182841035284,
 0.6774336874989972,
 0.6789209054920011,
 0.6793722321639505,
 0.6792990579960657,
 0.6797773149964115,
 0.6796230703692886,
 0.6801409767793365,
 0.6799662030853384,
 0.6795524784640141,
 0.680139851873448,
 0.6802651968861071,
 0.6804714004001899,
 0.6799431230187538,
 0.681775136335997,
 0.6809518479435022,
 0.6814970977222371,
 0.6808936977464322,
 0.681270395361399,
 0.6818047682612522,
 0.6819041499388069,
 0.6814258122952377,
 0.6818156894374673,
 0.6818081082210478,
 0.6817388947275177,
 0.6818871277927446,
 0.6816150687842804,
 0.6814471090228624,
 0.6813564850263518,
 0.6812452984724476,
 0.681144054822511,
 0.680883354069725,
 0.6807624000704533,
 0.6809891537336681,
 0.6812490928265155,
 0.6813494943804924,
 0.6815507747609711,
 0.6815302425096121,
 0.6817154570413881,
 0.6814861677997024,
 0.6809991073117599

## Training with all data

In [18]:
# target encode
target_encode_cols = ['Sector', 'State', 'BankState']
kf = KFold(n_splits=3, shuffle=True, random_state=100)
folds = []
for train_idx, test_idx in kf.split(X):
    folds.append((train_idx, test_idx))
for col in target_encode_cols:
    data = Holdout_target_encoding(data,data['MIS_Status'],col,folds)

In [19]:
data

Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,MIS_Status,...,ApprovalDate_year,ApprovalDate_month,ApprovalDate_day,ApprovalDate_daystamp,FranchiseCode1,FranchiseCode0,target,Sector_target,State_target,BankState_target
0,163,21,1.0,0,0,1,N,N,1998-01-31,1,...,2006,9,22,12028,True,True,1,0.940968,0.926978,0.938422
1,84,6,1.0,4,0,0,0,N,1993-10-31,1,...,1992,6,30,6831,False,False,1,0.900000,0.904899,0.913333
2,242,45,1.0,4,90,0,N,N,2001-08-31,1,...,2001,4,18,10045,False,False,1,0.897122,0.957386,0.970803
3,237,4,1.0,0,0,0,N,N,2007-08-31,1,...,2003,10,6,10946,False,False,1,0.913703,0.930355,0.940112
4,184,0,1.0,0,0,0,N,N,1983-06-08,1,...,1999,12,17,9557,False,False,1,0.941393,0.880574,0.877026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42302,283,14,1.0,0,0,1,N,N,1998-01-31,1,...,1995,3,2,7806,True,True,1,0.938835,0.908805,0.887340
42303,53,2,1.0,0,0,0,Y,N,1991-04-03,1,...,2007,6,6,12285,False,False,1,0.897089,0.879886,0.943820
42304,59,6,2.0,0,0,1,N,N,2003-02-28,1,...,2003,3,14,10740,True,True,1,0.897122,0.894410,0.875810
42305,295,18,1.0,0,8,0,N,N,1997-12-10,1,...,1989,8,23,5789,False,False,1,0.897122,0.807867,0.785448


In [20]:
params = {
    'objective': 'binary',
    'metric': 'custom',  # Use custom to use the custom metric for evaluation
    'verbose': 1,
}
dataset = lgb.Dataset(data[all_cols], label=data['MIS_Status'])

# Define CV parameters
cv_results = lgb.cv(
    params,
    dataset,
    num_boost_round=100,
    nfold=5,
    feval=Macrof1,  # Custom evaluation function
    stratified=False,
    seed=42,
    return_cvbooster=True,
)

[LightGBM] [Info] Number of positive: 30191, number of negative: 3653
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1996
[LightGBM] [Info] Number of data points in the train set: 33844, number of used features: 18
[LightGBM] [Info] Number of positive: 30222, number of negative: 3622
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1996
[LightGBM] [Info] Number of data points in the train set: 33844, number of used features: 18
[LightGBM] [Info] Number of positive: 30198, number of negative: 3646
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing wa

In [21]:
boosters = cv_results['cvbooster'].boosters

In [27]:
cv_results['valid Macrof1-mean']

[0.4725018655215333,
 0.6296934883534203,
 0.651532980113791,
 0.6566476785846621,
 0.660811056414389,
 0.6655018285242693,
 0.667734343423777,
 0.6693059526699345,
 0.6709006014464182,
 0.671596357158458,
 0.6723104315119844,
 0.6738491633556882,
 0.6756646906487813,
 0.6762057254087216,
 0.6763742603154228,
 0.6767526356654326,
 0.6766995147951306,
 0.6764432105052126,
 0.6764541314771227,
 0.676625999210114,
 0.677019637522396,
 0.6770817503982327,
 0.6774673068896613,
 0.6772917164036343,
 0.677396931832283,
 0.6780377997799751,
 0.6784960868906648,
 0.6783303547364513,
 0.678776495973633,
 0.6796009264703519,
 0.6799956642920526,
 0.6792581403938591,
 0.6794007044339184,
 0.6797423333925872,
 0.6795439393303564,
 0.6795660801694208,
 0.679460107635977,
 0.6796503037650506,
 0.6795693540841847,
 0.6794989942289257,
 0.6790097947956402,
 0.6791937717734863,
 0.6792994616322947,
 0.6788675125203301,
 0.6790437805187801,
 0.6789596993755657,
 0.6784230260407738,
 0.6785434027210896,
 

In [22]:
# make encoding for test data
testdata=pd.read_csv("../data/test.csv", index_col=0)
categorical_cols = ['FranchiseCode','RevLineCr', 'LowDoc', 'Sector', 'UrbanRural', 'NewExist']
date_cols = ["DisbursementDate", "ApprovalDate"]
dollar_cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
for col in categorical_cols:
    testdata[col] = testdata[col].astype('category')
    if testdata[col].isnull().sum():
        testdata[col] = testdata[col].cat.add_categories("NAN").fillna("NAN")
for col in date_cols:
    testdata[col] = pd.to_datetime(testdata[col], format="%d-%b-%y")
    # add date cols
    testdata[col + "_year"] = pd.DatetimeIndex(testdata[col]).year
    testdata[col + "_month"] = pd.DatetimeIndex(testdata[col]).month
    testdata[col + "_day"] = pd.DatetimeIndex(testdata[col]).day
    testdata[col + "_daystamp"] = (testdata[col] - testdata[col].min()).dt.days
for col in dollar_cols:
    testdata[col] = testdata[col].str.replace("[$,]", "", regex=True)
    testdata[col] = testdata[col].astype(float)

testdata['FranchiseCode1'] = (testdata['FranchiseCode']==1).astype("category")
testdata['FranchiseCode0'] = (testdata['FranchiseCode']==1).astype("category")


test_X = testdata
for col in target_encode_cols:
    test_X = target_encode_test(data, data['MIS_Status'], test_X, col)

In [24]:
boosters

[<lightgbm.basic.Booster at 0x2bcfd1350>,
 <lightgbm.basic.Booster at 0x2bcf96d50>,
 <lightgbm.basic.Booster at 0x2bcf943d0>,
 <lightgbm.basic.Booster at 0x2bcf5a350>,
 <lightgbm.basic.Booster at 0x2bcf78790>]

In [32]:
def Macrof1_optimal_th(preds,y_true):
    max_score =0
    opt_th = 0
    for th in np.linspace(0.2,0.8,100):
        y_pred = (preds>th).astype(int)
        score = f1_score(y_true, y_pred, average='macro')
        if score > max_score:
            opt_th = th
            max_score = score
    return max_score, opt_th

In [33]:
# for now, let's decide the optimal threshold by all training data
pred_per_cv_train = [item.predict(data[all_cols]) for item in boosters]
pred_average_train = np.mean(pred_per_cv_train, axis=0)
y_true_train = data['MIS_Status']
f1, th = Macrof1_optimal_th(pred_average_train, y_true_train.to_numpy())
print(f1,th)

0.7496270556944924 0.8


In [34]:
pred_per_cv = [item.predict(test_X[all_cols]) for item in boosters]
pred_average = np.mean(pred_per_cv, axis=0)
testdata['predict'] = (pred_average > th).astype(int)

In [35]:
testdata['predict'].to_csv("firstmodel_th.csv", header=False)

In [227]:
print(testdata)

       Term  NoEmp NewExist  CreateJob  RetainedJob FranchiseCode RevLineCr  \
42307     5      2      1.0          1            0             0         T   
42308   235     13      1.0          9           14         77725         Y   
42309    31      5      2.0          0            0             0         N   
42310   120      4      1.0          0            1             0         Y   
42311    63     13      1.0          0            8             1         N   
...     ...    ...      ...        ...          ...           ...       ...   
84610   243     10      1.0          3           14             0         N   
84611   178      0      2.0          0            0             1         N   
84612    42      1      2.0          3            9             0         Y   
84613    76     15      1.0          0            0             0         N   
84614    35      3      2.0          1            4         18150         Y   

      LowDoc DisbursementDate Sector  ... ApprovalD