## Feature Engineering
1. DV encodings for top categorical features as per the baseline XGB model
2. Count encodings for top categorical features as per the baseline XGB model
3. Factorization Machines (FM) to model interactions between sparse features

## Stacking
1. Split the train sample into 2 halves at random
2. Fit a FM model on the first half and predict on the second half and the test sample
3. Fit a XGB/LGB model on the second half with FM prediction as one of the features and predict on the test sample

In [1]:
import pandas as pd
import numpy as np
import os, sys, joblib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import KFold

In [2]:
# GLOBALS
LOCAL_ROOT = '/Users/nathvaru/Documents/personal/AV/janatahack_healthcare_analytics_II/'
DATA_DIR = os.path.join(LOCAL_ROOT, 'data')
TRAIN_FN = os.path.join(DATA_DIR, 'Train_hMYJ020/train.csv')
TEST_FN = os.path.join(DATA_DIR, 'test.csv')
SUBMISSION_FN = os.path.join(DATA_DIR, 'sample_submission_lfbv3c3.csv')

In [3]:
# read data
df_train = pd.read_csv(TRAIN_FN)
df_test = pd.read_csv(TEST_FN)

In [4]:
print(df_train.shape, df_test.shape)

(318438, 18) (137057, 17)


In [5]:
cat_vars = ['Hospital_code', 'Hospital_type_code',
            'City_Code_Hospital', 'Hospital_region_code',
            'Department', 'Ward_Type', 'Ward_Facility_Code',
            'Bed Grade', 'City_Code_Patient',
            'Type of Admission', 'Severity of Illness', 'Age']
num_vars = ['Available Extra Rooms in Hospital',
            'Visitors with Patient', 'Admission_Deposit']

In [6]:
# Impute missing values in Bed Grade and City_Code_Patient
# with separate category
df_train.fillna({'Bed Grade': 'missing', 'City_Code_Patient': 'missing'},
                inplace=True)
df_test.fillna({'Bed Grade': 'missing', 'City_Code_Patient': 'missing'},
                inplace=True)

In [7]:
# preprocess cat_vars
for var in cat_vars:
    if df_train[var].dtypes == object:
        print(var)
        df_train[var] = df_train[var].apply(
            lambda x: str(x).strip().replace(" ", "-").replace(".", ""))
        df_test[var] = df_test[var].apply(
            lambda x: str(x).strip().replace(" ", "-").replace(".", ""))

Hospital_type_code
Hospital_region_code
Department
Ward_Type
Ward_Facility_Code
Bed Grade
City_Code_Patient
Type of Admission
Severity of Illness
Age


In [8]:
# encode target
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df_train['Stay'].values)

df_train['DV'] = le.transform(df_train['Stay'].values)

In [9]:
# split df_train into 2 halves at random

from sklearn.model_selection import train_test_split
df_train1, df_train2 = train_test_split(df_train, test_size=0.5, random_state=2020)

In [10]:
print(df_train1.shape, df_train2.shape)

(159219, 19) (159219, 19)


In [9]:
# FM on df_train1
# prepare data in libsvm format for xlearn library


def convert_to_ffm(df, out_fn, numerics, categories, features, dv_col='DV'):
    
    currentcode = len(numerics)
    catdict = {}
    catcodes = {}
    
    # Flagging categorical and numerical fields
    for x in numerics:
         catdict[x] = 0
    for x in categories:
         catdict[x] = 1
    
    nrows = df.shape[0]
    ncolumns = len(features)
    
    with open(out_fn, "w") as text_file:
    
    # Looping over rows to convert each row to libffm format
        for n, r in enumerate(range(nrows)):
            datastring = ""
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow[dv_col])) # Set Target Variable here
             
            # For numerical fields, we are creating a dummy field here
            for i, x in enumerate(catdict.keys()):
                if(catdict[x]==0):
                    datastring = datastring + " "+str(i)+":"+ str(i)+":"+ str(datarow[x])
                else:
            
            # For a new field appearing in a training example
                    if(x not in catcodes):
                        catcodes[x] = {}
                        currentcode +=1
                        catcodes[x][datarow[x]] = currentcode #encoding the feature
             
            # For already encoded fields
                    elif(datarow[x] not in catcodes[x]):
                        currentcode +=1
                        catcodes[x][datarow[x]] = currentcode #encoding the feature
                     
                    code = catcodes[x][datarow[x]]
                    datastring = datastring + " "+str(i)+":"+ str(int(code))+":1"

            datastring += '\n'
            text_file.write(datastring)

In [10]:
numerics = ['Available Extra Rooms in Hospital', 'Visitors with Patient']
categories = cat_vars
features = numerics + categories

In [11]:
%time convert_to_ffm(df_train, '../data/train_ffm.txt', numerics, categories, features)

CPU times: user 50.2 s, sys: 388 ms, total: 50.6 s
Wall time: 50.7 s


In [None]:
%time convert_to_ffm(df_train1, '../data/train1_ffm.txt', numerics, categories, features)

In [19]:
%time convert_to_ffm(df_train2, '../data/train2_ffm.txt', numerics, categories, features)

CPU times: user 26.3 s, sys: 343 ms, total: 26.6 s
Wall time: 26.8 s


In [21]:
# add DV column to df_test for the sake of data preparation
df_test['DV'] = 1
%time convert_to_ffm(df_test, '../data/test_ffm.txt', numerics, categories, features)

CPU times: user 22.9 s, sys: 270 ms, total: 23.2 s
Wall time: 23.2 s


In [67]:
### train ffm model on df_train1

import xlearn as xl

ffm_model = xl.create_ffm()
ffm_model.setTrain("../data/train1_ffm.txt")
ffm_model.setTXTModel("../model/ffm_train1_model.txt")

param = {'task':'reg', 
         'lr':0.1,
         'lambda':0.05, 
         'metric':'rmse',
         'opt': 'sgd',
         'stop_window': 100,
         'fold': 3}
%time ffm_model.fit(param, '../model/ffm_train1_model.out')

CPU times: user 11.6 s, sys: 61.7 ms, total: 11.7 s
Wall time: 1.8 s


In [12]:
### train ffm model on df_train

import xlearn as xl

ffm_model = xl.create_ffm()
ffm_model.setTrain("../data/train_ffm.txt")
ffm_model.setTXTModel("../model/ffm_train_model.txt")

param = {'task':'reg', 
         'lr':0.1,
         'lambda':0.05, 
         'metric':'rmse',
         'opt': 'sgd',
         'stop_window': 100,
         'fold': 3}
%time ffm_model.fit(param, '../model/ffm_train_model.out')

CPU times: user 28.4 s, sys: 1.13 s, total: 29.5 s
Wall time: 5.74 s


In [68]:
# prediction on df_train2
ffm_model.setTest("../data/train2_ffm.txt") # df_train2
ffm_model.predict('../model/ffm_train1_model.out', '../model/ffm_train2_prediction.txt')

In [13]:
# prediction on df_train
ffm_model.setTest("../data/train_ffm.txt") # df_train2
ffm_model.predict('../model/ffm_train_model.out', '../model/ffm_train_prediction.txt')

In [12]:
# evaluation on train2 data

from sklearn import metrics

true_labels = []
for line in open('../data/train2_ffm.txt', 'r'):
    label = float(line.strip().split(' ')[0])
    true_labels.append(label)
    
predicted_labels = []
for line in open('../model/ffm_train2_prediction.txt', 'r'):
    label = float(line.strip().split(' ')[0])
    predicted_labels.append(label)

assert len(true_labels) == len(predicted_labels)
rmse = np.sqrt(metrics.mean_squared_error(true_labels, predicted_labels))
print('RMSE on train2: %0.4f' %rmse)

RMSE on train2: 2.3386


In [14]:
# evaluation on train data

from sklearn import metrics

true_labels = []
for line in open('../data/train_ffm.txt', 'r'):
    label = float(line.strip().split(' ')[0])
    true_labels.append(label)
    
predicted_labels = []
for line in open('../model/ffm_train_prediction.txt', 'r'):
    label = float(line.strip().split(' ')[0])
    predicted_labels.append(label)

assert len(true_labels) == len(predicted_labels)
rmse = np.sqrt(metrics.mean_squared_error(true_labels, predicted_labels))
print('RMSE on train2: %0.4f' %rmse)

RMSE on train2: 1.9472


In [70]:
# prediction on df_test using model trained on df_train1
ffm_model.setTest("../data/test_ffm.txt")
ffm_model.predict('../model/ffm_train1_model.out', '../model/ffm_test_prediction.txt')

In [15]:
# prediction on df_test using model trained on df_train
ffm_model.setTest("../data/test_ffm.txt")
ffm_model.predict('../model/ffm_train_model.out', '../model/ffm_test_prediction.txt')

### XGB/LGB model on df_train with ffm prediction as one of the features

In [16]:
# add ffm prediction to df_train2 and df_test
tmp = pd.read_csv('../model/ffm_train_prediction.txt', names=['ffm_prediction'])
df_train.reset_index(drop=True, inplace=True)
df_train = pd.concat([df_train, tmp], axis=1)

tmp = pd.read_csv('../model/ffm_test_prediction.txt', names=['ffm_prediction'])
df_test = pd.concat([df_test, tmp], axis=1)

In [17]:
# OHE categorical vars
df_train['sample'] = 'train'
df_test['sample'] = 'test'
cols = ['case_id', 'sample'] + cat_vars
tmp = pd.concat([df_train[cols], df_test[cols]], axis=0)
tmp.reset_index(drop=True, inplace=True)

print('get dummies')
tmp = pd.get_dummies(tmp, prefix=cat_vars, columns=cat_vars,
                     prefix_sep='_', drop_first=True)

mask = tmp['sample'] == 'train'
train = tmp.loc[mask, :]
train.reset_index(drop=True, inplace=True)
train.drop('sample', axis=1, inplace=True)
df_train = pd.merge(df_train[['case_id', 'Stay', 'DV', 'ffm_prediction']+num_vars], train,
                    on='case_id')
del train

mask = tmp['sample'] == 'test'
test = tmp.loc[mask, :]
test.reset_index(drop=True, inplace=True)
test.drop('sample', axis=1, inplace=True)
df_test = pd.merge(df_test[['case_id', 'ffm_prediction']+num_vars], test, on='case_id')
del test
del tmp

get dummies


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [18]:
# select columns and add prefix to all features
FEAT_PREFIX = 'JHA'
cols = list(df_test.columns)
new_cols = [FEAT_PREFIX + '_'+ col.replace(" ", "-")
            if col not in ('case_id', 'Stay', 'DV') else col for col in cols]
rename_dct = dict(zip(cols, new_cols))
df_train.rename(columns=rename_dct, inplace=True)
df_test.rename(columns=rename_dct, inplace=True)

In [19]:
# outlier treatment and scaling for num_vars
from utility import LegacyOutlierScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


PREPROCESS = {
    'exoutscaler': LegacyOutlierScaler(),
    'stdscaler': StandardScaler()
}
STEPS = ['exoutscaler', 'stdscaler']


def preprocess(train, test, steps, features):
    """
    imputation, outlier treatment and scaling
    """
    train = train.copy()
    test = test.copy()
    other_cols = list(set(list(test.columns)) - set(features))
    classic_steps = steps
    steps = list(zip(steps, map(PREPROCESS.get, steps)))
    datapipe = Pipeline(steps=steps)

    x_dev = train[features].values
    
    print('fit')
    datapipe.fit(x_dev)
    
    print('transform dataframe using pipeline')
    print('train data:')
    train1 = datapipe.transform(train[features].values)
    train1 = pd.DataFrame(train1, columns=features)
    train1 = pd.concat([train1, train[other_cols+['DV']]], axis=1)
    print('test data:')
    test1 = datapipe.transform(test[features].values)
    test1 = pd.DataFrame(test1, columns=features)
    test1 = pd.concat([test1, test[other_cols]], axis=1)
    
    # Create "classic" datapipe and store list of features
    classic_pipe = Pipeline([(name, datapipe.named_steps[name])
                             for name in classic_steps])
    classic_pipe.feature_names = features

    return train1, test1, classic_pipe

In [20]:
feats = [col for col in list(df_train.columns) if (col.startswith(FEAT_PREFIX)) and
         (col != 'JHA_ffm_prediction')]
print(feats)
df_train_pre, df_test_pre, pipeline = preprocess(df_train, df_test, STEPS, feats)

['JHA_Available-Extra-Rooms-in-Hospital', 'JHA_Visitors-with-Patient', 'JHA_Admission_Deposit', 'JHA_Hospital_code_2', 'JHA_Hospital_code_3', 'JHA_Hospital_code_4', 'JHA_Hospital_code_5', 'JHA_Hospital_code_6', 'JHA_Hospital_code_7', 'JHA_Hospital_code_8', 'JHA_Hospital_code_9', 'JHA_Hospital_code_10', 'JHA_Hospital_code_11', 'JHA_Hospital_code_12', 'JHA_Hospital_code_13', 'JHA_Hospital_code_14', 'JHA_Hospital_code_15', 'JHA_Hospital_code_16', 'JHA_Hospital_code_17', 'JHA_Hospital_code_18', 'JHA_Hospital_code_19', 'JHA_Hospital_code_20', 'JHA_Hospital_code_21', 'JHA_Hospital_code_22', 'JHA_Hospital_code_23', 'JHA_Hospital_code_24', 'JHA_Hospital_code_25', 'JHA_Hospital_code_26', 'JHA_Hospital_code_27', 'JHA_Hospital_code_28', 'JHA_Hospital_code_29', 'JHA_Hospital_code_30', 'JHA_Hospital_code_31', 'JHA_Hospital_code_32', 'JHA_Hospital_type_code_b', 'JHA_Hospital_type_code_c', 'JHA_Hospital_type_code_d', 'JHA_Hospital_type_code_e', 'JHA_Hospital_type_code_f', 'JHA_Hospital_type_code_g', 

In [28]:
# modelling
import xgboost as xgb
import lightgbm as lgb
from sklearn import metrics
import operator


def create_feature_map(features):
    outfile = open('../model/xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params["objective"] = "multi:softmax"
    params["num_class"] = 11
    params['eval_metric'] = "merror"
    params["eta"] = eta
    params["subsample"] = 0.7
    params["min_child_weight"] = 1
    params["colsample_bytree"] = 0.7
    params["max_depth"] = dep

    params["silent"] = 1
    params["seed"] = seed_val
    # params["max_delta_step"] = 2
    # params["gamma"] = 0.5
    num_rounds = rounds

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    if feature_names is not None:
        create_feature_map(feature_names)
        model.dump_model('../model/xgbmodel.txt', '../model/xgb.fmap',
                         with_stats=True)
        importance = model.get_fscore(fmap='../model/xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1),
                            reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature', 'fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv("imp_feat.txt", index=False)

    pred_test_y = model.predict(xgtest,
                                ntree_limit=model.best_ntree_limit)
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2),
                                     ntree_limit=model.best_ntree_limit)
    else:
        pred_test_y2 = None

    loss = 0
    if test_y is not None:
        loss = metrics.accuracy_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2


def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params["objective"] = "multiclass"
    params["num_class"] = 11
    params["metric"] = "multi_error"
    params["verbose"] = -1
    params["seed"] = seed_val
    params["max_depth"] = dep
    params["num_leaves"] = 50
    params["min_data_in_leaf"] = 20
    params["learning_rate"] = eta
    params["bagging_fraction"] = 0.7
    params["feature_fraction"] = 0.7
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed_val
    params["verbosity"] = 0
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        lgtest = lgb.DMatrix(test_X)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X,
                                num_iteration=model.best_iteration)
    pred_test_y = pred_test_y.argmax(axis=1)
    
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2,
                                     num_iteration=model.best_iteration)
        pred_test_y2 = pred_test_y2.argmax(axis=1)
    else:
        pred_test_y2 = None
        
    loss = 0
    if test_y is not None:
        loss = metrics.accuracy_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2

In [29]:
# Model building

def trainModel(train_X, train_y, test_X, n_splits, model_name, feats, 
               **params):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    cv_scores = []
    pred_test_full = []
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X):
        dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
        dev_y, val_y = train_y[dev_index], train_y[val_index]

        if model_name == "XGB":
            pred_val, acc, pred_test = runXGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'], feature_names=feats)
        elif model_name == "LGB":
            pred_val, acc, pred_test = runLGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'])
        
        print('Accuracy: ', acc)
        cv_scores.append(acc)
        pred_val_full[val_index] = pred_val
        if pred_test is not None:
            pred_test_full.append(pred_test)

    acc = metrics.accuracy_score(train_y, pred_val_full)
    return pred_val_full, acc, pred_test_full, cv_scores

In [30]:
feat_cols = [x for x in list(df_train_pre.columns) if x.startswith(FEAT_PREFIX)]
print(len(feat_cols), 'JHA_ffm_prediction' in feat_cols)
x_train = df_train_pre[feat_cols]
y_train = df_train_pre['DV']
x_test = df_test_pre[feat_cols]

121 True


In [31]:
# LGB
params = {'rounds': 600, 'depth': 7, 'eta': 0.05}
%time pred_val_full, acc, pred_test_full, cv_scores = trainModel(x_train, y_train, x_test, 3, "LGB", feat_cols, **params)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's multi_error: 0.589377
[40]	valid_0's multi_error: 0.582735
[60]	valid_0's multi_error: 0.580201
[80]	valid_0's multi_error: 0.577657
[100]	valid_0's multi_error: 0.576112
[120]	valid_0's multi_error: 0.5755
[140]	valid_0's multi_error: 0.575076
[160]	valid_0's multi_error: 0.575245
[180]	valid_0's multi_error: 0.574925
[200]	valid_0's multi_error: 0.574454
[220]	valid_0's multi_error: 0.574388
[240]	valid_0's multi_error: 0.573964
[260]	valid_0's multi_error: 0.573653


[280]	valid_0's multi_error: 0.574124
[300]	valid_0's multi_error: 0.574209
[320]	valid_0's multi_error: 0.573776
[340]	valid_0's multi_error: 0.574294
Early stopping, best iteration is:
[257]	valid_0's multi_error: 0.573531
Accuracy:  0.4264692027961487
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's multi_error: 0.59063
[40]	valid_0's multi_error: 0.583084
[60]	valid_0's multi_error: 0.580493
[80]	valid_0's multi_error: 0.578825
[100]	valid_0's multi_error: 0.577978
[120]	valid_0's multi_error: 0.577101
[140]	valid_0's multi_error: 0.577026


[160]	valid_0's multi_error: 0.576998
[180]	valid_0's multi_error: 0.577309
[200]	valid_0's multi_error: 0.576696
[220]	valid_0's multi_error: 0.576244
[240]	valid_0's multi_error: 0.576319
[260]	valid_0's multi_error: 0.575914


[280]	valid_0's multi_error: 0.575396
[300]	valid_0's multi_error: 0.575575
[320]	valid_0's multi_error: 0.575792
[340]	valid_0's multi_error: 0.5755
[360]	valid_0's multi_error: 0.575688
[380]	valid_0's multi_error: 0.575387
[400]	valid_0's multi_error: 0.575217
[420]	valid_0's multi_error: 0.575566
[440]	valid_0's multi_error: 0.575858


[460]	valid_0's multi_error: 0.576272
[480]	valid_0's multi_error: 0.57598
[500]	valid_0's multi_error: 0.575999
Early stopping, best iteration is:
[407]	valid_0's multi_error: 0.574935
Accuracy:  0.4250654758540124
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's multi_error: 0.590809
[40]	valid_0's multi_error: 0.582697
[60]	valid_0's multi_error: 0.582255
[80]	valid_0's multi_error: 0.580578
[100]	valid_0's multi_error: 0.579278
[120]	valid_0's multi_error: 0.577996
[140]	valid_0's multi_error: 0.577912
[160]	valid_0's multi_error: 0.577205
[180]	valid_0's multi_error: 0.577139


[200]	valid_0's multi_error: 0.576913
[220]	valid_0's multi_error: 0.577262
[240]	valid_0's multi_error: 0.577214
[260]	valid_0's multi_error: 0.577196
[280]	valid_0's multi_error: 0.57696
[300]	valid_0's multi_error: 0.576809
[320]	valid_0's multi_error: 0.577148
[340]	valid_0's multi_error: 0.577506


[360]	valid_0's multi_error: 0.576904
[380]	valid_0's multi_error: 0.577554
Early stopping, best iteration is:
[296]	valid_0's multi_error: 0.576395
Accuracy:  0.42360522299474307
CPU times: user 22min 12s, sys: 11.3 s, total: 22min 23s
Wall time: 22min 33s


In [32]:
print('CV accuracy: ', acc)
print(cv_scores)

CV accuracy:  0.4250466338816347
[0.4264692027961487, 0.4250654758540124, 0.42360522299474307]


In [33]:
# max voting with preference for the first model when predictions are equally frequent
from scipy.stats import mode

pred_test_full1 = np.array(pred_test_full).T
pred_test_full1 = mode(pred_test_full1, 1)[0]
pred_test_full1 = pred_test_full1.reshape((pred_test_full1.shape[0], ))
print(pred_test_full1)

[0 5 2 ... 2 1 5]


In [34]:
# transform pred_test_full into original labels
out_df = pd.DataFrame({"case_id": df_test_pre["case_id"].values})
out_df["Stay"] = le.inverse_transform(pred_test_full1)
out_df.to_csv("../model/Mffm_LGB_out.csv", index=False)