In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [None]:
def lgb_drop_features(features):
    """Return ``features`` without the columns that are not fed to LightGBM.

    Columns that are not present in ``features`` are skipped silently.
    """
    # The label is not a feature
    label_columns = ['price']
    # These two hurt performance
    harmful_columns = ['lat', 'long']

    return features.drop(columns=label_columns + harmful_columns, errors='ignore')

# Build the LightGBM feature frame by stripping the unused columns from df
lgb_features = lgb_drop_features(df)
print("Number of features for LGBM: {}".format(lgb_features.shape[1]))
lgb_features.head()

In [None]:
# Prepare training and hold-out validation data
lgb_label = df.price.astype(np.float32)

# Features stay a DataFrame; the label is wrapped into a one-column DataFrame
lgb_X = lgb_features
lgb_y = pd.DataFrame(lgb_label)

# Perform shuffled train/validation split.
# random_state pins the split itself, so it no longer depends on whatever
# consumed the global NumPy RNG before this cell ran. The global seeds are
# still set so that code relying on them behaves as before.
np.random.seed(42)
random.seed(10)
X_train, X_val, y_train, y_val = train_test_split(lgb_X, lgb_y, test_size=0.1, random_state=42)

# Remove outlier examples from X_train and y_train; Keep them in X_val and y_val for proper validation
# NOTE: the disabled snippet below uses NumPy-style indexing (X_train[mask, :]);
# X_train / y_train are DataFrames here, so it needs adapting before being re-enabled.
#outlier_threshold = 0.4
#mask = (abs(y_train) <= outlier_threshold)
#X_train = X_train[mask, :]
#y_train = y_train[mask]

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

In [None]:
# Column names handed to LightGBM, and the subset of them treated as categorical
feature_names = list(lgb_features.columns)
categorical_features = [
    'status', 'add_attr', 'state', 'year', 'fireplace', 'parking', "subtype",
    'sewer', 'water', 'app', 'heating', 'cooling', 'materials', 'roof',
    'foundation', 'interior',
]

# Positional indices of the categorical columns inside the feature frame
categorical_indices = []
for i, n in enumerate(lgb_features.columns):
    if n not in categorical_features:
        continue
    categorical_indices.append(i)
print(categorical_indices)

In [None]:
# LightGBM parameters, declared as one literal (same keys, values and order as before)
params = {
    # Task
    'objective': 'regression',
    'metric': 'mae',
    'num_threads': 4,                  # set to number of real CPU cores for best performance

    # Boosting
    'boosting_type': 'gbdt',
    'num_boost_round': 2000,
    'learning_rate': 0.003,            # shrinkage_rate
    'early_stopping_rounds': 30,       # Early stopping based on validation set performance

    # Control tree growing
    'num_leaves': 127,                 # max number of leaves in one tree (default 31)
    'min_data': 150,                   # min_data_in_leaf
    'min_hessian': 0.001,              # min_sum_hessian_in_leaf (default 1e-3)
    'max_depth': -1,                   # limit the max depth of tree model, default -1 (no limit)
    'max_bin': 255,                    # max number of bins that feature values are bucketed in (small -> less over fitting, default 255)
    'sub_feature': 0.5,                # feature_fraction (small values => use very different sub models)

    # Row sub_sampling (speed up training and alleviate over fitting)
    'bagging_fraction': 0.7,
    'bagging_freq': 50,                # perform bagging at every k iteration

    # Constraints on categorical features
    'min_data_per_group': 100,         # minimal number of data per categorical group (default 100)
    'cat_smooth': 15.0,                # reduce effect of noises in categorical features, especially for those with few data (default 10.0)

    # Regularization (default 0.0)
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,

    # Random seeds (keep default values)
    'feature_fraction_seed': 2,
    'bagging_seed': 3,
}

In [None]:
# Train one LightGBM model, monitoring both the training and the validation split
lgb_train_set = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
lgb_valid_set = lgb.Dataset(X_val, label=y_val, feature_name=feature_names)

np.random.seed(42)
random.seed(36)
model = lgb.train(
    params,
    lgb_train_set,
    valid_sets=[lgb_train_set, lgb_valid_set],
    valid_names=['train', 'val'],
    categorical_feature=categorical_indices,
    verbose_eval=False,
)

# Mean absolute error of the fitted model on each split
print("Train score: {}".format((model.predict(X_train).reshape(-1, 1) - y_train).abs().mean()))
print("Val score: {}".format((model.predict(X_val).reshape(-1, 1) - y_val).abs().mean()))

In [None]:
# Feature importance chart for the single trained model
lgb.plot_importance(
    model,
    ignore_zero=False,
    figsize=(12.5, 12.5),
    height=0.8,
);

In [None]:
# Ensemble 20x LightGBM
lgb_train_set = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
print("lgb_X: {}".format(X_train.shape))
print("lgb_y: {}".format(y_train.shape))

# Derive the ensemble settings from a copy so the shared `params` dict is left
# untouched and this cell (and the single-model cell) can be re-run safely.
ensemble_params = dict(params)
ensemble_params['num_boost_round'] = 2500
# No validation set is passed to lgb.train below, so early stopping has nothing
# to evaluate; drop it instead of relying on it being ignored.
ensemble_params.pop('early_stopping_rounds', None)
# Explicitly set component seeds take priority over `seed`; remove them so the
# per-model `seed` actually changes feature/row sampling between bags.
ensemble_params.pop('feature_fraction_seed', None)
ensemble_params.pop('bagging_seed', None)

# Train multiple models
bags = 20
models = []
for i in range(bags):
    print("Start training model {}".format(i))
    # Global seeds kept as before for any code that depends on them
    np.random.seed(42)
    random.seed(10)
    model = lgb.train(dict(ensemble_params, seed=i), lgb_train_set, verbose_eval=False,
                      categorical_feature=categorical_indices)
    models.append(model)

# Sanity check (make sure scores on a small portion of the dataset are reasonable)
for i, model in enumerate(models):
    print("model {}: {}".format(i, abs(model.predict(X_val).reshape(-1,1) - y_val).mean()))

# Save the trained models to disk
# save_models(models)

# models = load_models(['checkpoints/lgb_' + str(i) for i in range(5)])  # load pretrained models

In [None]:
# Per-model mean absolute error on the validation split
for i, model in enumerate(models):
    print("model {}: {}".format(i, (model.predict(X_val).reshape(-1, 1) - y_val).abs().mean()))

In [None]:
def predict_and_export(models, features, file_name, export=False):
    """Average the predictions of several models into a submission frame.

    Parameters
    ----------
    models : sequence
        Trained models; each must provide ``predict(features)``.
    features : pandas.DataFrame
        Rows to predict on. Its index is copied into the ``Index`` column and
        unused columns are removed with ``lgb_drop_features`` before predicting.
    file_name : str
        Destination CSV path. Only used when ``export`` is true.
    export : bool, default False
        When true, write the submission frame to ``file_name`` without the
        DataFrame index. The default keeps the previous behaviour (no file).

    Returns
    -------
    tuple
        ``(submission, pred)``: the frame with columns ``Index`` and ``price``
        (mean prediction rounded to 4 decimals) and the list of per-model
        predictions, so the results can be analysed or sanity checked.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("predict_and_export needs at least one model")

    # Named `submission` (not `lgb`) so the lightgbm module alias is never shadowed
    submission = pd.DataFrame()
    submission['Index'] = features.index

    test_features = lgb_drop_features(features)

    pred = []
    for i, model in enumerate(models):
        print("Start model {}".format(i))
        pred.append(model.predict(test_features))

    # Take average across all models
    mean_pred = np.mean(pred, axis=0)

    submission['price'] = [float(format(x, '.4f')) for x in mean_pred]

    print("Length of submission DataFrame: {}".format(len(submission)))
    print("Submission header:")
    print(submission.head())

    if export:
        submission.to_csv(file_name, index=False)
    return submission, pred

In [None]:
file_name = "/Users/charles/Desktop/DSAI/6_DataScience/project/DSP/20221/data/final_lgbm_ensemble_x20-2500.csv"
# Bind the result to `submission`, not `lgb`: reusing `lgb` would overwrite the
# lightgbm module alias and break every cell that calls lgb.Dataset / lgb.train
# when it is run (or re-run) after this one.
submission, pred = predict_and_export(models, X_val, file_name)
submission.head()