In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

ks = pd.read_csv('kickstarter-projects/ks-projects-201801.csv',
                parse_dates=['deadline', 'launched'])

ks = ks.query('state != "live"')

ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

ks = ks.assign(hour=ks.launched.dt.hour,
              day=ks.launched.dt.day,
              month=ks.launched.dt.month,
              year=ks.launched.dt.year)

cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()
encoded = ks[cat_features].apply(encoder.fit_transform)

data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
baseline_data = ks[data_cols].join(encoded)

In [7]:
import lightgbm as lgb
from sklearn import metrics

def get_data_splits(dataframe, valid_fraction=0.1):
    valid_fraction = 0.1
    valid_size = int(len(dataframe) * valid_fraction)
    
    train = dataframe[:-2 * valid_size]
    valid = dataframe[-2 * valid_size:-valid_size]
    test = dataframe[-valid_size:]
    
    return train, valid, test

In [15]:
def train_model(train, valid, test):
    feature_cols = train.columns.drop('outcome')
    
    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])
    dtest = lgb.Dataset(test[feature_cols], label=test['outcome'])
    
    param = {'num_leaves': 64, 'objective': 'binary',
            'metric': 'auc', 'seed': 7}
    print('Training model:')
    bst = lgb.train(param, dtrain, num_boost_round=1000, 
                    valid_sets=[dtrain, dvalid], 
                    early_stopping_rounds=10, verbose_eval=True)
    test_pred = bst.predict(test[feature_cols])
    test_score = metrics.roc_auc_score(test['outcome'], test_pred)
    print(f'Test AUC score: {test_score:.4f}')
    return bst

In [16]:
train, valid, test = get_data_splits(baseline_data)
bst = train_model(train, valid, test)

Training model:
[1]	training's auc: 0.694777	valid_1's auc: 0.694205
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.699026	valid_1's auc: 0.697948
[3]	training's auc: 0.702097	valid_1's auc: 0.700725
[4]	training's auc: 0.703634	valid_1's auc: 0.702387
[5]	training's auc: 0.707438	valid_1's auc: 0.706414
[6]	training's auc: 0.709243	valid_1's auc: 0.708354
[7]	training's auc: 0.71018	valid_1's auc: 0.709729
[8]	training's auc: 0.71222	valid_1's auc: 0.711345
[9]	training's auc: 0.712973	valid_1's auc: 0.711938
[10]	training's auc: 0.714221	valid_1's auc: 0.71335
[11]	training's auc: 0.715826	valid_1's auc: 0.714963
[12]	training's auc: 0.71729	valid_1's auc: 0.716481
[13]	training's auc: 0.717821	valid_1's auc: 0.71696
[14]	training's auc: 0.718463	valid_1's auc: 0.717515
[15]	training's auc: 0.720301	valid_1's auc: 0.719399
[16]	training's auc: 0.721847	valid_1's auc: 0.721109
[17]	training's auc: 0.723055	valid_1's auc: 0.722065
[18]	training's au

In [18]:
import category_encoders as ce

cat_features = ['category', 'currency', 'country']

count_encoder = ce.CountEncoder()
count_encoded = count_encoder.fit_transform(ks[cat_features])

data = baseline_data.join(count_encoded.add_suffix('_count'))

train, valid, test = get_data_splits(data)
bst = train_model(train, valid, test)

Training model:
[1]	training's auc: 0.697099	valid_1's auc: 0.696439
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.70748	valid_1's auc: 0.707034
[3]	training's auc: 0.708725	valid_1's auc: 0.707767
[4]	training's auc: 0.71245	valid_1's auc: 0.71121
[5]	training's auc: 0.713234	valid_1's auc: 0.711858
[6]	training's auc: 0.714457	valid_1's auc: 0.712962
[7]	training's auc: 0.71541	valid_1's auc: 0.714001
[8]	training's auc: 0.716166	valid_1's auc: 0.714467
[9]	training's auc: 0.718451	valid_1's auc: 0.716918
[10]	training's auc: 0.719776	valid_1's auc: 0.7183
[11]	training's auc: 0.721037	valid_1's auc: 0.719431
[12]	training's auc: 0.721716	valid_1's auc: 0.7202
[13]	training's auc: 0.72303	valid_1's auc: 0.721552
[14]	training's auc: 0.724129	valid_1's auc: 0.722621
[15]	training's auc: 0.725216	valid_1's auc: 0.723768
[16]	training's auc: 0.726487	valid_1's auc: 0.724775
[17]	training's auc: 0.728333	valid_1's auc: 0.726604
[18]	training's auc: 0

In [19]:
import category_encoders as ce

cat_features = ['category', 'currency', 'country']

target_encoder = ce.TargetEncoder(cols=cat_features)

train, valid, test = get_data_splits(data)

target_encoder.fit(train[cat_features], train['outcome'])

train = train.join(target_encoder.transform(train[cat_features]).add_suffix('_target'))
valid = valid.join(target_encoder.transform(valid[cat_features]).add_suffix('_target'))
test = test.join(target_encoder.transform(test[cat_features]).add_suffix('_target'))

bst = train_model(train, valid, test)

Training model:
[1]	training's auc: 0.723747	valid_1's auc: 0.72316
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.725827	valid_1's auc: 0.724821
[3]	training's auc: 0.726884	valid_1's auc: 0.726296
[4]	training's auc: 0.728588	valid_1's auc: 0.727967
[5]	training's auc: 0.730598	valid_1's auc: 0.729873
[6]	training's auc: 0.731701	valid_1's auc: 0.730965
[7]	training's auc: 0.732682	valid_1's auc: 0.732077
[8]	training's auc: 0.733402	valid_1's auc: 0.732662
[9]	training's auc: 0.734159	valid_1's auc: 0.733121
[10]	training's auc: 0.734763	valid_1's auc: 0.733554
[11]	training's auc: 0.735371	valid_1's auc: 0.734073
[12]	training's auc: 0.735979	valid_1's auc: 0.734595
[13]	training's auc: 0.736412	valid_1's auc: 0.735006
[14]	training's auc: 0.73693	valid_1's auc: 0.735429
[15]	training's auc: 0.737367	valid_1's auc: 0.735866
[16]	training's auc: 0.737856	valid_1's auc: 0.73631
[17]	training's auc: 0.738513	valid_1's auc: 0.736969
[18]	training's 

In [20]:
import category_encoders as ce

cat_features = ['category', 'currency', 'country']
cat_boost_encoder = ce.CatBoostEncoder(cols=cat_features)

train, valid, test = get_data_splits(data)
cat_boost_encoder.fit(train[cat_features], train['outcome'])

train = train.join(cat_boost_encoder.transform(train[cat_features]).add_suffix('_cb'))
valid = valid.join(cat_boost_encoder.transform(valid[cat_features]).add_suffix('_cb'))
test = test.join(cat_boost_encoder.transform(test[cat_features]).add_suffix('_cb'))

bst = train_model(train, valid, test)

Training model:
[1]	training's auc: 0.723747	valid_1's auc: 0.72316
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.725828	valid_1's auc: 0.72482
[3]	training's auc: 0.726902	valid_1's auc: 0.726187
[4]	training's auc: 0.728548	valid_1's auc: 0.727834
[5]	training's auc: 0.730517	valid_1's auc: 0.729886
[6]	training's auc: 0.731763	valid_1's auc: 0.731118
[7]	training's auc: 0.732744	valid_1's auc: 0.7322
[8]	training's auc: 0.733407	valid_1's auc: 0.732649
[9]	training's auc: 0.734148	valid_1's auc: 0.733183
[10]	training's auc: 0.734678	valid_1's auc: 0.733519
[11]	training's auc: 0.73529	valid_1's auc: 0.734039
[12]	training's auc: 0.735717	valid_1's auc: 0.73429
[13]	training's auc: 0.736247	valid_1's auc: 0.73485
[14]	training's auc: 0.736765	valid_1's auc: 0.735347
[15]	training's auc: 0.737168	valid_1's auc: 0.735593
[16]	training's auc: 0.737737	valid_1's auc: 0.736169
[17]	training's auc: 0.738461	valid_1's auc: 0.736853
[18]	training's auc: