In [1]:
import pandas as pd
# Load both competition files. The test rows receive a sentinel target of -1
# so train and test can be stacked into one frame and told apart afterwards.
df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")
df_test.loc[:, "target"] = -1
df_whole = pd.concat([df_train, df_test], ignore_index=True)
print(df_train.shape)

# Every column except the row id and the label is a model input; names that
# start with "cat" go to the categorical list, all others to the continuous one.
features = [col for col in df_train.columns if col not in ("id", "target")]
cat_features = [col for col in features if col.startswith("cat")]
cont_features = [col for col in features if not col.startswith("cat")]
print(features, cat_features, cont_features, len(features))

# Remember the test ids; they are needed again when the submission is written.
test_ids = df_test.loc[:, "id"]

(300000, 32)
['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10'] ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18'] ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10'] 30


In [2]:

# for submission
# Cast the categorical columns of the combined frame to pandas' `category`
# dtype. `astype` with a per-column mapping returns a new frame whose columns
# really carry the new dtype. Assigning through `df.loc[:, cols] = ...` writes
# the values into the existing columns instead and, on pandas 2.x, leaves them
# as `object`.
df_whole = df_whole.astype({col: "category" for col in cat_features})
print(df_whole[features[2]])

0         A
1         A
2         A
3         A
4         G
         ..
499995    A
499996    A
499997    D
499998    A
499999    A
Name: cat2, Length: 500000, dtype: category
Categories (19, object): ['A', 'B', 'C', 'D', ..., 'Q', 'R', 'S', 'U']


In [3]:
# Undo the stacking: rows carrying the sentinel target (-1) are the test set,
# everything else is training data.
is_test_row = df_whole["target"] == -1
df_train = df_whole.loc[~is_test_row, :]
df_test = df_whole.loc[is_test_row, :]
print(df_train.shape, df_test.shape)

(300000, 32) (200000, 32)


In [4]:
from sklearn import metrics
def auc_train_test(model, x_train, x_test, y_train, y_test, early_stopping_rounds=None):
    """Fit `model` on the training split and return its ROC AUC on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.
    x_train, y_train : data the model is fitted on.
    x_test, y_test : data the AUC is computed on; also passed as ``eval_set``
        when early stopping is requested.
    early_stopping_rounds : int or None
        ``None`` (default) performs a plain ``fit``. Any other value is
        forwarded to ``fit`` together with ``eval_set`` and ``verbose=200``.

    Returns
    -------
    The value returned by ``metrics.roc_auc_score`` for the second column of
    ``predict_proba(x_test)``.

    NOTE(review): newer LightGBM releases replaced the ``early_stopping_rounds``
    and ``verbose`` arguments of ``fit`` with callbacks; confirm the installed
    version still accepts them.
    """
    # Identity test: `== None` would defer to a custom __eq__ on the argument.
    if early_stopping_rounds is None:
        model.fit(x_train, y_train)
    else:
        model.fit(
            x_train,
            y_train,
            eval_set=(x_test, y_test),
            early_stopping_rounds=early_stopping_rounds,
            verbose=200,
        )
    preds = model.predict_proba(x_test)[:, 1]
    return metrics.roc_auc_score(y_test, preds)

In [10]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np

# Functions for KFold evaluation and prediction
def create(hyperparams):
    """Build an untrained LGBMClassifier, passing `hyperparams` as keyword arguments."""
    return LGBMClassifier(**hyperparams)

def fit(model, X, y):
    """Call ``model.fit(X, y)`` with no extra options and return the same model object."""
    trained = model
    trained.fit(X, y)
    return trained

def fit_with_stop(model, X, y, X_val, y_val, esr):
    """Fit `model` on (X, y) while monitoring (X_val, y_val) for early stopping.

    `esr` is forwarded as ``early_stopping_rounds``; ``verbose`` is fixed at 200.
    The same model object is returned after fitting.

    NOTE(review): newer LightGBM releases replaced these two ``fit`` arguments
    with callbacks; confirm the installed version still accepts them.
    """
    stopping_options = {
        "eval_set": (X_val, y_val),
        "early_stopping_rounds": esr,
        "verbose": 200,
    }
    model.fit(X, y, **stopping_options)
    return model

def evaluate(model, X, y):
    """Return the ROC AUC of `model` on (X, y).

    The score is computed from the second column of ``model.predict_proba(X)``.
    """
    return roc_auc_score(y, model.predict_proba(X)[:, 1])

def single_evaluation(X, y, hyperparams, esr=100, test_size=0.028059109276941666, random_state=42):
    """Score one hyper-parameter set on a single train/validation split.

    Parameters
    ----------
    X, y : features and labels to split.
    hyperparams : dict passed to ``create`` to build the model.
    esr : early-stopping rounds handed to ``fit_with_stop``.
    test_size : fraction of rows held out for validation. The default is the
        value that used to be hard-coded in the body.
    random_state : seed for the split (default 42, the former hard-coded value).

    Returns
    -------
    pandas.DataFrame with a single row and the columns
    'train score' and 'validation score'.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = create(hyperparams)
    model = fit_with_stop(model, X_train, y_train, X_val, y_val, esr)
    train_score = evaluate(model, X_train, y_train)
    val_score = evaluate(model, X_val, y_val)
    print(f"Eval AUC: {val_score}")

    return pd.DataFrame(
        [(train_score, val_score)],
        columns=['train score', 'validation score'],
    )

def kfold_evaluation(X, y, k, hyperparams, esr=100):
    """Cross-validate one hyper-parameter set with a K-fold split.

    For each of the `k` folds a fresh model is created from `hyperparams`,
    fitted with early stopping (`esr` rounds) against the held-out fold, and
    scored on both the fitting rows and the held-out rows.

    Returns a pandas.DataFrame with one row per fold and the columns
    'train score' and 'validation score'.
    """
    print(f"\n------ {k}-fold evaluation -----")
    print(hyperparams)

    fold_scores = []
    splitter = KFold(k)
    for fold_no, (fit_rows, held_out_rows) in enumerate(splitter.split(X)):
        print(f"\n----- FOLD {fold_no} -----")

        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_held, y_held = X.iloc[held_out_rows], y.iloc[held_out_rows]

        booster = fit_with_stop(create(hyperparams), X_fit, y_fit, X_held, y_held, esr)
        auc_fit = evaluate(booster, X_fit, y_fit)
        auc_held = evaluate(booster, X_held, y_held)
        fold_scores.append((auc_fit, auc_held))

        print(f"Fold {fold_no} | Eval AUC: {auc_held}")

    return pd.DataFrame(fold_scores, columns=['train score', 'validation score'])

def kfold_prediction(X, y, X_test, k, hyperparams, esr=100):
    """Average positive-class probabilities for `X_test` over `k` fold models.

    Parameters
    ----------
    X, y : training features and labels; split into `k` folds with ``KFold``.
    X_test : rows to predict.
    k : number of folds, and therefore of models averaged.
    hyperparams : dict passed to ``create`` for every fold model.
    esr : early-stopping rounds handed to ``fit_with_stop``; each fold's
        held-out part serves as the validation set.

    Returns
    -------
    numpy.ndarray of length ``len(X_test)`` holding the mean over folds of the
    second ``predict_proba`` column.
    """
    yp = np.zeros(len(X_test))

    # This function predicts rather than scores, so the banner says so.
    print(f"\n------ {k}-fold prediction -----")
    print(hyperparams)

    kf = KFold(k)
    for i, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\n----- FOLD {i} -----")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = create(hyperparams)
        model = fit_with_stop(model, X_train, y_train, X_val, y_val, esr)
        # Each fold contributes 1/k of the final averaged probability.
        yp += model.predict_proba(X_test)[:, 1] / k

    return yp

In [6]:
from sklearn.model_selection import train_test_split

# Training-time variant (kept for reference): hold out part of df_train.
# x_train, x_test, y_train, y_test = train_test_split(df_train.loc[:, features], df_train.loc[:, "target"], random_state=10, stratify=df_train.loc[:, "target"])

# Submission variant: fit on every labelled row, predict the real test rows.
x_train = df_train.loc[:, features]
x_test = df_test.loc[:, features]
y_train = df_train.loc[:, "target"]

print(x_train.shape, x_test.shape)
print(y_train.shape)
# print(y_train.shape, y_test.shape)

(300000, 30) (200000, 30)
(300000,)


In [6]:
from category_encoders import TargetEncoder

def encode(enc, x_train, y_train, x_test):
    """Fit `enc` on the training data and apply it to both splits.

    Calls ``enc.fit_transform(x_train, y_train)`` first, then
    ``enc.transform(x_test)``, and returns the two results as a
    ``(train, test)`` tuple.
    """
    encoded_train = enc.fit_transform(x_train, y_train)
    return encoded_train, enc.transform(x_test)

# Target-encode the categorical columns: fit on the training rows, then apply
# the fitted encoder to the test rows.
# NOTE(review): the encoder sees every training label here, before the later
# K-fold split, so each fold's validation rows influence their own encodings.
# Consider fitting the encoder inside each fold.
te = TargetEncoder(cols=cat_features)
x1, x2 = encode(
    te,
    x_train[cat_features],
    y_train,
    x_test[cat_features],
)
print(x1.shape, x2.shape)

# Keep the raw categorical columns under their own names; the encoded copies
# are appended with an "_enc" suffix.
join_kwargs = {"lsuffix": "", "rsuffix": "_enc"}
x_train = x_train.join(x1, **join_kwargs)
x_test = x_test.join(x2, **join_kwargs)
print(x_train.shape, x_test.shape)

  elif pd.api.types.is_categorical(cols):


(300000, 19) (200000, 19)
(300000, 49) (200000, 49)


In [7]:
# Convert the training target to pandas' `category` dtype before it is handed
# to the model-fitting helpers.
y_train = y_train.astype('category')
# Only relevant for the hold-out (training-time) variant, which defines y_test:
# y_test = y_test.astype('category')

In [8]:
# Sanity check: display the dtype of the target after the cast.
y_train.dtype

CategoricalDtype(categories=[0, 1], ordered=False)

In [11]:
# for submission
K = 10  # number of folds, and therefore of models averaged

# Hyper-parameters passed as keyword arguments to LGBMClassifier for every fold.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` is > 0,
# and that is not set here. Confirm whether row subsampling was intended.
BEST_PARAMS = {
    'n_estimators': 10000,
    'learning_rate': 0.05,
    'metric': 'auc',
    'num_leaves': 237,
    'max_depth': 31,
    'reg_alpha': 4.571551679011291,
    'reg_lambda': 12.577178152686312,
    'colsample_bytree': 0.2301028578381579,
    'subsample': 0.9390970911797094,
    'cat_smooth': 47.58241398941176,
}

# Predict from the training feature columns only, so that re-running this cell
# after `id`/`target` have been attached to x_test does not feed them to the model.
feature_cols = list(x_train.columns)
test_preds = kfold_prediction(x_train, y_train, x_test.loc[:, feature_cols], K, BEST_PARAMS, 500)

# Keep the averaged probabilities (the result used to be discarded) and attach
# them, together with the ids, for the export step. `.to_numpy()` bypasses
# index alignment: test_ids carries the index of the raw test file, while
# x_test carries the index of the stacked train+test frame.
x_test = x_test.assign(id=test_ids.to_numpy(), target=test_preds)


------ 10-fold evaluation -----
{'n_estimators': 10000, 'learning_rate': 0.05, 'metric': 'auc', 'num_leaves': 237, 'max_depth': 31, 'reg_alpha': 4.571551679011291, 'reg_lambda': 12.577178152686312, 'colsample_bytree': 0.2301028578381579, 'subsample': 0.9390970911797094, 'cat_smooth': 47.58241398941176}

----- FOLD 0 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.896294
[400]	valid_0's auc: 0.896997
[600]	valid_0's auc: 0.896781
[800]	valid_0's auc: 0.896385
Early stopping, best iteration is:
[392]	valid_0's auc: 0.897034

----- FOLD 1 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.896731
[400]	valid_0's auc: 0.897918
[600]	valid_0's auc: 0.897807
[800]	valid_0's auc: 0.897403
Early stopping, best iteration is:
[468]	valid_0's auc: 0.897999

----- FOLD 2 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.896685
[400]	valid_0's auc: 0.897874
[600]	valid_0's auc: 0.897932
[800]	valid_0's auc: 0.89746
[1000]	valid_0's auc: 0.896913
Early stopping, best iteration is:
[542]	valid_0's auc: 0.898078

----- FOLD 3 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.895142
[400]	valid_0's auc: 0.897054
[600]	valid_0's auc: 0.8975
[800]	valid_0's auc: 0.897579
[1000]	valid_0's auc: 0.897427
[1200]	valid_0's auc: 0.897125
Early stopping, best iteration is:
[878]	valid_0's auc: 0.897628

----- FOLD 4 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.8978
[400]	valid_0's auc: 0.899139
[600]	valid_0's auc: 0.899125
[800]	valid_0's auc: 0.898646
Early stopping, best iteration is:
[462]	valid_0's auc: 0.899286

----- FOLD 5 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.895412
[400]	valid_0's auc: 0.896735
[600]	valid_0's auc: 0.896682
[800]	valid_0's auc: 0.896384
Early stopping, best iteration is:
[489]	valid_0's auc: 0.896749

----- FOLD 6 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.893099
[400]	valid_0's auc: 0.894654
[600]	valid_0's auc: 0.894668
[800]	valid_0's auc: 0.894456
Early stopping, best iteration is:
[439]	valid_0's auc: 0.89476

----- FOLD 7 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.894966
[400]	valid_0's auc: 0.896051
[600]	valid_0's auc: 0.895623
[800]	valid_0's auc: 0.895191
Early stopping, best iteration is:
[317]	valid_0's auc: 0.896071

----- FOLD 8 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.897141
[400]	valid_0's auc: 0.898075
[600]	valid_0's auc: 0.897992
[800]	valid_0's auc: 0.897573
Early stopping, best iteration is:
[413]	valid_0's auc: 0.898137

----- FOLD 9 -----




Training until validation scores don't improve for 500 rounds
[200]	valid_0's auc: 0.89326
[400]	valid_0's auc: 0.894502


KeyboardInterrupt: 

In [31]:
# Write the submission file and show it.
# Precondition: x_test must already carry an `id` and a `target` (predicted
# probability) column. The frame assembled from `features` has neither, so
# both have to be attached before this cell runs.
submission = x_test.loc[:, ["id", "target"]]
submission.to_csv("./output.csv", index=False)
submission

Unnamed: 0,id,target
0,5,0.023433
1,6,0.120639
2,8,0.000996
3,9,0.198303
4,11,0.040009
...,...,...
199995,499983,0.981807
199996,499984,0.011967
199997,499987,0.610144
199998,499994,0.221175
