In [1]:
import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

In [2]:
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort rows on prediction column
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred * 1. / G_true

In [3]:
cv_only = True
save_cv = True
full_train = False

In [4]:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', Gini(labels, preds), True

In [5]:
path = "../input/"
NROWS = 10000

In [6]:
train = pd.read_csv(path+'train.csv', nrows=NROWS)
train_label = train['target']
train_id = train['id']
test = pd.read_csv(path+'test.csv', nrows=NROWS)
test_id = test['id']

In [7]:
print(train.shape)
train.head()

(10000, 59)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,10,1,-1,0,1,4,1,0,0,1,12,2,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.0,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,7,1,0,0,1,11,1,1,3,1,104,1,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,11,1,-1,0,-1,14,1,1,2,1,82,3,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0


In [8]:
print(test.shape)
test.head()

(10000, 58)


Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,0,0,0,0,0,12,1,0,0,0.5,0.3,0.610328,7,1,-1,0,-1,1,1,1,2,1,65,1,0.316228,0.669556,0.352136,3.464102,0.1,0.8,0.6,1,1,6,3,6,2,9,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,0,0,0,0,0,5,1,0,0,0.9,0.5,0.771362,4,1,-1,0,0,11,1,1,0,1,103,1,0.316228,0.60632,0.358329,2.828427,0.4,0.5,0.4,3,3,8,4,10,2,7,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,0,0,0,0,0,10,0,0,0,0.4,0.0,0.916174,11,1,-1,0,-1,14,1,1,2,1,29,3,0.4,0.896239,0.398497,3.316625,0.6,0.6,0.6,2,3,7,4,6,3,12,4,0,2,4,0,0,0,0,0,0
3,3,0,1,6,0,0,1,0,0,0,0,0,0,0,0,4,1,0,0,0.1,0.2,-1.0,7,1,-1,0,-1,1,1,1,2,1,40,2,0.374166,0.65211,0.381445,2.44949,0.1,0.5,0.5,2,1,7,3,12,1,13,5,1,0,5,1,0,1,0,0,0
4,4,5,1,7,0,0,0,0,0,1,0,0,0,0,0,4,1,0,0,0.9,0.4,0.817771,11,1,-1,0,-1,11,1,1,2,1,101,3,0.374166,0.812914,0.385097,3.316625,0.9,0.6,0.8,3,4,7,1,10,4,12,4,0,0,4,0,1,1,0,0,1


In [9]:
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

In [10]:
y = train['target'].values
drop_feature = [
    'id',
    'target'
]

In [11]:
X = train.drop(drop_feature,axis=1)
feature_names = X.columns.tolist()
cat_features = [c for c in feature_names if ('cat' in c and 'count' not in c)]
num_features = [c for c in feature_names if ('cat' not in c and 'calc' not in c)]
print(len(cat_features))
print(len(num_features))

14
23


In [12]:
cat_features

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [13]:
num_features

['ps_ind_01',
 'ps_ind_03',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15']

# feature engineering

In [14]:
# 각 샘플마다 missing값 몇 개인지 missing column에 float 형태로 적어둔다
train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)
num_features.append('missing')

In [15]:
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,missing
0,7,0,2,2,5,1,0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,10,1,-1,0,1,4,1,0,0,1,12,2,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1,1.0
1,9,0,1,1,7,0,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0,2.0
2,13,0,5,4,9,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.0,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0,3.0
3,16,0,0,1,2,0,0,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,7,1,0,0,1,11,1,1,3,1,104,1,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0,0.0
4,17,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,11,1,-1,0,-1,14,1,1,2,1,82,3,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0,2.0


In [16]:
# train과 test에서 cat column을 label encoding("string" -> 숫자)한다
# factorize도 같은 역할 한다
for c in cat_features:
    le = LabelEncoder()
    # train만으로 fit한다
    le.fit(train[c])
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

In [17]:
# label encoding으로 column 수 달라진 것 없다
print(train.shape)
train.head()

(10000, 60)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,missing
0,7,0,2,2,5,2,1,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,11,1,0,0,2,4,2,0,1,1,11,2,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1,1.0
1,9,0,1,1,7,1,1,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,12,1,0,0,0,11,2,1,3,1,18,3,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0,2.0
2,13,0,5,4,9,2,1,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.0,8,1,0,0,0,14,2,1,3,1,59,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0,3.0
3,16,0,0,1,2,1,1,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,8,1,1,0,2,11,2,1,4,1,103,1,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0,0.0
4,17,0,0,2,0,2,1,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,12,1,0,0,0,14,2,1,3,1,81,3,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0,2.0


In [18]:
enc = OneHotEncoder()
enc.fit(train[cat_features])
'''
X_cat, X_t_cat은 아래와 같은 형태가 된다
array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])
       '''
X_cat = enc.transform(train[cat_features])
X_t_cat = enc.transform(test[cat_features])

In [19]:
X_cat

<10000x183 sparse matrix of type '<class 'numpy.float64'>'
	with 140000 stored elements in Compressed Sparse Row format>

In [20]:
X_cat.toarray

<bound method _cs_matrix.toarray of <10000x183 sparse matrix of type '<class 'numpy.float64'>'
	with 140000 stored elements in Compressed Sparse Row format>>

In [21]:
ind_features = [c for c in feature_names if 'ind' in c]
count=0
# ind feature들을 하나로 묶어서 새로운 feature를 만들어낸 것
for c in ind_features:
    # count는 do while 사용할 목적으로 썻네
    if count==0:
        train['new_ind'] = train[c].astype(str)+'_'
        test['new_ind'] = test[c].astype(str)+'_'
        count+=1
    else:
        train['new_ind'] += train[c].astype(str)+'_'
        test['new_ind'] += test[c].astype(str)+'_'

In [22]:
# new_ind column에 7천여개의 서로 다른 값이 생겼다 생겼다
train['new_ind'].unique().shape[0]

7010

In [23]:
# "new_ind"를 frquency encoding 하는 과정
cat_count_features = []
for c in cat_features+['new_ind']:
    # pd.concat 할 때 ()안에 []안에 series 들어가야 하는 형태. 하나의 컬럼으로 합쳐진다. 그 후 dict로 변환
    # c column의 값과 개수를 딕셔너리화 하고, 
    d = pd.concat([train[c],test[c]]).value_counts().to_dict()
    # %s에 c값이 들어간다
    # get(x,0)은 x에 해당하는 값을 찾고, 없으면 0을 return한다
    # c_count 컬럼에에 각각의 값에 해당하는 횟수를 입력한다. 없으면 0을 넣는다
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    test['%s_count'%c] = test[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)

In [24]:
# 아래는 "train[num_features+cat_count_features].values"로 얻은 np.array와 X_cat으로 얻은 np.array가 들어가 있는 list
train_list = [train[num_features+cat_count_features].values,X_cat,]
test_list = [test[num_features+cat_count_features].values,X_t_cat,]

In [25]:
print(train_list[0].shape)
print(train_list[1].shape)

(10000, 39)
(10000, 183)


In [26]:
# 이게 결국 train 시킬 최종 데이터다 (39 + 183)
X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr()
print(X.shape)
print(X_test.shape)

(10000, 222)
(10000, 222)


# Model Development

In [27]:
learning_rate = 0.1
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000
params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": learning_rate,
          "num_leaves": num_leaves,
           "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 0,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9
          }

In [28]:
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))

In [29]:
for s in range(16):
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    # seed 값을 0~15로 다양하게 한다
    params['seed'] = s

    if cv_only:
        kf = kfold.split(X, train_label)

        best_trees = []
        fold_scores = []
        
        # train_fold랑 validate는 index
        for i, (train_fold, validate) in enumerate(kf):
            # label_train 은 y_train을 의미하고 이를 np.array형태로 반환됨
            X_train, X_validate, label_train, label_validate = \
                X[train_fold, :], X[validate, :], train_label[train_fold], train_label[validate]
            # xgb.DMatrix와 동일한 의미
            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100,
                            early_stopping_rounds=100)
            best_trees.append(bst.best_iteration)
#             print("###############", bst.best_iteration)
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            cv_train[validate] += bst.predict(X_validate)

            score = Gini(label_validate, cv_train[validate])
#             print("**************", s, i, score)
            fold_scores.append(score)
        
        cv_pred /= NFOLDS
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        print("cv score:")
        print (Gini(train_label, cv_train))
        print ("current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1)
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(Gini(train_label, cv_train))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.247265	valid_0's gini: -0.0761586
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.247265	valid_0's gini: -0.0761586
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.247397	valid_0's gini: -0.00436317
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.247397	valid_0's gini: -0.00436317
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.247397	valid_0's gini: -0.0709596
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.247397	valid_0's gini: -0.0709596
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.247397	valid_0's gini: -0.0313492
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.247397	valid_0's gini: -0.0313492
Training until validation scores don't improve for 100 rounds.
[100]	v

In [30]:
# print(X_train.shape)
# print(X_validate.shape)
# print(label_train.shape)
# print(label_validate.shape)
# print(type(dtrain))

# uniqueVals = np.unique(train_fold)
# my_list = uniqueVals.tolist()
# # print(my_list)
# print(type(train_fold))

# uniqueVals_val = np.unique(validate)
# my_list_val = uniqueVals_val.tolist()
# # print(my_list_val)

In [31]:
print(x_score)
#pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}).to_csv('../model/lgbm3_pred_avg.csv', index=False)
#pd.DataFrame({'id': train_id, 'target': final_cv_train / 16.}).to_csv('../model/lgbm3_cv_avg.csv', index=False)

[-6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05, -6.609332761807546e-05]


In [32]:
label_train

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
10      0
11      0
13      0
14      0
15      0
16      0
17      0
18      0
19      1
20      0
21      0
22      0
23      0
26      0
27      0
28      1
29      0
30      0
31      0
32      0
35      0
       ..
9968    0
9969    0
9970    0
9972    0
9973    0
9974    0
9975    0
9976    0
9977    0
9978    0
9979    0
9980    0
9981    0
9982    0
9983    0
9984    0
9985    0
9986    0
9988    1
9989    0
9990    0
9991    0
9992    0
9993    0
9994    0
9995    0
9996    0
9997    0
9998    0
9999    0
Name: target, Length: 8001, dtype: int64