In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import lightgbm as gbm

# Load data 

In [2]:
# Input CSV locations, relative to the notebook's working directory.
train_path, test_path = (
    "../data/local_train.csv",
    "../data/local_test.csv",
)

In [3]:
# Read both CSV files into DataFrames (training frame first, then test frame).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))


In [4]:
# Number of rows in the training frame.
len(df_train)

180000

In [5]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,4700,B,F,A,B,E,BQ,A,N,BL,...,0.236103,0.31648,0.600188,0.192628,0.423268,0.176169,0.288662,0.793823,0.315937,0
1,453468,A,F,G,C,G,BI,A,U,AE,...,0.627595,0.657871,0.285636,0.73543,0.624774,0.624786,0.702093,0.305428,0.509672,0
2,369751,A,K,G,A,D,BU,A,E,Y,...,0.568753,0.790244,0.829496,0.015555,0.201568,0.786024,0.385816,0.431418,0.953504,0
3,69873,B,I,A,A,E,BI,A,Y,AU,...,0.775825,0.822142,0.796378,0.680048,0.246919,0.877006,0.611704,0.839066,0.934136,0
4,170717,A,M,A,B,F,BI,I,AH,BM,...,0.256564,0.5794,0.291327,0.737406,0.579729,0.290138,0.351962,0.269911,0.308573,1


In [6]:
# List the column names: an id, the cat*/cont* feature columns and the target.
df_train.columns

Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15',
       'cat16', 'cat17', 'cat18', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4',
       'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'target'],
      dtype='object')

# Encode categorical labels as integers

In [7]:
# Feature column names, generated rather than spelled out:
# categorical columns cat0..cat18 and continuous columns cont0..cont10.
cat_col = [f"cat{idx}" for idx in range(19)]
cont_col = [f"cont{idx}" for idx in range(11)]

In [8]:
# Replace every categorical column with integer codes, in place, in both frames.
# Each column gets its own encoder, fitted on the train and test values together
# so the two frames share one label -> integer mapping.
for col in cat_col:
    train_values = df_train[col].values
    test_values = df_test[col].values
    encoder = LabelEncoder().fit(list(train_values) + list(test_values))
    df_train[col] = encoder.transform(train_values)
    df_test[col] = encoder.transform(test_values)
    print("done ", col)

done  cat0
done  cat1
done  cat2
done  cat3
done  cat4
done  cat5
done  cat6
done  cat7
done  cat8
done  cat9
done  cat10
done  cat11
done  cat12
done  cat13
done  cat14
done  cat15
done  cat16
done  cat17
done  cat18


In [9]:
# Separate the label from the features; `id` is dropped along with the label
# so it is not used as a model input.
y = df_train["target"]
X = df_train.drop(columns=["id", "target"])


In [10]:
# (rows, columns) of the feature matrix.
X.shape

(180000, 30)

In [11]:
# Length of the label vector; should match the row count of X.
y.shape

(180000,)

# Split into training and validation sets

In [12]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split repeatable. Note that X_test / y_test here are the validation split,
# not the rows of df_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=111,
)

In [13]:
# LightGBM training set; the cat* columns are declared categorical by name.
train_data = gbm.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=cat_col,
)

In [14]:
# Validation set, built with the training Dataset passed as its `reference`.
validation_data = gbm.Dataset(
    data=X_test,
    label=y_test,
    reference=train_data,
)

In [15]:
# Hyper-parameters passed to gbm.train().
lgbm_params = dict(
    # Task definition: gradient-boosted trees, binary target, log-loss metric.
    boosting='gbdt',
    application='binary',
    learning_rate=0.005,
    metric='binary_logloss',
    # Regularisation strengths.
    lambda_l1=4e-05,
    lambda_l2=1.35e-08,
    # Tree shape and feature / row subsampling.
    num_leaves=50,
    feature_fraction=0.7,
    bagging_fraction=0.4,
    bagging_freq=5,
    max_depth=50,
    min_child_samples=100,
)


In [16]:
# Train for at most `num_round` boosting rounds, stopping early once the
# validation metric has not improved for 10 rounds; log every 200 rounds.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are deprecated in
# newer LightGBM releases in favour of callbacks; confirm against the
# installed version before upgrading.
num_round = 3000
bst = gbm.train(
    params=lgbm_params,
    train_set=train_data,
    num_boost_round=num_round,
    valid_sets=[validation_data],
    early_stopping_rounds=10,
    verbose_eval=200,
)

New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


[LightGBM] [Info] Number of positive: 43102, number of negative: 118898
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3390
[LightGBM] [Info] Number of data points in the train set: 162000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.266062 -> initscore=-1.014697
[LightGBM] [Info] Start training from score -1.014697
Training until validation scores don't improve for 10 rounds




[200]	valid_0's binary_logloss: 0.412918
[400]	valid_0's binary_logloss: 0.374727
[600]	valid_0's binary_logloss: 0.362295
[800]	valid_0's binary_logloss: 0.356812
[1000]	valid_0's binary_logloss: 0.354004
[1200]	valid_0's binary_logloss: 0.352227
[1400]	valid_0's binary_logloss: 0.351115
[1600]	valid_0's binary_logloss: 0.350246
Early stopping, best iteration is:
[1634]	valid_0's binary_logloss: 0.35014


# Evaluation

In [17]:
# A test frame that carries a `target` column means this is a local run with
# ground truth available. Either way, drop the non-feature columns so the
# frame contains only model inputs.
local_flag = "target" in df_test.columns
non_feature_cols = ["id", "target"] if local_flag else ["id"]
df_test2 = df_test.drop(columns=non_feature_cols)

In [18]:
# for col in cat_col:
#     df_test2[col] = df_test2[col].apply(hash_cat_two)
#     print("done ", col)

In [19]:
# ?bst

In [20]:
# Preview the test features; the cat* columns now hold integer codes.
df_test2.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,0,1,0,1,7,33,6,31,0,0,...,0.832036,0.936027,0.277879,0.677056,0.556303,0.861982,0.880029,0.700449,0.408714,0.457853
1,0,8,0,0,3,2,12,8,12,0,...,0.461403,0.465581,0.773081,0.257367,0.1188,0.076394,0.617704,0.786615,0.463401,0.53498
2,0,5,0,0,4,33,0,32,60,0,...,0.727531,0.799496,0.792766,0.574415,0.240725,0.263741,1.005591,0.710779,0.878274,0.967304
3,1,8,5,1,5,33,0,14,45,0,...,0.294513,0.301259,0.340367,0.38537,0.55585,0.486549,0.338415,0.392193,0.255491,0.3984
4,0,10,0,0,4,33,2,26,38,5,...,0.354699,0.317978,0.590793,0.207941,0.734811,0.569722,0.569788,0.477121,0.530924,0.349163


In [21]:
# Score the test features using only the trees up to the best iteration
# recorded on the booster during training.
pred = bst.predict(
    data=df_test2,
    num_iteration=bst.best_iteration,
)

In [22]:
# Display the test feature frame (pandas truncates the rendering).
# NOTE(review): this shows the whole frame and largely repeats the earlier
# df_test2.head() preview; consider .tail() or .shape instead.
df_test2

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,0,1,0,1,7,33,6,31,0,0,...,0.832036,0.936027,0.277879,0.677056,0.556303,0.861982,0.880029,0.700449,0.408714,0.457853
1,0,8,0,0,3,2,12,8,12,0,...,0.461403,0.465581,0.773081,0.257367,0.118800,0.076394,0.617704,0.786615,0.463401,0.534980
2,0,5,0,0,4,33,0,32,60,0,...,0.727531,0.799496,0.792766,0.574415,0.240725,0.263741,1.005591,0.710779,0.878274,0.967304
3,1,8,5,1,5,33,0,14,45,0,...,0.294513,0.301259,0.340367,0.385370,0.555850,0.486549,0.338415,0.392193,0.255491,0.398400
4,0,10,0,0,4,33,2,26,38,5,...,0.354699,0.317978,0.590793,0.207941,0.734811,0.569722,0.569788,0.477121,0.530924,0.349163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,1,8,0,0,5,33,4,40,35,0,...,0.460675,0.690114,0.556406,0.778571,0.280025,0.544267,0.524250,0.502416,0.782653,0.683392
119996,1,10,0,4,7,33,0,30,38,0,...,0.702119,0.698665,0.771236,0.806725,0.556306,0.339435,0.740020,0.420333,0.957234,0.777523
119997,0,10,0,0,4,33,2,39,38,7,...,0.325691,0.245823,0.243893,0.808406,0.273086,0.577464,0.283429,0.347557,0.254384,0.269322
119998,0,7,0,0,5,45,7,28,3,5,...,0.621029,0.940450,0.082610,0.238615,0.274609,0.425721,0.321446,0.323698,0.525235,0.796209


In [23]:
# Start the output frame from the test ids. The explicit .copy() makes it an
# independent DataFrame, so adding a column to it later does not raise
# SettingWithCopyWarning.
df_pred_final = df_test[["id"]].copy()

In [24]:
# Attach the model's predicted scores as the `target` column.
df_pred_final["target"] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred_final["target"] = pred


In [25]:
# Preview the id / predicted-target pairs.
df_pred_final.head()

Unnamed: 0,id,target
0,99402,0.962721
1,92026,0.029618
2,102280,0.05097
3,293468,0.048771
4,56800,0.149017


In [26]:
if local_flag:
    # Local run: df_test carries ground-truth labels, so report ROC AUC of the
    # predictions (roc_auc_score is imported in the notebook's import cell).
    score = roc_auc_score(df_test["target"], pred)
    print(score)
else:
    # No labels available: write the id/target predictions as the submission.
    df_pred_final.to_csv('submission.csv', index=False)

0.8918187129018053


In [27]:
# y_test_label = df_test["target"]

In [28]:
# roc_auc_score(y_test_label, pred)