In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, auc
import lightgbm as lgb
from functools import partial
import copy
import time
import matplotlib.pyplot as plt
import scipy as sp
import random
from hyperopt import hp, tpe, Trials, fmin, space_eval
import cv2
from catboost import CatBoost
from catboost import Pool
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Directory that holds the competition CSV files; defined once so the
# location only has to be changed in a single place.
INPUT_DIR = "../../../20200125atma/input/"

# Raw tables used by the feature-engineering steps.
train = pd.read_csv(INPUT_DIR + "train.csv")
test = pd.read_csv(INPUT_DIR + "test.csv")
userlog = pd.read_csv(INPUT_DIR + "user_log.csv")
poi = pd.read_csv(INPUT_DIR + "poi.csv")

# Feature engineering (FE)

In [3]:
# Count the yellow regions of every image; the per-image counts are collected in `list_y`.
N_IMAGES = 13411  # images are named 0.png .. 13410.png
IMAGE_DIR = '../../../20200125atma/input/images/'
MIN_CONTOUR_POINTS = 30  # contours with this many points or fewer are treated as noise

# The structuring element does not depend on the image, so it is built once.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

list_y = []
for i in range(N_IMAGES):
    image_path = IMAGE_DIR + str(i) + '.png'
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        raise FileNotFoundError("could not read image: " + image_path)
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)  # convert to the HSV colour space

    # One binary mask per colour of interest (HSV lower / upper bounds).
    red = cv2.inRange(hsv, np.array([145, 70, 0]), np.array([180, 255, 255]))
    yellow = cv2.inRange(hsv, (15, 0, 0), (36, 255, 255))
    green = cv2.inRange(hsv, np.array([30, 190, 0]), np.array([90, 255, 255]))
    blue = cv2.inRange(hsv, np.array([108, 121, 0]), np.array([120, 255, 255]))
    white = cv2.inRange(hsv, np.array([108, 21, 0]), np.array([255, 70, 255]))

    # Only the white mask contains speckle noise, so erode the white mask itself
    # (eroding the yellow mask here would throw the white mask away).
    white = cv2.erode(white, kernel)

    bin_imgs = {'red': red, 'yellow': yellow, 'green': green,
                'blue': blue, 'white': white}

    for label, bin_img in bin_imgs.items():
        contours, _ = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        count = sum(1 for cnt in contours if len(cnt) > MIN_CONTOUR_POINTS)

        # Only the yellow count is kept as a feature.
        if label == "yellow":
            list_y.append(count)

In [4]:
def preprocess(train, test, userlog, poi):
    """Join store (POI) attributes onto train/test and the user log, then derive distance features.

    Args:
        train: frame with at least ``session_id`` and ``pid`` columns.
        test: frame with at least ``session_id`` and ``pid`` columns.
        userlog: log rows with ``session_id``, ``latitude``, ``longitude``, ``sysname``,
            ``lang``, ``hour``, ``minute`` and ``second`` columns. The caller's frame is
            not modified.
        poi: store table keyed by ``pid`` with ``latitude`` and ``longitude`` columns;
            ``radius`` must be available on train/test after the join.

    Returns:
        Tuple ``(train, test, userlog, poi, unique_sess)``:
        train / test with the POI columns merged in, the enriched log (one row per
        log entry and store of its session), the renamed POI table, and a frame with
        the number of distinct sessions per ``pid`` (``uni_sess_per_pid``).
    """
    store_cols = ["session_id", "store_lat", "store_lon", "radius", "pid"]

    # Work on a copy and use .loc: chained indexing (df[col][mask] = ...) may write
    # to a temporary object and would otherwise also modify the caller's frame.
    userlog = userlog.copy()
    userlog.loc[userlog["sysname"] == "ANDROID", "sysname"] = "Android"
    # NOTE(review): this replaces "ja_JP" with the identical string, i.e. it is a no-op.
    # Presumably another spelling was meant to be normalised - confirm.
    userlog.loc[userlog["lang"] == "ja_JP", "lang"] = "ja_JP"

    poi = poi.rename(columns={"latitude": "store_lat", "longitude": "store_lon"})
    train = pd.merge(train, poi, on="pid", how="left")
    test = pd.merge(test, poi, on="pid", how="left")

    # Attach the store of every (session, pid) pair in a single merge. Merging train
    # and test separately and then taking a per-column maximum mixes attributes of
    # different stores whenever a session occurs in both tables.
    store_info = pd.concat([train[store_cols], test[store_cols]], ignore_index=True).drop_duplicates()
    userlog = pd.merge(userlog, store_info, on="session_id", how="left")

    # Straight-line distance in raw coordinate units.
    userlog["distance"] = np.sqrt(
        (userlog["latitude"] - userlog["store_lat"]) ** 2
        + (userlog["longitude"] - userlog["store_lon"]) ** 2
    )

    time_text = userlog["hour"].map(str) + ":" + userlog["minute"].map(str) + ":" + userlog["second"].map(str)
    userlog["time"] = pd.to_datetime(time_text, format='%H:%M:%S')
    userlog = userlog.sort_values(["session_id", "hour", "minute", "second"]).reset_index(drop=True)

    # NOTE(review): np.sin / np.cos expect radians, but the coordinates are passed in
    # unconverted and look like degrees - confirm whether np.radians() is missing and
    # which unit `radius` uses before relying on `virtual_dis` / `in_store`.
    cos_angle = (
        np.sin(userlog["latitude"]) * np.sin(userlog["store_lat"])
        + np.cos(userlog["latitude"]) * np.cos(userlog["store_lat"])
        * np.cos(userlog["longitude"] - userlog["store_lon"])
    )
    # Rounding can push the cosine marginally outside [-1, 1], which makes arccos return NaN.
    userlog["virtual_dis"] = 6370 * np.arccos(cos_angle.clip(-1.0, 1.0))
    userlog["in_store"] = userlog["virtual_dis"] < userlog["radius"]

    # Number of distinct sessions observed for each store.
    unique_sess = (
        userlog.groupby("pid")["session_id"].nunique()
        .reset_index(drop=False)
        .rename(columns={"session_id": "uni_sess_per_pid"})
    )
    userlog = pd.merge(userlog, unique_sess, on="pid", how="left")
    return train, test, userlog, poi, unique_sess
# Run the preprocessing; this rebinds train / test / userlog / poi to the processed frames.
train, test, userlog, poi, unique_sess = preprocess(train, test, userlog, poi)

In [5]:
def encode(train, test, userlog):
    """Replace the ``sysname``, ``lang`` and ``timezone`` columns of the log by integer codes.

    Each column's distinct values are sorted and numbered from 0 in that order.
    Missing values are left out of the code table and stay missing after encoding.

    Args:
        train: passed through unchanged.
        test: passed through unchanged.
        userlog: log frame containing the three columns; it is not modified.

    Returns:
        Tuple ``(train, test, userlog)`` where ``userlog`` is a new frame with the
        encoded columns.
    """
    encoded = {}
    for column in ("sysname", "lang", "timezone"):
        # dropna(): NaN cannot be ordered against strings by sorted().
        categories = sorted(userlog[column].dropna().unique())
        code_of = {category: code for code, category in enumerate(categories)}
        encoded[column] = userlog[column].map(code_of)

    # assign() returns a new frame, so the caller's frame keeps its raw values.
    userlog = userlog.assign(**encoded)
    return train, test, userlog
# Encode the categorical log columns; train and test are returned unchanged.
train, test, userlog = encode(train, test, userlog)

In [88]:
# Spot check: show the train row(s) of a single session after the POI merge.
train[train.session_id == "0003f26df5d8b928b416fba58efd5c91"]

Unnamed: 0,target,imid,pid,session_id,store_lat,store_lon,radius,type,name
6051,0,6051,354655367,0003f26df5d8b928b416fba58efd5c91,35.651739,46.776134,118,4,4


In [84]:
#userlog[userlog.session_id == "0003f26df5d8b928b416fba58efd5c91"]["pid"].value_counts()

In [6]:
# Preview the enriched and encoded log.
userlog.head(3)

Unnamed: 0,latitude,longitude,sysname,optout,lang,timezone,session_id,hour,minute,second,day_of_week,categorical_1,categorical_2,categorical_3,categorical_4,categorical_5,categorical_6,store_lat,store_lon,radius,pid,distance,time,virtual_dis,in_store,uni_sess_per_pid
0,35.6503,46.780509,0,0,9,1,0003f26df5d8b928b416fba58efd5c91,0,33,31,6,4,0,44,1.0,1,187,35.651739,46.776134,118.0,354655367.0,0.004606,1900-01-01 00:33:31,15.744307,True,16
1,35.650298,46.780512,0,0,9,1,0003f26df5d8b928b416fba58efd5c91,0,33,31,6,4,1,44,1.0,1,187,35.651739,46.776134,118.0,354655367.0,0.004609,1900-01-01 00:33:31,15.758882,True,16
2,35.650286,46.780524,0,0,9,1,0003f26df5d8b928b416fba58efd5c91,2,18,33,6,4,2,44,1.0,1,187,35.651739,46.776134,118.0,354655367.0,0.004624,1900-01-01 02:18:33,15.832117,True,16


In [89]:
def get_logdata(user_sample):
    """Aggregate the log rows of one group into a single feature record.

    Args:
        user_sample: non-empty frame of log rows with the columns ``session_id``,
            ``sysname``, ``lang``, ``timezone``, ``optout``, ``distance``, ``latitude``,
            ``longitude``, ``categorical_1`` .. ``categorical_6``, ``pid``,
            ``day_of_week`` and ``in_store``.

    Returns:
        A one-element list holding the feature dict, so callers can concatenate the
        results of several groups.
    """
    # Materialise the first row once; every per-group constant is read from it.
    first_row = user_sample.iloc[0]

    distance = np.array(user_sample["distance"])
    if np.isnan(distance).all():
        # No usable distance at all: report missing values instead of failing on
        # the lookup of the closest row.
        max_dist = min_dist = std_dist = np.nan
        closest_lat = closest_lon = np.nan
    else:
        max_dist = np.nanmax(distance)
        min_dist = np.nanmin(distance)
        std_dist = np.nanstd(distance)
        # Position of the first row that attains the minimum distance.
        closest_pos = int(np.nanargmin(distance))
        closest_lat = user_sample["latitude"].iloc[closest_pos]
        closest_lon = user_sample["longitude"].iloc[closest_pos]

    features = {
        "accumulated_actions": user_sample.shape[0],
        "session_id": first_row["session_id"],
        "OS": first_row["sysname"],
        "lang": first_row["lang"],
        "timezone": first_row["timezone"],
        "optout_count": np.sum(user_sample["optout"]),
        "max_dist": max_dist,
        "min_dist": min_dist,
        "std_dist": std_dist,
        "closest_lat": closest_lat,
        "closest_lon": closest_lon,
    }

    # Number of distinct values seen in each categorical column.
    for n in range(1, 7):
        column = "categorical_" + str(n)
        features[column] = user_sample[column].nunique()

    #features["cont_hour"] = np.max(user_sample["hour"]) - np.min(user_sample["hour"])
    #features["closest_min"] = user_sample[user_sample.distance == features["min_dist"]].iloc[0]["minute"]
    #features["closest_second"] = user_sample[user_sample.distance == features["min_dist"]].iloc[0]["second"]
    #instore_queue = np.array2string(np.array(user_sample["in_store"].astype(int)), separator=',')
    #features["exit_count"] = instore_queue[1:-1].replace(',', '').count("10")
    #second_min_index = user_sample["distance"].nsmallest(2).index[-1]
    #features["2ndmin_dist"] = user_sample.loc[second_min_index]["distance"] # use loc when passing an index label directly
    features["pid"] = int(first_row["pid"])

    # Rows per day of week; days that do not occur keep a count of 0.
    day_of_week_counts = {"day" + str(day): 0 for day in range(7)}
    for day, n_rows in Counter(user_sample['day_of_week']).items():
        day_of_week_counts["day" + str(day)] += n_rows
    features.update(day_of_week_counts)

    features["in_store"] = np.sum(user_sample["in_store"])

    return [features]

In [90]:
def get_log_info(userlog):
    """Reduce the log to one feature row per (session_id, pid) group.

    Args:
        userlog: enriched log frame containing ``session_id`` and ``pid`` plus the
            columns read by ``get_logdata``.

    Returns:
        DataFrame with one row per group, built from the records returned by
        ``get_logdata``.
    """
    grouped = userlog.groupby(['session_id', 'pid'], sort=False)

    compiled_log = []
    # The bar advances once per (session_id, pid) group, so its total must be the
    # number of groups rather than the number of distinct sessions.
    for _, user_sample in tqdm(grouped, total=grouped.ngroups, desc='session_id', position=0):
        compiled_log.extend(get_logdata(user_sample))
    return pd.DataFrame(compiled_log)
# Build the per-(session_id, pid) feature table from the log.
reduced_log = get_log_info(userlog)

HBox(children=(IntProgress(value=0, description='session_id', max=5601, style=ProgressStyle(description_width=…




In [91]:
def postprocess(train, test, reduced_log, unique_sess, yellow_counts=None):
    """Assemble the final train / test feature tables.

    Args:
        train: preprocessed train frame with ``session_id``, ``pid``, ``imid`` and
            ``target`` columns.
        test: preprocessed test frame with ``session_id``, ``pid`` and ``imid`` columns.
        reduced_log: per-(session_id, pid) log features.
        unique_sess: per-``pid`` session counts.
        yellow_counts: per-image yellow-region counts, train images first and test
            images after them. Defaults to the notebook-level ``list_y``.

    Returns:
        Tuple ``(new_train, new_test)`` with ``imid`` and ``pid`` removed.

    Raises:
        ValueError: if the number of image counts does not match the number of rows.
    """
    if yellow_counts is None:
        yellow_counts = list_y
    drop_features = ["imid", "pid"]

    new_train = pd.merge(train, reduced_log, on=["session_id", "pid"], how="left")
    new_test = pd.merge(test, reduced_log, on=["session_id", "pid"], how="left")

    # Split the image feature at the actual train size instead of a hard-coded row count.
    n_train, n_test = len(new_train), len(new_test)
    if len(yellow_counts) != n_train + n_test:
        raise ValueError(
            "expected {} image counts (train + test rows), got {}".format(n_train + n_test, len(yellow_counts))
        )
    new_train["yellow_in_pic"] = list(yellow_counts[:n_train])
    new_test["yellow_in_pic"] = list(yellow_counts[n_train:])

    # Count encoding: number of train rows that reference each store.
    train_rows_per_pid = train.groupby('pid').target.count()
    new_train['pid_count_enc'] = new_train['pid'].map(train_rows_per_pid)
    new_test['pid_count_enc'] = new_test['pid'].map(train_rows_per_pid)

    new_train = pd.merge(new_train, unique_sess, on="pid", how="left")
    new_test = pd.merge(new_test, unique_sess, on="pid", how="left")

    # Only the train session ids are label-encoded; the test ids keep their raw values.
    lbl = preprocessing.LabelEncoder()
    new_train["session_id"] = lbl.fit_transform(list(new_train["session_id"]))

    new_train = new_train.drop(drop_features, axis=1)
    new_test = new_test.drop(drop_features, axis=1)
    print(new_train.shape, new_test.shape)
    return new_train, new_test
# Build the model-ready tables; the function prints their shapes.
new_train, new_test = postprocess(train, test, reduced_log, unique_sess)

(6612, 34) (6799, 33)


In [92]:
# Preview the assembled training features.
new_train.head(3)

Unnamed: 0,target,session_id,store_lat,store_lon,radius,type,name,OS,accumulated_actions,categorical_1,categorical_2,categorical_3,categorical_4,categorical_5,categorical_6,closest_lat,closest_lon,day0,day1,day2,day3,day4,day5,day6,in_store,lang,max_dist,min_dist,optout_count,std_dist,timezone,yellow_in_pic,pid_count_enc,uni_sess_per_pid
0,0,2315,36.320751,46.104755,25,2,0,1,187,2,3,2,1,4,7,36.320923,46.104648,0,0,0,0,187,0,0,149,7,0.072056,0.000203,0,0.014947,1,4,15,26
1,0,2279,35.564913,46.744633,51,0,1,0,111,1,3,1,1,2,18,35.565118,46.74396,111,0,0,0,0,0,0,111,9,0.005153,0.000704,0,0.00058,1,3,18,37
2,0,959,35.693777,46.784288,189,0,1,0,131,1,3,1,0,3,1,35.693449,46.784401,0,0,0,0,0,0,131,131,9,0.012322,0.000347,0,0.003843,1,1,11,21


In [93]:
#new_train["session_id"].value_counts()
# Show all train rows that share one label-encoded session id.
new_train[new_train.session_id == 2792]

Unnamed: 0,target,session_id,store_lat,store_lon,radius,type,name,OS,accumulated_actions,categorical_1,categorical_2,categorical_3,categorical_4,categorical_5,categorical_6,closest_lat,closest_lon,day0,day1,day2,day3,day4,day5,day6,in_store,lang,max_dist,min_dist,optout_count,std_dist,timezone,yellow_in_pic,pid_count_enc,uni_sess_per_pid
5,0,2792,35.174939,44.114999,26,3,6,1,583,1,3,1,0,1,18,35.175385,44.114292,583,0,0,0,0,0,0,5,7,0.222833,0.000836,0,0.037592,1,0,3,6
186,0,2792,35.140178,43.949223,57,4,4,1,583,1,3,1,0,1,18,35.140472,43.949955,583,0,0,0,0,0,0,15,7,0.19212,0.000789,0,0.038201,1,1,2,3
299,0,2792,35.167237,43.974969,39,0,1,1,583,1,3,1,0,1,18,35.167419,43.975208,583,0,0,0,0,0,0,31,7,0.163985,0.0003,0,0.037623,1,2,18,33
529,0,2792,35.169238,43.97168,179,0,1,1,583,1,3,1,0,1,18,35.168945,43.970814,583,0,0,0,0,0,0,365,7,0.167245,0.000914,0,0.038136,1,3,20,36
1252,0,2792,35.16284,44.076985,68,0,1,1,583,1,3,1,0,1,18,35.163452,44.077213,583,0,0,0,0,0,0,8,7,0.185874,0.000654,0,0.0281,1,0,5,7
1673,0,2792,35.160668,44.06401,32,4,4,1,583,1,3,1,0,1,18,35.160309,44.063755,583,0,0,0,0,0,0,6,7,0.173277,0.00044,0,0.025798,1,0,3,10
1889,0,2792,35.136646,44.053124,192,0,1,1,583,1,3,1,0,1,18,35.137482,44.05344,583,0,0,0,0,0,0,31,7,0.167704,0.000894,0,0.021948,1,1,5,9
2008,0,2792,35.169985,43.970376,192,0,1,1,583,1,3,1,0,1,18,35.169983,43.970295,583,0,0,0,0,0,0,373,7,0.168562,8.2e-05,0,0.038294,1,3,27,44
2810,0,2792,35.166897,44.035417,48,0,1,1,583,1,3,1,0,1,18,35.167084,44.035953,583,0,0,0,0,0,0,29,7,0.144118,0.000568,0,0.024305,1,1,6,12
2815,0,2792,35.167615,43.996665,69,2,0,1,583,1,3,1,0,1,18,35.168121,43.996998,583,0,0,0,0,0,0,186,7,0.142286,0.000606,0,0.033963,1,7,16,32


# modelling

## lgb

In [94]:
# Feature columns handed to LightGBM as categorical.
categoricals = ['lang', 'OS']

# Settings shared by all three LightGBM configurations.
# NOTE(review): 'eval_metric' and 'metric' carry the same value - presumably only one
# of the two keys is needed; confirm against the LightGBM parameter list.
_lgbm_common = {
    'objective': 'binary',
    'eval_metric': 'auc',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'tree_learner': 'serial',
}

# The configurations differ only in learning rate, tree depth and seed.
lgbm_params = dict(_lgbm_common, learning_rate=0.017891320270412462, max_depth=5, random_seed=42)
lgbm_params2 = dict(_lgbm_common, learning_rate=0.01861754102536491, random_seed=43, max_depth=4)
lgbm_params3 = dict(_lgbm_common, learning_rate=0.09482045811322946, random_seed=44, max_depth=3)

def modelling(new_train, new_test, lgbm_params):
    """Train LightGBM with 4-fold stratified cross-validation and average the fold models.

    Args:
        new_train: feature frame that contains a ``target`` column.
        new_test: feature frame with the feature columns of ``new_train``.
        lgbm_params: parameter dict passed to ``lgb.train``.

    Returns:
        Tuple ``(roc, pred_value, feature_importance_df, valid_lgb)``:
        roc: ROC AUC over the pooled out-of-fold predictions.
        pred_value: mean of the fold models' predictions for ``new_test``.
        feature_importance_df: per-fold importances plus ``Average`` / ``Std`` / ``Cv``.
        valid_lgb: single-column DataFrame of out-of-fold predictions, positioned
            like the rows of ``new_train``.
    """
    X_train = new_train.drop(['target'], axis=1).copy()
    y_train = new_train.target.copy()

    # Drop the fixed exclusions plus every constant column.
    remove_features = ["session_id", "categorical_1", "categorical_2", "categorical_3"]
    for col in X_train.columns:
        if (X_train[col].std() == 0) and col not in remove_features:
            remove_features.append(col)
    X_train = X_train.drop(remove_features, axis=1)
    X_test = new_test.drop(remove_features, axis=1)

    n_folds = 4
    skf = StratifiedKFold(n_splits=n_folds)
    models = []

    # `valid` / `real` are pooled in fold order; `valid_lgb` is filled by row position.
    valid = np.array([])
    real = np.array([])
    valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
    feature_importance_df = pd.DataFrame(list(X_train.columns), columns=["Feature"])

    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index, :]
        y_train2 = y_train.iloc[train_index]

        X_test2 = X_train.iloc[test_index, :]
        y_test2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=10000, early_stopping_rounds=100, verbose_eval=500,
                        categorical_feature=categoricals)

        # Predict the held-out fold once and reuse the result for both containers.
        valid_predict = clf.predict(X_test2, num_iteration=clf.best_iteration)
        valid = np.concatenate([valid, valid_predict])
        valid_lgb.iloc[test_index] = valid_predict.reshape(X_test2.shape[0], 1)
        real = np.concatenate([real, y_test2])
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()

        models.append(clf)

    # Columns 1..n_folds hold the per-fold importances.
    fold_importances = feature_importance_df.iloc[:, 1:n_folds+1]
    feature_importance_df["Average"] = np.mean(fold_importances, axis=1)
    feature_importance_df["Std"] = np.std(fold_importances, axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    roc = roc_auc_score(real, valid)
    print("ROC = {}".format(roc))
    print(confusion_matrix(real, np.round(valid)))

    # Average the test predictions of all fold models.
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration=model.best_iteration) / len(models)
    return roc, pred_value, feature_importance_df, valid_lgb
    
# Train the three LightGBM configurations. Feature importances and out-of-fold
# predictions are kept only for the second one; the others discard them.
roc1, pred_value, _, _ = modelling(new_train, new_test, lgbm_params)
roc2, pred_value2, feature_importance_df, valid_lgb = modelling(new_train, new_test, lgbm_params2)
roc3, pred_value3, _, _ = modelling(new_train, new_test, lgbm_params3)

Fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[110]	training's auc: 0.99674	valid_1's auc: 0.862727
Fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[61]	training's auc: 0.986028	valid_1's auc: 0.870604
Fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[166]	training's auc: 0.998	valid_1's auc: 0.868275
Fold 4
Training until validation scores don't improve for 100 rounds.
[500]	training's auc: 0.999998	valid_1's auc: 0.88532
Early stopping, best iteration is:
[502]	training's auc: 1	valid_1's auc: 0.885395
ROC = 0.8542213304522303
[[6474    5]
 [ 127    6]]
Fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[118]	training's auc: 0.984883	valid_1's auc: 0.859323
Fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[

In [74]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` train/test index splits that keep groups intact and balance the labels.

    Groups are assigned greedily, one at a time, to the fold whose choice gives the
    smallest mean (over labels) standard deviation of the per-fold label shares.

    Args:
        X: not used by the splitting logic.
        y: integer class labels, one per sample.
        groups: group identifier of every sample, aligned with ``y``.
        k: number of folds.
        seed: seed for the shuffle applied before groups are ordered.

    Yields:
        ``(train_indices, test_indices)`` lists of sample positions, one pair per fold.
    """
    n_labels = np.max(y) + 1

    # Label tallies per group and over the whole data set.
    label_counts_by_group = defaultdict(lambda: np.zeros(n_labels))
    label_totals = Counter()
    for lbl, grp in zip(y, groups):
        label_counts_by_group[grp][lbl] += 1
        label_totals[lbl] += 1

    fold_label_counts = defaultdict(lambda: np.zeros(n_labels))
    fold_groups = defaultdict(set)

    def imbalance_if_added(group_counts, fold):
        # Temporarily add the group to `fold`, measure the spread, then undo.
        fold_label_counts[fold] += group_counts
        spreads = [
            np.std([fold_label_counts[j][lbl] / label_totals[lbl] for j in range(k)])
            for lbl in range(n_labels)
        ]
        fold_label_counts[fold] -= group_counts
        return np.mean(spreads)

    shuffled = list(label_counts_by_group.items())
    random.Random(seed).shuffle(shuffled)
    # Groups with the most uneven label counts are placed first.
    ordered = sorted(shuffled, key=lambda item: -np.std(item[1]))

    for grp, group_counts in ordered:
        chosen_fold, lowest = None, None
        for fold in range(k):
            score = imbalance_if_added(group_counts, fold)
            if lowest is None or score < lowest:
                chosen_fold, lowest = fold, score
        fold_label_counts[chosen_fold] += group_counts
        fold_groups[chosen_fold].add(grp)

    every_group = set(groups)
    for fold in range(k):
        kept = every_group - fold_groups[fold]
        held_out = fold_groups[fold]

        train_indices = [pos for pos, grp in enumerate(groups) if grp in kept]
        test_indices = [pos for pos, grp in enumerate(groups) if grp in held_out]

        yield train_indices, test_indices
        
def modelling_sgk(new_train, new_test, lgbm_params):
    """Train LightGBM with 5-fold stratified *group* cross-validation (groups = session_id).

    Args:
        new_train: feature frame that contains ``target`` and ``session_id`` columns.
        new_test: feature frame with the feature columns of ``new_train``.
        lgbm_params: parameter dict passed to ``lgb.train``.

    Returns:
        Tuple ``(roc, pred_value, feature_importance_df, valid)``:
        roc: ROC AUC over the pooled out-of-fold predictions.
        pred_value: mean of the fold models' predictions for ``new_test``.
        feature_importance_df: per-fold importances plus ``Average`` / ``Std`` / ``Cv``.
        valid: 1-D array of out-of-fold predictions concatenated in fold order.
            NOTE(review): this is not the row order of ``new_train`` - confirm that
            callers do not align it with the original rows.
    """
    X_train = new_train.drop(['target'], axis=1).copy()
    y_train = new_train.target.copy()

    # Drop constant columns.
    remove_features = []
    for col in X_train.columns:
        if (X_train[col].std() == 0) and col not in remove_features:
            remove_features.append(col)
    X_train = X_train.drop(remove_features, axis=1)

    # session_id defines the groups and is never used as a model feature.
    groups = np.array(X_train.session_id.values)
    X_test = new_test.drop(remove_features, axis=1).drop("session_id", axis=1)
    models = []

    n_folds = 5
    valid = np.array([])
    real = np.array([])
    features_list = [col for col in X_train.columns if col != "session_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])

    splits = stratified_group_k_fold(X_train, y_train, groups, k=n_folds, seed=12)
    for i, (train_index, test_index) in enumerate(splits):
        print("Fold "+str(i+1))
        # drop() returns new frames; an in-place drop on an iloc slice is avoided.
        X_train2 = X_train.iloc[train_index, :].drop("session_id", axis=1)
        y_train2 = y_train.iloc[train_index]

        X_test2 = X_train.iloc[test_index, :].drop("session_id", axis=1)
        y_test2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=10000, early_stopping_rounds=100, verbose_eval=500,
                        categorical_feature=categoricals)

        valid_predict = clf.predict(X_test2, num_iteration=clf.best_iteration)
        valid = np.concatenate([valid, valid_predict])
        real = np.concatenate([real, y_test2])
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()

        models.append(clf)

    # Columns 1..n_folds hold the per-fold importances.
    fold_importances = feature_importance_df.iloc[:, 1:n_folds+1]
    feature_importance_df["Average"] = np.mean(fold_importances, axis=1)
    feature_importance_df["Std"] = np.std(fold_importances, axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    roc = roc_auc_score(real, valid)
    print("ROC = {}".format(roc))
    print(confusion_matrix(real, np.round(valid)))

    # Average the test predictions of all fold models.
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration=model.best_iteration) / len(models)
    return roc, pred_value, feature_importance_df, valid
    
#roc_sgk1, pred_value, _, valid1 = modelling_sgk(new_train, new_test, lgbm_params)
#roc_sgk2, pred_value2, _, valid2 = modelling_sgk(new_train, new_test, lgbm_params2)
#roc_sgk3, pred_value3, _, valid3 = modelling_sgk(new_train, new_test, lgbm_params3)

In [122]:
# Rank features by the coefficient of variation of their importance across folds
# (lowest first, i.e. most consistent between folds).
feature_importance_df.sort_values("Cv", ascending = True).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Average,Std,Cv
0,yellow_in_pic,101,133,125,88,111.75,18.074499,0.16174
1,categorical_4,45,42,25,40,38.0,7.713624,0.20299
2,name,25,29,40,24,29.5,6.344289,0.215061
3,accumulated_actions,167,416,346,274,300.75,92.106935,0.306257
4,uni_sess_per_pid,256,442,495,238,357.75,112.504167,0.314477
5,max_dist,168,225,320,144,214.25,67.773059,0.316327
6,closest_lon,67,160,122,85,108.5,35.738635,0.329388
7,min_dist,277,680,636,407,500.0,165.28309,0.330566
8,categorical_6,51,154,127,120,113.0,37.980258,0.336108
9,store_lon,98,195,185,79,139.25,51.314593,0.368507


## catboost

In [95]:
# Feature columns regarded as categorical (same list as in the LightGBM section).
categoricals = ['lang', 'OS']

# CatBoost training parameters. The evaluation metric goes under the key
# `eval_metric`; "eval_metrics" and "metrics" are not CatBoost parameter names.
cat_params = {
    'loss_function': 'Logloss',
    'depth': 4,
    "num_boost_round": 100000,
    'learning_rate': 0.01,
    "early_stopping_rounds": 10,
    "eval_metric": 'AUC',
}

def modelling_cat(new_train, new_test, cat_params):
    """Train CatBoost with 10-fold stratified cross-validation and average the fold models.

    Args:
        new_train: feature frame that contains a ``target`` column.
        new_test: feature frame with the feature columns of ``new_train``.
        cat_params: parameter dict passed to the ``CatBoost`` constructor.

    Returns:
        Tuple ``(roc, pred_value, valid)``: ROC AUC of the out-of-fold predictions,
        the mean test prediction of the fold models, and a single-column DataFrame of
        out-of-fold probabilities positioned like the rows of ``new_train``.
    """
    y_train = new_train.target.copy()
    X_train = new_train.drop(['target'], axis=1).copy()

    # Fixed exclusions plus every constant column.
    remove_features = ["session_id", "categorical_1", "categorical_2", "categorical_3"]
    remove_features += [
        col for col in X_train.columns
        if (X_train[col].std() == 0) and col not in remove_features
    ]
    X_train = X_train.drop(remove_features, axis=1)
    X_test = new_test.copy().drop(remove_features, axis=1)

    n_folds = 10
    splitter = StratifiedKFold(n_splits=n_folds)
    fold_models = []

    n_rows = X_train.shape[0]
    valid = pd.DataFrame(np.zeros([n_rows]))
    evals_result = {}
    features_list = list(X_train.columns)
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])

    for fold_no, (fit_idx, holdout_idx) in enumerate(splitter.split(X_train, y_train), start=1):
        print("Fold "+str(fold_no))
        X_fit, y_fit = X_train.iloc[fit_idx, :], y_train.iloc[fit_idx]
        X_holdout, y_holdout = X_train.iloc[holdout_idx, :], y_train.iloc[holdout_idx]

        train_pool = Pool(X_fit, label=y_fit)
        test_pool = Pool(X_holdout, label=y_holdout)
        clf = CatBoost(cat_params)
        clf.fit(train_pool, eval_set=[train_pool, test_pool], use_best_model=True, verbose_eval = 300)

        # Column 1 of the probability output is stored as the positive-class score.
        holdout_proba = clf.predict(X_holdout, prediction_type = "Probability")[:,1]
        valid.iloc[holdout_idx] = holdout_proba.reshape(X_holdout.shape[0], 1)
        #feature_importance_df["Fold_"+str(i+1)] = clf.get_feature_importance()

        fold_models.append(clf)

    #feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    #feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    #feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    roc = roc_auc_score(y_train, valid)
    print("ROC = {}".format(roc))
    print(confusion_matrix(y_train, np.round(valid)))

    # Average the positive-class probabilities of all fold models on the test set.
    pred_value = np.zeros(X_test.shape[0])
    for fitted in fold_models:
        pred_value += fitted.predict(X_test, prediction_type = "Probability")[:,1] / len(fold_models)
    return roc, pred_value, valid
    
#roc_cat, pred_value_cat, valid_cat = modelling_cat(new_train, new_test, cat_params)

In [123]:
# List the columns of the assembled training table.
print(new_train.columns)

Index(['target', 'session_id', 'store_lat', 'store_lon', 'radius', 'type',
       'name', 'OS', 'accumulated_actions', 'categorical_1', 'categorical_2',
       'categorical_3', 'categorical_4', 'categorical_5', 'categorical_6',
       'closest_lat', 'closest_lon', 'day0', 'day1', 'day2', 'day3', 'day4',
       'day5', 'day6', 'in_store', 'lang', 'max_dist', 'min_dist',
       'optout_count', 'std_dist', 'timezone', 'yellow_in_pic',
       'pid_count_enc', 'uni_sess_per_pid'],
      dtype='object')


# submission

In [96]:
# Submit the predictions of the third LightGBM configuration; its CV AUC is
# appended to the output file name.
final_pred = pred_value3
roc = roc3

sample_submission = pd.read_csv(
    "../../../20200125atma/input/atmacup3_sample_submission.csv"
).assign(target=final_pred)

submission_path = "../../../20200125atma/result/atmacup3_sample_submission" + str(roc) + ".csv"
sample_submission.to_csv(submission_path, index=False)

- remove hyperopt, ensemble
- group by session_id and pid when building the features, so that behavior is captured per person and per store