In [5]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
import lightgbm as lgb
import xgboost as xgb
from functools import partial
import copy
import time
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from hyperopt import hp, tpe, Trials, fmin, space_eval
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")
import random
import cv2

In [2]:
train = pd.read_csv("../../../20200125atma/input/train.csv")
test = pd.read_csv("../../../20200125atma/input/test.csv")
userlog = pd.read_csv("../../../20200125atma/input/user_log.csv")
poi = pd.read_csv("../../../20200125atma/input/poi.csv")

In [7]:
list_y = []
for i in range(13411):
    img = cv2.imread('../../../20200125atma/input/images/'+str(i)+'.png')
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)  # HSV 色空間に変換

    red = cv2.inRange(hsv, np.array([145, 70, 0]), np.array([180, 255, 255]))
    yellow = cv2.inRange(hsv, (15,0,0), (36, 255, 255))
    green = cv2.inRange(hsv, np.array([30, 190, 0]), np.array([90, 255, 255]))
    blue = cv2.inRange(hsv, np.array([108, 121, 0]), np.array([120, 255, 255]))
    white = cv2.inRange(hsv, np.array([108, 21, 0]), np.array([255, 70, 255]))

    # 白だけゴミがあるので、収縮演算
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    white = cv2.erode(yellow, kernel)

    bin_imgs = {'red': red, 'yellow': yellow, 'green': green,
            'blue': blue, 'white': white}

    for label, bin_img in bin_imgs.items():
        contours, _ = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        contours = list(filter(lambda cnt: len(cnt) > 30, contours))
        count = len(contours)
    
        if label == "yellow":
            list_y.append(count)

In [8]:
def preprocess(train, test, userlog, poi):
    userlog["sysname"][userlog.sysname == "ANDROID"] = "Android"
    userlog["lang"][userlog.lang == "ja_JP"] = "ja_JP"
    poi = poi.rename(columns={"latitude": "store_lat", "longitude": "store_lon"})
    train = pd.merge(train, poi, on ="pid", how = "left")
    test = pd.merge(test, poi, on ="pid", how = "left")
    userlog = pd.merge(userlog, train[["session_id", "store_lat", "store_lon", "radius"]], on="session_id", how="left")
    userlog = pd.merge(userlog, test[["session_id", "store_lat", "store_lon", "radius"]], on="session_id", how="left")
    userlog["store_lat"] = np.nanmax(userlog[["store_lat_x", "store_lat_y"]], axis=1)
    userlog["store_lon"] = np.nanmax(userlog[["store_lon_x", "store_lon_y"]], axis=1)
    userlog["radius"] = np.nanmax(userlog[["radius_x", "radius_y"]], axis=1)
    drop_features = ["store_lat_x", "store_lat_y", "store_lon_x", "store_lon_y", "radius_x", "radius_y"]
    userlog.drop(drop_features, axis=1, inplace=True)
    userlog["distance"] = np.sqrt((userlog["latitude"]- userlog["store_lat"])**2 + (userlog["longitude"]-userlog["store_lon"])** 2 )
    userlog["time"] = userlog["hour"].map(str) + str(":") + userlog["minute"].map(str) + str(":") + userlog["second"].map(str)
    userlog["time"] = pd.to_datetime(userlog['time'],format= '%H:%M:%S' )
    userlog = userlog.sort_values(["session_id", "hour", "minute", "second"]).reset_index(drop=True)
    userlog["virtual_dis"] = 6370 * np.arccos(np.sin(userlog["latitude"])*np.sin(userlog["store_lat"]) + np.cos(userlog["latitude"])*np.cos(userlog["store_lat"])*np.cos(userlog["longitude"]-userlog["store_lon"]))
    userlog["in_store"] = userlog["virtual_dis"] < userlog["radius"]
    return train, test, userlog, poi
train, test, userlog, poi = preprocess(train, test, userlog, poi)

In [9]:
def encode(train, test, userlog):
    os_list =  sorted(list(set(userlog['sysname'].unique())))
    os_map = dict(zip(os_list, np.arange(len(os_list))))
    userlog["sysname"] = userlog["sysname"].map(os_map)
    
    lang_list =  sorted(list(set(userlog['lang'].unique())))
    lang_map = dict(zip(lang_list, np.arange(len(lang_list))))
    userlog["lang"] = userlog["lang"].map(lang_map)
    
    timezone_list =  sorted(list(set(userlog['timezone'].unique())))
    timezone_map = dict(zip(timezone_list, np.arange(len(timezone_list))))
    userlog["timezone"] = userlog["timezone"].map(timezone_map)

    return train, test, userlog
train, test, userlog = encode(train, test, userlog)

In [None]:
userlog.head(3)

Unnamed: 0,latitude,longitude,sysname,optout,lang,timezone,session_id,hour,minute,second,day_of_week,categorical_1,categorical_2,categorical_3,categorical_4,categorical_5,categorical_6,store_lat,store_lon,radius,distance,time,virtual_dis,in_store
0,35.6503,46.780509,0,0,9,1,0003f26df5d8b928b416fba58efd5c91,0,33,31,6,4,0,44,1.0,1,187,35.651739,46.776134,118.0,0.004606,1900-01-01 00:33:31,15.744307,True
1,35.650298,46.780512,0,0,9,1,0003f26df5d8b928b416fba58efd5c91,0,33,31,6,4,1,44,1.0,1,187,35.651739,46.776134,118.0,0.004609,1900-01-01 00:33:31,15.758882,True
2,35.650286,46.780524,0,0,9,1,0003f26df5d8b928b416fba58efd5c91,2,18,33,6,4,2,44,1.0,1,187,35.651739,46.776134,118.0,0.004624,1900-01-01 02:18:33,15.832117,True


In [10]:
def get_logdata(user_sample):
    day_of_week_counts = {"day"+str(day) : 0 for day in range(7)}

    all_results = []
    features = {"accumulated_actions":0}
    features["accumulated_actions"] = user_sample.shape[0]
    features["session_id"] = user_sample.iloc[0]["session_id"]
    features["OS"] = user_sample.iloc[0]["sysname"]
    features["lang"] = user_sample.iloc[0]["lang"]
    features["timezone"] = user_sample.iloc[0]["timezone"]
    features["optout_count"] = np.sum(user_sample["optout"])
    distance = np.array(user_sample["distance"])
    features["max_dist"] = np.nanmax(distance)
    features["min_dist"] = np.nanmin(distance)
    features["std_dist"] = np.nanstd(distance)
    
    n_of_days = Counter(user_sample['day_of_week']) 
    for key in n_of_days.keys():
        day_of_week_counts["day"+str(key)] += n_of_days[key]
    features.update(day_of_week_counts)
    
    features["in_store"] = np.sum(user_sample["in_store"])
    
    all_results.append(features)
    return all_results

In [11]:
def get_log_info(userlog):
    compiled_log = []

    for i, (ses_id, user_sample) in tqdm(enumerate(userlog.groupby('session_id', sort=False)), total=userlog.session_id.nunique(), desc='session_id', position=0):
        compiled_log += get_logdata(user_sample)
        reduced_log = pd.DataFrame(compiled_log)
    return reduced_log 
reduced_log = get_log_info(userlog)

HBox(children=(IntProgress(value=0, description='session_id', max=5601, style=ProgressStyle(description_width=…




In [17]:
def postprocess(train, test, reduced_log):
    drop_features = ["imid", "pid", "session_id"]
    new_train = pd.merge(train, reduced_log, on ="session_id", how = "left")
    new_test = pd.merge(test, reduced_log, on ="session_id", how = "left")
    new_train["yellow_in_pic"] = list_y[:6612]
    new_test["yellow_in_pic"] = list_y[6612:]
    pid_count_mean = train.groupby('pid').target.count()
    new_train['pid_count_enc'] = new_train['pid'].map(pid_count_mean)
    new_test['pid_count_enc'] = new_test['pid'].map(pid_count_mean)
    new_train.drop(drop_features, axis=1, inplace=True)
    new_test.drop(drop_features, axis=1, inplace=True)
    print(new_train.shape, new_test.shape)
    return new_train, new_test
new_train, new_test = postprocess(train, test, reduced_log)

(6612, 24) (6799, 23)


In [18]:
new_train.head(3)

Unnamed: 0,target,store_lat,store_lon,radius,type,name,OS,accumulated_actions,day0,day1,day2,day3,day4,day5,day6,in_store,lang,max_dist,min_dist,optout_count,std_dist,timezone,yellow_in_pic,pid_count_enc
0,0,36.320751,46.104755,25,2,0,1,187,0,0,0,0,187,0,0,149,7,0.072056,0.000203,0,0.014947,1,4,15
1,0,35.564913,46.744633,51,0,1,0,111,111,0,0,0,0,0,0,111,9,0.005153,0.000704,0,0.00058,1,3,18
2,0,35.693777,46.784288,189,0,1,0,262,0,0,0,0,0,0,262,262,9,0.014393,0.000347,0,0.004819,1,1,11


In [19]:
categoricals = ['lang', 'OS']

lgbm_params = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial','learning_rate': 0.017891320270412462,'max_depth': 5, 'random_seed':42,
 'min_data_in_leaf': 8,'min_sum_hessian_in_leaf': 17,'num_leaves': 17}
    
lgbm_params2 = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial', 'bagging_fraction': 0.6633984686903678, 'bagging_freq': 9,
 'colsample_bytree': 0.9881097320572887,'feature_fraction': 0.6691096601215081,
 'learning_rate': 0.02861754102536491,
 'max_depth': 25,
 'min_data_in_leaf': 101,
 'min_sum_hessian_in_leaf': 9,
 'num_leaves': 59}

lgbm_params3 = {'objective': 'binary','eval_metric': 'auc','metric': 'auc', 'boosting_type': 'gbdt',
 'tree_learner': 'serial', 'bagging_fraction': 0.4522886818593178,
 'bagging_freq': 3,'colsample_bytree': 0.5872391314121622,
 'feature_fraction': 0.9884182286365535,'learning_rate': 0.09482045811322946,
 'max_depth': 24,'min_data_in_leaf': 88,'min_sum_hessian_in_leaf': 13, 'num_leaves': 29}

def modelling(new_train, new_test, lgbm_params):
    X_train = new_train.drop(['target'],axis=1).copy()
    y_train = new_train.target.copy()
    
    remove_features = []
    for i in X_train.columns:
        if (X_train[i].std() == 0) and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    X_test = new_test.copy()
    X_test = X_test.drop(remove_features, axis=1)

    n_folds=5
    skf=StratifiedKFold(n_splits = n_folds)
    models = []
    
    valid = np.array([])
    real = np.array([])
    evals_result = {}
    features_list = [i for i in X_train.columns]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]

        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 500, categorical_feature = categoricals)
        valid_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
        valid = np.concatenate([valid, valid_predict])
        real = np.concatenate([real, y_test2])
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()

        models.append(clf)
        
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    roc = roc_auc_score(real, valid)
    print("ROC = {}".format(roc_auc_score(real, valid)))
    print(confusion_matrix(real, np.round(valid)))
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return roc, pred_value, feature_importance_df
    
roc, pred_value, feature_importance_df = modelling(new_train, new_test, lgbm_params)
roc, pred_value2, feature_importance_df = modelling(new_train, new_test, lgbm_params2)
#roc, pred_value3, _ = modelling(new_train, new_test, lgbm_params3)

Fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[273]	training's auc: 0.904875	valid_1's auc: 0.884045
Fold 2
Training until validation scores don't improve for 100 rounds.
[500]	training's auc: 0.924245	valid_1's auc: 0.834176
Early stopping, best iteration is:
[438]	training's auc: 0.919833	valid_1's auc: 0.837377
Fold 3
Training until validation scores don't improve for 100 rounds.
[500]	training's auc: 0.932811	valid_1's auc: 0.83782
Early stopping, best iteration is:
[471]	training's auc: 0.930832	valid_1's auc: 0.838363
Fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[306]	training's auc: 0.912228	valid_1's auc: 0.852861
Fold 5
Training until validation scores don't improve for 100 rounds.
[500]	training's auc: 0.933116	valid_1's auc: 0.840748
Early stopping, best iteration is:
[784]	training's auc: 0.947573	valid_1's auc: 0.850282
ROC = 0.8486840654654075
[[6478    1

In [20]:
feature_importance_df.sort_values("Cv", ascending = True).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,pid_count_enc,57,106,67,68,109,81.4,21.675793,0.266287
1,type,5,5,11,7,12,8.0,2.966479,0.37081
2,yellow_in_pic,26,47,23,20,55,34.2,14.076931,0.411606
3,min_dist,64,160,74,78,186,112.4,50.365067,0.448088
4,std_dist,28,82,59,36,121,65.2,33.677292,0.516523
5,max_dist,29,111,45,35,124,68.8,40.300868,0.585768
6,radius,32,101,59,52,168,82.4,48.342942,0.586686
7,accumulated_actions,26,80,39,26,108,55.8,32.768277,0.587245
8,in_store,37,112,41,42,148,76.0,45.567532,0.599573
9,lang,1,1,1,2,0,1.0,0.632456,0.632456


In [47]:
#best_feat = list(feature_importance_df[feature_importance_df.Cv <= 1.1]["Feature"])

In [36]:
def my_hyperopt(X, Y):
    def para_tuning_obj(params):
        params = {
        'boosting_type': 'gbdt', 
        'metric': 'auc', 
        'objective': 'binary', 
        'eval_metric': 'auc', 
        "tree_learner": "serial",
        'max_depth': int(params['max_depth']),
        'bagging_freq': int(params['bagging_freq']),
        'bagging_fraction': float(params['bagging_fraction']),
        'num_leaves': int(params['num_leaves']),
        'feature_fraction': float(params['feature_fraction']),
        'learning_rate': float(params['learning_rate']),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'min_sum_hessian_in_leaf': int(params['min_sum_hessian_in_leaf']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
}
    
        real = np.array([])
        pred = np.array([])
        skf = StratifiedKFold(n_splits=5)
        for trn_idx, val_idx in skf.split(X, Y):
            x_train, x_val = X.iloc[trn_idx, :], X.iloc[val_idx, :]
            y_train, y_val = Y.iloc[trn_idx], Y.iloc[val_idx]
            train_set = lgb.Dataset(x_train, y_train)
            val_set = lgb.Dataset(x_val, y_val)
        
            clf = lgb.train(params, train_set, num_boost_round = 100000, early_stopping_rounds = 100, 
                         valid_sets = [train_set, val_set], verbose_eval = 300)
            pred = np.concatenate((pred, np.array(clf.predict(x_val, num_iteration = clf.best_iteration))), axis=0) 
            real = np.concatenate((real, np.array(y_val)), axis=0) 
        score = roc_auc_score(real, pred)
    
        return -score

    trials = Trials()

    space ={
        'max_depth': hp.quniform('max_depth', 1, 30, 1),
        'bagging_freq': hp.quniform('bagging_freq', 1, 10, 1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.2, 1.0),
        'num_leaves': hp.quniform('num_leaves', 8, 128, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.2, 1.0),
        'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 8, 128, 1),
        'min_sum_hessian_in_leaf': hp.quniform('min_sum_hessian_in_leaf', 5, 30, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0)
    }

    best = fmin(para_tuning_obj, space = space, algo=tpe.suggest, max_evals=10, trials=trials, verbose=1)

    best_params = space_eval(space, best)
    return best_params

#X = new_train.drop(['target'],axis=1).copy()
#Y = new_train.target.copy()
#my_hyperopt(X, Y)

In [21]:
print(new_train.columns)

Index(['target', 'store_lat', 'store_lon', 'radius', 'type', 'name', 'OS',
       'accumulated_actions', 'day0', 'day1', 'day2', 'day3', 'day4', 'day5',
       'day6', 'in_store', 'lang', 'max_dist', 'min_dist', 'optout_count',
       'std_dist', 'timezone', 'yellow_in_pic', 'pid_count_enc'],
      dtype='object')


In [23]:
final_pred = (pred_value + pred_value2) / 2
sample_submission = pd.read_csv("../../../20200125atma/input/atmacup3_sample_submission.csv")
sample_submission["target"] = final_pred
sample_submission.to_csv("../../../20200125atma/result/atmacup3_sample_submission"+str(roc)+".csv", index = False)

In [None]:
#user = userlog[userlog.session_id == train.iloc[4854]["session_id"]].copy().reset_index(drop=True)  # train.iloc[0]["session_id"]でのuserlog
#user

# 上位解法

- 3rd
- lightgbm
- pidごとのsession_idのユニーク数がきいた →理由は不明
- 一番近い点にいた時間
- 近い点が何個あったか
- 緯度経度を標準化してから距離を計算した
- stratifiedgroupkfold
- parameterは、データは基本的なものだったので、max_depthは4くらいで浅めにした。
- rank averageとそうでないものの2種類
- 
- 2nd
- cat 8 gbm 2
- session中の最も近づいた時の距離
- 一定の距離にいた時間
- pidに対するtarget_encodingがきいた
- depth4にしただけで、素のモデルで攻めた
- early_stoppingをやめて、固定のエポックでやめる形にした
- dataに同一ユーザが実は同じユーザがいた
- 
- 1st
- 店舗に一番近い、２番目に近いデータの内容を付け加えた
