In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
from tqdm import tqdm
# from sklearn.feature_extraction import FeatureHasher
# from sklearn.metrics.pairwise import cosine_similarity

%config InlineBackend.figure_format = 'retina' # set 'png' here when working on notebook
%matplotlib inline

cache_dir='cache'
flag=True

## 加载数据

In [5]:
# 提取时间结构信息
def make_struct_time(df):
    time_infos = []
    for t in tqdm(df.time_stamp):
        _struct_time = time.strptime(t, '%Y-%m-%d %H:%M')
        time_infos.append(
            [_struct_time.tm_hour + _struct_time.tm_min / 60., _struct_time.tm_hour, _struct_time.tm_wday])
    time_infos = pd.DataFrame(time_infos, columns=['time', 'hour', 'wday'])
    df = pd.concat((df, time_infos), axis=1)
    return df

# 按照强度降序排列wifi_bssid
def make_wifi_sorted(df, how='max', cache=False):
    cache_path = os.path.join(cache_dir, '加载数据', 'wifi_sorted_%d_%s.hdf' % (df.shape[0], how))
    if os.path.exists(cache_path) & flag & cache:
        result = pd.read_hdf(cache_path, 'data')
    else:
        print('wifi_bssid %s排序中...' % how)
        def _sort_wifi(wifi_infos):
            d = {}
            for i in wifi_infos.split(';'):
                infos = i.split('|')[:2]  # ['bssid','strength']
                strength = int(infos[1])
                if infos[0] not in d:
                    d.update({infos[0]: [strength]})
                else:
                    d[infos[0]].append(strength)
            _agg = (lambda x: np.mean(x)) if how == 'mean' else (lambda x: np.max(x))
            _serialize = lambda x: ';'.join(['%s|%d' % (i[0], i[1]) for i in x])  # 序列化
            d = [(i[0], _agg(i[1])) for i in d.items()]
            return _serialize(sorted(d, key=lambda x: x[1], reverse=True))
        result = df.wifi_infos.apply(_sort_wifi)
        if cache:
            result.to_hdf(cache_path, 'data')
    def _deserialize(x):
        # 反序列化
        result = []
        for i in x.split(';'):
            ls = i.split('|')
            ls[1] = int(ls[1])
            result.append(ls)
        return result
    df['wifi_sorted_%s' % how] = result.apply(_deserialize)
    return df

# 加载数据
def load_data(file_path):
    df = pd.read_csv(file_path)
    df = make_struct_time(df)
    df = make_wifi_sorted(df, 'max', True)
    return df

meta_df = pd.read_csv('data/训练数据-ccf_first_round_shop_info.csv')
train_df = load_data('data/训练数据-ccf_first_round_user_shop_behavior.csv')
test_df = load_data('data/AB榜测试集-evaluation_public.csv')

# 加入商铺信息（测试集此时还没有候选集，没有shop_id）
train_df = pd.merge(train_df, meta_df[['shop_id', 'mall_id', 'category_id', 'price']], on='shop_id', how='left')
train_df['row_id'] = train_df.index


100%|██████████| 1138015/1138015 [00:15<00:00, 74165.40it/s]
100%|██████████| 483931/483931 [00:08<00:00, 56967.26it/s]


In [6]:
# 构造线下验证集
x_train=train_df[train_df.time_stamp<'2017-08-30 00:00']
x_val=train_df[train_df.time_stamp>='2017-08-30 00:00']

In [7]:
# 统计每个商场出现的wifi，并筛选掉出现在多个商场的wifi，如ChinaNet
mall_wifi=train_df.groupby('mall_id',as_index=False)['wifi_infos'].agg(lambda x:';'.join(x.wifi_infos))
ls=[]
for i in tqdm(mall_wifi.values):
    ls+=[(i[0],j.split('|')[0]) for j in i[1].split(';')]
mall_wifi=pd.DataFrame(ls,columns=['mall_id','wifi'])
wifi_mall_count=mall_wifi.groupby('wifi',as_index=False)['mall_id'].agg({'_count':lambda x:len(x.unique())})
wifi_mall_count.groupby('_count',as_index=False).count()

100%|██████████| 97/97 [00:07<00:00, 13.01it/s]


Unnamed: 0,_count,wifi
0,1,395574
1,2,3966
2,3,103
3,4,18
4,5,4
5,6,5
6,7,3
7,9,1
8,10,1
9,14,3


In [8]:
# 筛选掉出现在多个商场中的wifi
def wifi_filter(wifi_infos):
    result=pd.DataFrame([i.split('|')[:2] for i in wifi_infos.split(';')],columns=['wifi','strength'])
    result=pd.merge(result,wifi_mall_count[wifi_mall_count._count>3],on='wifi',how='left')
    result=result[result._count.isnull()]
    return ';'.join(['|'.join(i) for i in result[['wifi','strength']].values])

# 整合每个店铺对应的wi-fi信息
def make_aggregated_wifi(df):
    cache_path=os.path.join(cache_dir,'加载数据','aggregate_wifi_%d.hdf'% df.shape[0])
    if os.path.exists(cache_path) & flag:
        result=pd.read_hdf(cache_path,'data')
    else:
        result=df[['shop_id','wifi_infos']].groupby('shop_id',as_index=False).agg(lambda x:';'.join(x)) # x:pd.Series
        print('筛选wifi...')
        result.wifi_infos=result.wifi_infos.apply(wifi_filter)
        result=make_wifi_sorted(result,'max')
        result=make_wifi_sorted(result,'mean')
        result.to_hdf(cache_path,'data')
    return result

train_shop_wifi=make_aggregated_wifi(x_train)
test_shop_wifi=make_aggregated_wifi(train_df)

In [9]:
x_train.head()

Unnamed: 0,user_id,shop_id,time_stamp,longitude,latitude,wifi_infos,time,hour,wday,wifi_sorted_max,mall_id,category_id,price,row_id
0,u_376,s_2871718,2017-08-06 21:20,122.308291,32.08804,b_6396480|-67|false;b_41124514|-86|false;b_287...,21.333333,21,6,"[[b_6396479, -55], [b_5857369, -55], [b_639648...",m_1409,c_38,42,0
1,u_376,s_2871718,2017-08-06 21:20,122.308162,32.08797,b_6396480|-67|false;b_56328155|-73|false;b_411...,21.333333,21,6,"[[b_6396479, -57], [b_5857369, -57], [b_639648...",m_1409,c_38,42,1
2,u_1041,s_181637,2017-08-02 13:10,117.365255,40.638214,b_8006367|-78|false;b_2485110|-52|false;b_3005...,13.166667,13,2,"[[b_2485110, -52], [b_35013153, -56], [b_33503...",m_4079,c_30,40,2
3,u_1158,s_609470,2017-08-13 12:30,121.134451,31.197416,b_26250579|-73|false;b_26250580|-64|false;b_26...,12.5,12,6,"[[b_30424471, -60], [b_26250580, -64], [b_2625...",m_6587,c_27,49,3
4,u_1654,s_3816766,2017-08-25 19:50,122.255867,31.35132,b_39004150|-66|false;b_39004148|-58|false;b_21...,19.833333,19,4,"[[b_39004148, -58], [b_39004150, -66], [b_1845...",m_3005,c_6,62,4


## 构造候选shop_id集

In [10]:
# 利用多分类模型的结果构造二分类模型的候选集
def make_candidate_shops_by_multi(cache_suffix, nb_candidates=5):
    candidates = []
    probas = []
    for mall_id in tqdm(meta_df.mall_id.unique()):
        fname = os.path.join(cache_dir, 'multi', '{}_{}.hdf'.format(mall_id, cache_suffix))
        cache_file = pd.read_hdf(fname, 'data')
        for idx, row in cache_file.iterrows():
            for i in row[-nb_candidates:]:
                candidates.append((idx, i))
        cache_file = pd.read_hdf(fname, 'proba')
        probas += list(cache_file.values[:, -nb_candidates:].flatten())
    #         for idx,row in cache_file.iterrows():
    #             probas+=list(row[-nb_candidates:])
    result = pd.DataFrame(candidates, columns=['row_id', 'shop_id'])
    result['proba'] = probas
    return result

train_candidate = make_candidate_shops_by_multi('train', nb_candidates=5)
val_candidate = make_candidate_shops_by_multi('val', nb_candidates=7)
test_candidate = make_candidate_shops_by_multi('test', nb_candidates=7)


100%|██████████| 97/97 [02:49<00:00,  1.74s/it]
100%|██████████| 97/97 [00:12<00:00,  7.51it/s]
100%|██████████| 97/97 [01:17<00:00,  1.25it/s]


In [11]:
# 计算覆盖率
print('train:%s'%(pd.merge(train_candidate,x_train[['shop_id','row_id']],on=['shop_id','row_id'],how='inner').shape[0]/x_train.shape[0]))
print('val:%s'%(pd.merge(val_candidate,x_val[['shop_id','row_id']],on=['shop_id','row_id'],how='inner').shape[0]/x_val.shape[0]))
# 7:0.9773179783692518
# 5:0.9723483219445374

train:0.9971196967357803
val:0.9780984692821076


## 构造训练/测试基础样本

In [12]:
train_basic=pd.merge(x_train,train_candidate,on='row_id',how='left')
label=(train_basic['shop_id_x']==train_basic['shop_id_y'])*1
train_basic.drop('shop_id_x',axis=1,inplace=True)
train_basic.rename(columns={'shop_id_y':'shop_id'},inplace=True)
train_basic=pd.merge(train_basic,train_shop_wifi,on='shop_id',how='left')

In [13]:
val_basic=pd.merge(x_val,val_candidate,on='row_id',how='left')
val_label=(val_basic['shop_id_x']==val_basic['shop_id_y'])*1
val_basic.drop('shop_id_x',axis=1,inplace=True)
val_basic.rename(columns={'shop_id_y':'shop_id'},inplace=True)
val_basic=pd.merge(val_basic,train_shop_wifi,on='shop_id',how='left')

In [14]:
test_basic=pd.merge(test_df,test_candidate,on='row_id',how='left')
test_basic=pd.merge(test_basic,test_shop_wifi,on='shop_id',how='left')
test_basic=pd.merge(test_basic,meta_df[['shop_id','category_id','price']],on='shop_id',how='left')

In [15]:
train_basic.head()

Unnamed: 0,user_id,time_stamp,longitude,latitude,wifi_infos_x,time,hour,wday,wifi_sorted_max_x,mall_id,category_id,price,row_id,shop_id,proba,wifi_infos_y,wifi_sorted_max_y,wifi_sorted_mean
0,u_376,2017-08-06 21:20,122.308291,32.08804,b_6396480|-67|false;b_41124514|-86|false;b_287...,21.333333,21,6,"[[b_6396479, -55], [b_5857369, -55], [b_639648...",m_1409,c_38,42,0,s_417478,0.000569,b_2181057|-61;b_11645233|-80;b_1769723|-82;b_1...,"[[b_43477317, -24], [b_2180879, -33], [b_22047...","[[b_43477317, -27], [b_22047137, -35], [b_2279..."
1,u_376,2017-08-06 21:20,122.308291,32.08804,b_6396480|-67|false;b_41124514|-86|false;b_287...,21.333333,21,6,"[[b_6396479, -55], [b_5857369, -55], [b_639648...",m_1409,c_38,42,0,s_3699,0.000595,b_56326189|-61;b_30230712|-62;b_25137940|-51;b...,"[[b_30230709, -15], [b_30230710, -25], [b_5632...","[[b_35548532, -32], [b_41781150, -35], [b_1877..."
2,u_376,2017-08-06 21:20,122.308291,32.08804,b_6396480|-67|false;b_41124514|-86|false;b_287...,21.333333,21,6,"[[b_6396479, -55], [b_5857369, -55], [b_639648...",m_1409,c_38,42,0,s_43525,0.000723,b_45567231|-79;b_1084756|-56;b_31131459|-63;b_...,"[[b_56323889, -27], [b_56323890, -27], [b_1035...","[[b_56323890, -41], [b_10657410, -42], [b_2212..."
3,u_376,2017-08-06 21:20,122.308291,32.08804,b_6396480|-67|false;b_41124514|-86|false;b_287...,21.333333,21,6,"[[b_6396479, -55], [b_5857369, -55], [b_639648...",m_1409,c_38,42,0,s_580441,0.002164,b_32053319|-35;b_56326644|-49;b_25137941|-85;b...,"[[b_15278309, -23], [b_32053319, -27], [b_5632...","[[b_15278309, -31], [b_32053319, -38], [b_5632..."
4,u_376,2017-08-06 21:20,122.308291,32.08804,b_6396480|-67|false;b_41124514|-86|false;b_287...,21.333333,21,6,"[[b_6396479, -55], [b_5857369, -55], [b_639648...",m_1409,c_38,42,0,s_2871718,0.979752,b_6396480|-67;b_41124514|-86;b_28723327|-90;b_...,"[[b_26748506, -26], [b_6396479, -43], [b_58573...","[[b_50744804, -49], [b_5857369, -53], [b_63964..."


In [20]:
#构建排序的shop_wifi dict
shop_info=pd.read_csv('data/shop_info.csv')
shop_info=shop_info[['shop_id','wifi_avgdis_shop']].values
shop_info_dict={}
for i in tqdm(range(shop_info.shape[0])):
    shop_wifi_dict={}
    avg_wifis=eval(shop_info[i,1])
    if type(avg_wifis)!=int:
        sorted(avg_wifis.items(),key=lambda x:x[1],reverse=True)
        lenwifi=len(avg_wifis.keys())
        for rank,k in enumerate(avg_wifis):
            #归一化到0~9
            if lenwifi==1:
                shop_wifi_dict[k]=0
            else:
                shop_wifi_dict[k]=(rank/(lenwifi-1))*9
    shop_info_dict[shop_info[i,0]]=shop_wifi_dict

100%|██████████| 8477/8477 [00:03<00:00, 2230.78it/s]


## 加入特征

In [23]:
def apk(actual, predicted, k=10, on_actual=True):
    """
    actual : A list of elements that are to be predicted (order doesn't matter)
    predicted : A list of predicted elements (order does matter)
    """
    if len(predicted)>k:
        predicted = predicted[:k]
    if on_actual and len(actual)>k:
        actual = actual[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def add_shop_count(df1,df2):
    shop_count=df1.groupby('shop_id',as_index=False)['shop_id'].agg({'shop_count':'count'})
    result=pd.merge(df2,shop_count,on='shop_id',how='left')
    return result

def add_mall_heat(df1,df2):
    mall_heat=df1.groupby('mall_id',as_index=False)['mall_id'].agg({'mall_heat':'count'})
    result=pd.merge(df2,mall_heat,on='mall_id',how='left')
    return result

def add_user_power(df1,df2):
    user_power=df1.groupby(['user_id'],as_index=False)['price'].agg({'mean_power':'mean'})
    result=pd.merge(df2,user_power,on=['user_id'],how='left')
    return result

def add_category_time(df1,df2):
    category_time=df1[['category_id','time']].groupby(['category_id'],as_index=False)['time'].agg({'category_time':'mean'})
    result=pd.merge(df2,category_time,on='category_id',how='left')
    return result

def add_wifi_apk(df,k=10,on_actual=True,how='max'):
    name='wifi_apk_{}_{}_{}'.format(how,k,on_actual*1)
    cache_path=os.path.join(cache_dir,'构造特征/wifi_apk','{}_{}.hdf'.format(name,df.shape[0]))
    if os.path.exists(cache_path) & flag:
        result=pd.read_hdf(cache_path,'data')
    else:
        apks=[]
        col='wifi_sorted_max_y' if how=='max' else 'wifi_sorted_mean'
        _list=lambda x:[i[0] for i in x[:k]]
        for i in tqdm(df[['wifi_sorted_max_x',col]].values):
            if type(i[0])==list and type(i[1])==list:
                apks.append(apk(_list(i[1]),_list(i[0]),k,on_actual))
            else:
                apks.append(0)
        result=pd.Series(apks)
        result.to_hdf(cache_path,'data')
    df[name]=result
    return df

def add_wifi_rank_diff(df,k=10,how='max'):
    rank_list=[]
    for j in range(k):
        rank_list.append('wifi_rank_diff_'+str(j))
    result=pd.DataFrame(columns=rank_list,index=np.arange(df.shape[0])).values
    row=0
    _list0=lambda x:[i[0] for i in x[:k]]
    for i in tqdm(df[['wifi_sorted_max_x','shop_id']].values):
        w0=_list0(i[0])
        for index,w in enumerate(w0):
            if w in shop_info_dict[i[1]]:
                result[row,index]=abs(index-shop_info_dict[i[1]][w])
        row+=1
    result=pd.DataFrame(result,columns=rank_list,index=np.arange(df.shape[0]))
    df=pd.concat((df,result),axis=1)
    return df

def add_wifi_diff(df,how='max'):
    cache_path=os.path.join(cache_dir,'构造特征/wifi_diff','wifi_diff_{}_{}.hdf'.format(how,df.shape[0]))
    if os.path.exists(cache_path) & flag:
        result=pd.read_hdf(cache_path,'data')
    else:
        diffs=[]
        col='wifi_sorted_max_y' if how=='max' else 'wifi_sorted_mean'
        for i in tqdm(df[['wifi_sorted_max_x',col]].values):
            if type(i[0])==list and type(i[1])==list:
                large_sum=large_num=less_sum=less_num=0
                d1=dict(i[0])
                d2=dict(i[1])
                intersection_keys=d1.keys()&d2.keys()
                for key in intersection_keys:
                    value1=d1[key]
                    value2=d2[key]
                    if value1>=value2:
                        large_sum+=(value1-value2) # 越大越好
                        large_num+=1
                    else:
                        less_sum+=(value2-value1) # 越小越好
                        less_num+=1
                large_sum=large_sum/large_num if large_num>0 else 0
                less_sum=less_sum/less_num if less_num>0 else 0
                diffs.append([large_sum,large_num,less_sum,less_num])
            else:
                diffs.append([0]*4)
        cols=['{}_{}'.format(i,how) for i in ['large_sum','large_num','less_sum','less_num']]
        result=pd.DataFrame(diffs,columns=cols)
        result.to_hdf(cache_path,'data')
    df=pd.concat((df,result),axis=1)
    return df

def add_wifi_rfd(df,how='max'):
    cache_path=os.path.join(cache_dir,'构造特征/wifi_rfd','wifi_rfd_{}_{}.hdf'.format(how,df.shape[0]))
    if os.path.exists(cache_path) & flag:
        result=pd.read_hdf(cache_path,'data')
    else:
        rfds=[]
        col='wifi_sorted_max_y' if how=='max' else 'wifi_sorted_mean'
        _set=lambda x:set([i[0] for i in x])
        for i in tqdm(df[['wifi_sorted_max_x',col]].values):
            if type(i[0])==list and type(i[1])==list:
                d1=dict(i[0])
                d2=dict(i[1])
                intersection_keys=d1.keys()&d2.keys()
                nb_intersection=len(intersection_keys) # 交集数
                if nb_intersection>0:
                    nb_union=len(d1.keys()|d2.keys()) # 并集数
                    jaccard=np.log1p(nb_intersection/nb_union)
                    l1=l2=0
                    for key in intersection_keys:
                        value1=d1[key]
                        value2=d2[key]
                        l1+=np.abs(value1-value2)
                        l2+=(value1-value2)**2
                    p=1 # Jaccard系数
                    rfd1=l1/(nb_intersection+p*jaccard)
                    rfd2=np.sqrt(l2)/(nb_intersection+p*jaccard)
                    rfds.append([rfd1,rfd2])
                    continue
            rfds.append([111111]*2) # 最大距离
        cols=['{}_{}'.format(i,how) for i in ['rfd1','rfd2']]
        result=pd.DataFrame(rfds,columns=cols)
        result.to_hdf(cache_path,'data')
    df=pd.concat((df,result),axis=1)
    return df

# 计算两点之间距离
def cal_distance(lat1, lon1, lat2, lon2):
    dx = np.abs(lon1 - lon2)  # 经度差
    dy = np.abs(lat1 - lat2)  # 维度差
    b = (lat1 + lat2) / 2.0
    Lx = 6371004.0 * (dx / 57.2958) * np.cos(b / 57.2958)
    Ly = 6371004.0 * (dy / 57.2958)
    L = (Lx ** 2 + Ly ** 2) ** 0.5
    return L

def add_distance(df):
    result=pd.merge(df,meta_df[['shop_id','longitude','latitude']],on='shop_id',how='left')
    result['distance']=cal_distance(result['latitude_x'],result['longitude_x'],result['latitude_y'],result['longitude_y'])
    return result

def add_feats(df1,df2):
    result=add_shop_count(df1,df2)
    result=add_wifi_rank_diff(result,k=10,how='mean')
    result=add_mall_heat(df1,result)
    result=add_distance(result)
    result=add_user_power(df1,result)
    result=add_category_time(df1,result)
    result=add_wifi_apk(result,k=4,how='max')
    result=add_wifi_apk(result,k=10,how='max')
    result=add_wifi_diff(result,how='mean')
    result=add_wifi_rfd(result,how='mean')
    result['mean_power'].fillna(np.median(meta_df.price),inplace=True)
#     result.fillna(0,inplace=True)
    return result

flag=True
print('加入训练集特征...')
train_feat=add_feats(x_train,train_basic)
print('加入测试集特征...')
test_feat=add_feats(train_df,test_basic)
print('加入验证集特征...')
val_feat=add_feats(x_train,val_basic)

加入训练集特征...


100%|██████████| 5376170/5376170 [00:45<00:00, 119146.16it/s]


加入测试集特征...


100%|██████████| 3387517/3387517 [00:29<00:00, 115999.04it/s]


加入验证集特征...


100%|██████████| 439467/439467 [00:03<00:00, 120689.60it/s]


In [28]:
print(list(train_feat.columns))

['user_id', 'time_stamp', 'longitude_x', 'latitude_x', 'wifi_infos_x', 'time', 'hour', 'wday', 'wifi_sorted_max_x', 'mall_id', 'category_id', 'price', 'row_id', 'shop_id', 'proba', 'wifi_infos_y', 'wifi_sorted_max_y', 'wifi_sorted_mean', 'shop_count', 'wifi_rank_diff_0', 'wifi_rank_diff_1', 'wifi_rank_diff_2', 'wifi_rank_diff_3', 'wifi_rank_diff_4', 'wifi_rank_diff_5', 'wifi_rank_diff_6', 'wifi_rank_diff_7', 'wifi_rank_diff_8', 'wifi_rank_diff_9', 'mall_heat', 'longitude_y', 'latitude_y', 'distance', 'mean_power', 'category_time', 'wifi_apk_max_4_1', 'wifi_apk_max_10_1', 'large_sum_mean', 'large_num_mean', 'less_sum_mean', 'less_num_mean', 'rfd1_mean', 'rfd2_mean']


In [30]:
# 后处理
feat_cols=['time_gap','mean_power_gap','shop_heat']
feat_cols+=['wifi_apk_{}_{}'.format(how,i) for i in ['4_1','10_1'] for how in ['max']]
feat_cols+=['{}_{}'.format(i,how) for i in ['large_sum','large_num','less_sum','less_num'] for how in ['mean']]
feat_cols+=['{}_{}'.format(i,how) for i in ['rfd1','rfd2'] for how in ['mean']]
feat_cols+=['distance','proba']
feat_cols+=['wifi_rank_diff_{}'.format(i) for i in np.arange(10)]
print(feat_cols)

def post_precess(df):
    df['shop_heat']=df['shop_count']/df['mall_heat']
    df['mean_power_gap']=df['price']-df['mean_power']
    df['time_gap']=np.abs(df['time']-df['category_time'])
    result=df[feat_cols]
#     result.fillna(0,inplace=True)
    return result

train=post_precess(train_feat)
test=post_precess(test_feat)
val=post_precess(val_feat)

['time_gap', 'mean_power_gap', 'shop_heat', 'wifi_apk_max_4_1', 'wifi_apk_max_10_1', 'large_sum_mean', 'large_num_mean', 'less_sum_mean', 'less_num_mean', 'rfd1_mean', 'rfd2_mean', 'distance', 'proba', 'wifi_rank_diff_0', 'wifi_rank_diff_1', 'wifi_rank_diff_2', 'wifi_rank_diff_3', 'wifi_rank_diff_4', 'wifi_rank_diff_5', 'wifi_rank_diff_6', 'wifi_rank_diff_7', 'wifi_rank_diff_8', 'wifi_rank_diff_9']


## 模型训练

In [60]:
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb

params = {
    'objective': ['binary'],
    'metric': ['binary_logloss'],
    'learning_rate':[0.05],
    'feature_fraction': [0.6],
    'max_depth': [12],
    'num_leaves':[140], # 1024效果略微更好
    'bagging_fraction': [0.8],
    'bagging_freq':[5],
    'min_data_in_leaf':[10],
    'min_gain_to_split':[0],
    'lambda_l1':[1],
    'lambda_l2':[1],
    'verbose':[0],
    'is_unbalance':[True]
}
params=list(ParameterGrid(params))

In [61]:
# ll=[]
mall_list=[]
iteration_list=[]
for mall_id in tqdm(meta_df.mall_id.unique()):
    lgbtrain=lgb.Dataset(train[train_feat.mall_id==mall_id].values,label[train_feat.mall_id==mall_id],feature_name=feat_cols)
    lgbeval = lgb.Dataset(val[val_feat.mall_id==mall_id].values, val_label[val_feat.mall_id==mall_id], reference=lgbtrain, feature_name=feat_cols)
    model = lgb.train(params[0], lgbtrain, valid_sets=lgbeval,early_stopping_rounds=50,num_boost_round=500)
    # 测试
#     proba=model.predict(test[test_feat.mall_id==mall_id])
#     ans=pd.DataFrame({'row_id':test_basic.loc[test_basic.mall_id==mall_id,'row_id'],'shop_id':test_basic.loc[test_basic.mall_id==mall_id,'shop_id'],'proba':proba})
#     tmp=ans.sort_values('proba').groupby('row_id',as_index=False).tail(1)
#     tmp=pd.merge(test_df.loc[test_df.mall_id==mall_id,['row_id']],tmp[['row_id','shop_id']],on='row_id',how='left')
#     tmp.fillna('s_4941',inplace=True)
    # 验证
    proba=model.predict(val[val_feat.mall_id==mall_id].values)
    ans=pd.DataFrame({'row_id':val_basic.loc[val_basic.mall_id==mall_id,'row_id'],'shop_id':val_basic.loc[val_basic.mall_id==mall_id,'shop_id'],'proba':proba})
    tmp=ans.sort_values('proba').groupby('row_id',as_index=False).tail(1)
    tmp=pd.merge(x_val.loc[x_val.mall_id==mall_id,['row_id']],tmp[['row_id','shop_id']],on='row_id',how='left')
    tmp.fillna('s_4941',inplace=True)
    ll.append(tmp)
    mall_list.append(mall_id)
    iteration_list.append(model.best_iteration)
result_mall=pd.DataFrame([mall_list,iteration_list],index=['mall_id','best_iteration'])
result_mall=result_mall.transpose()
result_mall.to_csv('best_iterations.csv')


  0%|          | 0/97 [00:00<?, ?it/s][A

[1]	valid_0's binary_logloss: 0.647036
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.612906



Exception in thread Thread-25:
Traceback (most recent call last):
  File "/root/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/root/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/root/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



[3]	valid_0's binary_logloss: 0.580841
[4]	valid_0's binary_logloss: 0.54549
[5]	valid_0's binary_logloss: 0.520416
[6]	valid_0's binary_logloss: 0.497361
[7]	valid_0's binary_logloss: 0.468792
[8]	valid_0's binary_logloss: 0.442513
[9]	valid_0's binary_logloss: 0.418253
[10]	valid_0's binary_logloss: 0.395806
[11]	valid_0's binary_logloss: 0.379134
[12]	valid_0's binary_logloss: 0.364229
[13]	valid_0's binary_logloss: 0.345662
[14]	valid_0's binary_logloss: 0.328502
[15]	valid_0's binary_logloss: 0.312433
[16]	valid_0's binary_logloss: 0.297479
[17]	valid_0's binary_logloss: 0.283145
[18]	valid_0's binary_logloss: 0.270039
[19]	valid_0's binary_logloss: 0.257693
[20]	valid_0's binary_logloss: 0.248794
[21]	valid_0's binary_logloss: 0.237587
[22]	valid_0's binary_logloss: 0.22988
[23]	valid_0's binary_logloss: 0.220116
[24]	valid_0's binary_logloss: 0.213044
[25]	valid_0's binary_logloss: 0.204233
[26]	valid_0's binary_logloss: 0.199759
[27]	valid_0's binary_logloss: 0.191517
[28]	vali

  1%|          | 1/97 [08:16<13:15:10, 496.98s/it]

[1]	valid_0's binary_logloss: 0.646252
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.607676
[3]	valid_0's binary_logloss: 0.571868
[4]	valid_0's binary_logloss: 0.536105
[5]	valid_0's binary_logloss: 0.509062
[6]	valid_0's binary_logloss: 0.482713
[7]	valid_0's binary_logloss: 0.454207
[8]	valid_0's binary_logloss: 0.428014
[9]	valid_0's binary_logloss: 0.403782
[10]	valid_0's binary_logloss: 0.381307
[11]	valid_0's binary_logloss: 0.362434
[12]	valid_0's binary_logloss: 0.345212
[13]	valid_0's binary_logloss: 0.326763
[14]	valid_0's binary_logloss: 0.309805
[15]	valid_0's binary_logloss: 0.293796
[16]	valid_0's binary_logloss: 0.278903
[17]	valid_0's binary_logloss: 0.264952
[18]	valid_0's binary_logloss: 0.25198
[19]	valid_0's binary_logloss: 0.23981
[20]	valid_0's binary_logloss: 0.229506
[21]	valid_0's binary_logloss: 0.218699
[22]	valid_0's binary_logloss: 0.209743
[23]	valid_0's binary_logloss: 0.200221
[24]	valid_0's binary_logloss

  2%|▏         | 2/97 [14:36<11:34:10, 438.42s/it]

[1]	valid_0's binary_logloss: 0.646266
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.609029
[3]	valid_0's binary_logloss: 0.573349
[4]	valid_0's binary_logloss: 0.537362
[5]	valid_0's binary_logloss: 0.509764
[6]	valid_0's binary_logloss: 0.483272
[7]	valid_0's binary_logloss: 0.454793
[8]	valid_0's binary_logloss: 0.428624
[9]	valid_0's binary_logloss: 0.404384
[10]	valid_0's binary_logloss: 0.381959
[11]	valid_0's binary_logloss: 0.363535
[12]	valid_0's binary_logloss: 0.346817
[13]	valid_0's binary_logloss: 0.328373
[14]	valid_0's binary_logloss: 0.311246
[15]	valid_0's binary_logloss: 0.295188
[16]	valid_0's binary_logloss: 0.280261
[17]	valid_0's binary_logloss: 0.266266
[18]	valid_0's binary_logloss: 0.253262
[19]	valid_0's binary_logloss: 0.241006
[20]	valid_0's binary_logloss: 0.23111
[21]	valid_0's binary_logloss: 0.220268
[22]	valid_0's binary_logloss: 0.211737
[23]	valid_0's binary_logloss: 0.202047
[24]	valid_0's binary_loglos




KeyboardInterrupt: 

In [None]:
# 验证集acc
pd.merge(x_val[['row_id','shop_id']],pd.concat(ll),on=['row_id','shop_id'],how='inner').shape[0]/x_val.shape[0] 
# raw:0.9140663576559788  0.9135725776907025
# raw+proba:0.9151017027444609
# only proba:0.907  +rfd:0.9141937847437919 +apk:0.9158981220432934

In [None]:
# 特征重要程度
sns.set({'figure.figsize':(16,8)})
sns.barplot(data=pd.DataFrame({'feat_name':train.columns,'feat_importance':model.feature_importance()}),x='feat_importance',y='feat_name')

In [17]:
# 保存测试集结果
result=pd.concat(ll)
print(result.shape)
result.to_csv('result.csv',index=False,sep=',')

(483931, 2)
