In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import pickle, os, re, operator, gc
from tqdm import tqdm
from multiprocessing import Pool
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
# import lightgbm as lgb
import xgboost as xgb

In [5]:
# load datasets
df_shopinfo = pd.read_csv('../data/shopInfo.txt', delimiter='\t')
df_shopcat = pd.read_csv('../data/shopCategoryInfo.txt', delimiter='\t')
df_trainset = pd.read_csv('../data/trainSampleInfo.txt', delimiter='\t')
df_testset = pd.read_csv('../data/testSampleInfo.txt', delimiter='\t')

df_shopinfo = df_shopinfo.merge(df_shopcat, on='category_id', how='left')
train = df_trainset.merge(df_shopinfo, how='left', on='shop_id')
train['row_id'] = train.index

# test = df_testset.merge(df_shopinfo, how='left', on='shop_id')
# test['row_id'] = test.index

In [7]:
train.head()

Unnamed: 0,shop_id,longitude_x,latitude_x,wifi_infos,build_id,category_id,layer,hot,longitude_y,latitude_y,category_name,row_id
0,-1,117.086205,39.393679,"M0000387352,-79;M0000599183,-80;M0000378675,-8...",,,,,,,,0
1,S03097,119.357683,26.043137,"M0000192366,-86;M0000736040,-71;M0000787437,-7...",B00098,C00004,1.0,7.0,119.357681,26.043318,购物,1
2,S01259,120.138415,30.277816,"M0000523931,-80;M0000284055,-83;M0000229302,-8...",,C00004,1000.0,10.0,120.143157,30.275619,购物,2
3,-1,117.238163,31.813781,"M0000928623,-47;M0000033605,-48;M0000394309,-5...",,,,,,,,3
4,S00453,114.580855,30.660663,"M0000925767,-35;M0000446747,-36;M0000420052,-4...",,C00006,1000.0,7.0,114.585922,30.657966,生活服务,4


In [8]:
build_id = 'B00098'

In [9]:
sub_train = train[train.build_id == build_id]

In [10]:
sub_train

Unnamed: 0,shop_id,longitude_x,latitude_x,wifi_infos,build_id,category_id,layer,hot,longitude_y,latitude_y,category_name,row_id
1,S03097,119.357683,26.043137,"M0000192366,-86;M0000736040,-71;M0000787437,-7...",B00098,C00004,1.0,7.0,119.357681,26.043318,购物,1
1122,S03054,119.357949,26.042583,"M0000206579,-72;M0000010195,-73;M0000920385,-7...",B00098,C00004,2.0,4.0,119.357457,26.042785,购物,1122
1459,S03054,119.357094,26.042793,"M0000778589,-65;M0000206579,-68;M0000104914,-7...",B00098,C00004,2.0,4.0,119.357457,26.042785,购物,1459
1608,S03097,119.357854,26.043286,"M0000911235,-58;M0000923071,-59;M0000789568,-6...",B00098,C00004,1.0,7.0,119.357681,26.043318,购物,1608
1765,S03097,119.357745,26.043318,"M0000562185,-49;M0000630779,-68;M0000075828,-7...",B00098,C00004,1.0,7.0,119.357681,26.043318,购物,1765
...,...,...,...,...,...,...,...,...,...,...,...,...
1201843,S03054,119.357450,26.042749,"M0000398830,-73;M0000504132,-77;M0000627902,-7...",B00098,C00004,2.0,4.0,119.357457,26.042785,购物,1201843
1202697,S03097,119.357781,26.043496,"M0000562185,-48;M0000630779,-68;M0000075828,-6...",B00098,C00004,1.0,7.0,119.357681,26.043318,购物,1202697
1203801,S03054,119.357350,26.043063,"M0000206579,-55;M0000772175,-57;M0000010195,-6...",B00098,C00004,2.0,4.0,119.357457,26.042785,购物,1203801
1204490,S03054,119.357595,26.042855,"M0000670552,-72;M0000770093,-73;M0000065922,-7...",B00098,C00004,2.0,4.0,119.357457,26.042785,购物,1204490


In [11]:
train_set = []
for index, row in sub_train.iterrows():
    wifi_dict = {}
    for wifi in row.wifi_infos.split(';'):
        bssid, signal = wifi.split(',')
        wifi_dict[bssid] = int(signal)
    train_set.append(wifi_dict)

In [13]:
v = DictVectorizer(sparse=False, sort=False)

In [14]:
train_set = v.fit_transform(train_set)

In [16]:
train_set[train_set == 0] = np.NaN

In [17]:
sub_train = pd.concat([sub_train.reset_index(), pd.DataFrame(train_set)], axis=1)

In [18]:
sub_train

Unnamed: 0,index,shop_id,longitude_x,latitude_x,wifi_infos,build_id,category_id,layer,hot,longitude_y,...,787,788,789,790,791,792,793,794,795,796
0,1,S03097,119.357683,26.043137,"M0000192366,-86;M0000736040,-71;M0000787437,-7...",B00098,C00004,1.0,7.0,119.357681,...,,,,,,,,,,
1,1122,S03054,119.357949,26.042583,"M0000206579,-72;M0000010195,-73;M0000920385,-7...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,
2,1459,S03054,119.357094,26.042793,"M0000778589,-65;M0000206579,-68;M0000104914,-7...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,
3,1608,S03097,119.357854,26.043286,"M0000911235,-58;M0000923071,-59;M0000789568,-6...",B00098,C00004,1.0,7.0,119.357681,...,,,,,,,,,,
4,1765,S03097,119.357745,26.043318,"M0000562185,-49;M0000630779,-68;M0000075828,-7...",B00098,C00004,1.0,7.0,119.357681,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2590,1201843,S03054,119.357450,26.042749,"M0000398830,-73;M0000504132,-77;M0000627902,-7...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,
2591,1202697,S03097,119.357781,26.043496,"M0000562185,-48;M0000630779,-68;M0000075828,-6...",B00098,C00004,1.0,7.0,119.357681,...,,,,,,,,,,
2592,1203801,S03054,119.357350,26.043063,"M0000206579,-55;M0000772175,-57;M0000010195,-6...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,
2593,1204490,S03054,119.357595,26.042855,"M0000670552,-72;M0000770093,-73;M0000065922,-7...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,


In [20]:
lbl = LabelEncoder()
lbl.fit(list(sub_train['shop_id'].values))

LabelEncoder()

In [22]:
sub_train['label'] = lbl.transform(list(sub_train['shop_id'].values))

In [23]:
sub_train

Unnamed: 0,index,shop_id,longitude_x,latitude_x,wifi_infos,build_id,category_id,layer,hot,longitude_y,...,788,789,790,791,792,793,794,795,796,label
0,1,S03097,119.357683,26.043137,"M0000192366,-86;M0000736040,-71;M0000787437,-7...",B00098,C00004,1.0,7.0,119.357681,...,,,,,,,,,,1
1,1122,S03054,119.357949,26.042583,"M0000206579,-72;M0000010195,-73;M0000920385,-7...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,0
2,1459,S03054,119.357094,26.042793,"M0000778589,-65;M0000206579,-68;M0000104914,-7...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,0
3,1608,S03097,119.357854,26.043286,"M0000911235,-58;M0000923071,-59;M0000789568,-6...",B00098,C00004,1.0,7.0,119.357681,...,,,,,,,,,,1
4,1765,S03097,119.357745,26.043318,"M0000562185,-49;M0000630779,-68;M0000075828,-7...",B00098,C00004,1.0,7.0,119.357681,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2590,1201843,S03054,119.357450,26.042749,"M0000398830,-73;M0000504132,-77;M0000627902,-7...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,0
2591,1202697,S03097,119.357781,26.043496,"M0000562185,-48;M0000630779,-68;M0000075828,-6...",B00098,C00004,1.0,7.0,119.357681,...,,,,,,,,,,1
2592,1203801,S03054,119.357350,26.043063,"M0000206579,-55;M0000772175,-57;M0000010195,-6...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,0
2593,1204490,S03054,119.357595,26.042855,"M0000670552,-72;M0000770093,-73;M0000065922,-7...",B00098,C00004,2.0,4.0,119.357457,...,,,,,,,,,,0


In [24]:
num_class = sub_train['label'].max() + 1

In [41]:
feature = [x for x in sub_train.columns if x not in ['index','label', 'shop_id', 'wifi_infos','category_id', 'category_name', 'build_id', 'row_id']]

In [60]:
params = {
    'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eta': 0.1,
        'max_depth': 8,
        'eval_metric': 'auc',
        'seed': 0,
        'silent': 1
    }

In [48]:
sub_train[feature].shape, sub_train['label'].shape

((2595, 803), (2595,))

In [52]:
X = sub_train[feature]
Y = sub_train['label']
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=123)
xgbtrain = xgb.DMatrix(X_train, Y_train)
xgbtest = xgb.DMatrix(X_test, Y_test)



In [62]:
# watchlist = [(xgbtrain, 'train'), (xgbtrain, 'test')]
model = xgb.cv(params, xgbtrain, num_boost_round=100, verbose_eval=True)

[0]	train-auc:1+0	test-auc:1+0
[1]	train-auc:1+0	test-auc:1+0
[2]	train-auc:1+0	test-auc:1+0
[3]	train-auc:1+0	test-auc:1+0
[4]	train-auc:1+0	test-auc:1+0
[5]	train-auc:1+0	test-auc:1+0
[6]	train-auc:1+0	test-auc:1+0
[7]	train-auc:1+0	test-auc:1+0
[8]	train-auc:1+0	test-auc:1+0
[9]	train-auc:1+0	test-auc:1+0
[10]	train-auc:1+0	test-auc:1+0
[11]	train-auc:1+0	test-auc:1+0
[12]	train-auc:1+0	test-auc:1+0
[13]	train-auc:1+0	test-auc:1+0
[14]	train-auc:1+0	test-auc:1+0
[15]	train-auc:1+0	test-auc:1+0
[16]	train-auc:1+0	test-auc:1+0
[17]	train-auc:1+0	test-auc:1+0
[18]	train-auc:1+0	test-auc:1+0
[19]	train-auc:1+0	test-auc:1+0
[20]	train-auc:1+0	test-auc:1+0
[21]	train-auc:1+0	test-auc:1+0
[22]	train-auc:1+0	test-auc:1+0
[23]	train-auc:1+0	test-auc:1+0
[24]	train-auc:1+0	test-auc:1+0
[25]	train-auc:1+0	test-auc:1+0
[26]	train-auc:1+0	test-auc:1+0
[27]	train-auc:1+0	test-auc:1+0
[28]	train-auc:1+0	test-auc:1+0
[29]	train-auc:1+0	test-auc:1+0
[30]	train-auc:1+0	test-auc:1+0
[31]	train-auc:1+0