# WIFI kNN

根据wifi的kNN距离来计算相似度，然后选择topK

In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
# Shop metadata; the two-column shop -> mall lookup is what the next cell joins on.
shop_info = pd.read_csv('../data/ccf_first_round_shop_info.csv')
shop_to_mall = shop_info.loc[:, ['shop_id', 'mall_id']]
shop_to_mall.head()

Unnamed: 0,shop_id,mall_id
0,s_26,m_690
1,s_133,m_6587
2,s_251,m_5892
3,s_372,m_625
4,s_456,m_3839


In [3]:
# Behaviour log, left-joined with the shop -> mall lookup so that every
# record also carries the mall_id of its shop.
user_shop_hehavior = (
    pd.read_csv('../data/ccf_first_round_user_shop_behavior.csv')
    .merge(shop_to_mall, on='shop_id', how='left')
)
user_shop_hehavior.head()

Unnamed: 0,user_id,shop_id,time_stamp,longitude,latitude,wifi_infos,mall_id
0,u_376,s_2871718,2017-08-06 21:20,122.308291,32.08804,b_6396480|-67|false;b_41124514|-86|false;b_287...,m_1409
1,u_376,s_2871718,2017-08-06 21:20,122.308162,32.08797,b_6396480|-67|false;b_56328155|-73|false;b_411...,m_1409
2,u_1041,s_181637,2017-08-02 13:10,117.365255,40.638214,b_8006367|-78|false;b_2485110|-52|false;b_3005...,m_4079
3,u_1158,s_609470,2017-08-13 12:30,121.134451,31.197416,b_26250579|-73|false;b_26250580|-64|false;b_26...,m_6587
4,u_1654,s_3816766,2017-08-25 19:50,122.255867,31.35132,b_39004150|-66|false;b_39004148|-58|false;b_21...,m_3005


In [4]:
# Rows to predict. Per the preview below they already carry `mall_id`
# (no merge needed) and have a `row_id` instead of a `shop_id`.
evalution = pd.read_csv('../data/evaluation_public.csv')
evalution.head()

Unnamed: 0,row_id,user_id,mall_id,time_stamp,longitude,latitude,wifi_infos
0,118742,u_30097142,m_3916,2017-09-05 13:00,122.141011,39.818847,b_34366982|-82|false;b_37756289|-53|false;b_41...
1,118743,u_30097803,m_5085,2017-09-06 13:10,118.191907,32.855858,b_36722251|-81|false;b_10537579|-75|false;b_43...
2,118744,u_30097889,m_4033,2017-09-06 17:40,119.19211,32.424667,b_30026291|-74|false;b_30026290|-74|false;b_36...
3,118745,u_30098996,m_4515,2017-09-03 12:10,120.612201,34.055249,b_33412374|-77|false;b_22084893|-86|false;b_52...
4,118746,u_30099170,m_7168,2017-09-02 20:40,116.861989,40.326858,b_19882704|-77|false;b_2241462|-49|false;b_585...


In [5]:
from pprint import pprint

In [20]:
# Per-mall training data. `train[mall_id]` is a list of WiFi fingerprints
# ({identifier: shifted signal}) and `target[mall_id]` is the parallel list of
# shop ids: the same position in both lists refers to the same record.
train = defaultdict(list)
target = defaultdict(list)
for mall_id, shop_id, wifi_infos in zip(
    user_shop_hehavior['mall_id'],
    user_shop_hehavior['shop_id'],
    user_shop_hehavior['wifi_infos'],
):
    fingerprint = {}
    for entry in wifi_infos.split(';'):
        fields = entry.split('|')
        # Add 100 so that the signal values all become positive.
        fingerprint[fields[0]] = int(fields[1]) + 100
    target[mall_id].append(shop_id)
    train[mall_id].append(fingerprint)


In [6]:
# Evaluation data as three parallel lists (same position = same row):
# the WiFi fingerprint, the mall the row belongs to, and its row id.
test_row_id = evalution['row_id'].tolist()
test_mall_id = evalution['mall_id'].tolist()
test = []
for wifi_infos in evalution['wifi_infos']:
    fingerprint = {}
    for entry in wifi_infos.split(';'):
        fields = entry.split('|')
        # Same +100 shift as applied to the training fingerprints.
        fingerprint[fields[0]] = int(fields[1]) + 100
    test.append(fingerprint)

## 去重复

In [7]:
# Every distinct WiFi identifier (the part before the first '|') in the training log.
train_wifi_set = {
    entry.split('|')[0]
    for wifi_infos in user_shop_hehavior.wifi_infos
    for entry in wifi_infos.split(';')
}

In [8]:
# Every distinct WiFi identifier (the part before the first '|') in the evaluation data.
test_wifi_set = {
    entry.split('|')[0]
    for wifi_infos in evalution.wifi_infos
    for entry in wifi_infos.split(';')
}

In [9]:
# Number of distinct WiFi identifiers seen in the training log.
len(train_wifi_set)

399679

In [10]:
# Number of distinct WiFi identifiers seen in the evaluation data.
len(test_wifi_set)

224254

In [11]:
# WiFi identifiers that occur in both the training log and the evaluation data.
public_wifi_set = train_wifi_set & test_wifi_set
len(public_wifi_set)

121209

In [12]:
def mode1(wifi_dict1, wifi_dict2, public_wifi=None, missing=-100):
    """Euclidean distance between two WiFi fingerprints.

    The distance is taken over the union of the identifiers present in either
    fingerprint, restricted to those contained in ``public_wifi``. An
    identifier absent from one fingerprint is filled in with ``missing``.

    Parameters
    ----------
    wifi_dict1, wifi_dict2 : dict
        Map WiFi identifier -> numeric signal value.
    public_wifi : container, optional
        Identifiers allowed to contribute to the distance. When omitted, the
        notebook-level ``public_wifi_set`` is used.
    missing : number, optional
        Fill value for an identifier that one fingerprint lacks.
        NOTE(review): the fingerprints built in this notebook are shifted by
        +100, yet the historical fill value is -100 (unshifted scale). Passing
        ``missing=0`` would match the shifted scale; confirm which is intended.

    Returns
    -------
    float
        Square root of the summed squared differences (0.0 when no identifier
        qualifies).
    """
    if public_wifi is None:
        # Resolved at call time so the cell can be defined before the set exists.
        public_wifi = public_wifi_set
    # Union of BOTH fingerprints: a network visible in only one of them must
    # still add to the distance (previously wifi_dict2's keys were left out).
    wifi_ssid_all = set(wifi_dict1) | set(wifi_dict2)
    distance = 0
    for wifi_ssid in wifi_ssid_all:
        # Only WiFi seen in both the training and evaluation data is compared.
        if wifi_ssid in public_wifi:
            diff = wifi_dict1.get(wifi_ssid, missing) - wifi_dict2.get(wifi_ssid, missing)
            distance += diff ** 2   # Euclidean; use abs(diff) here for Manhattan
    return distance ** 0.5

## 输出

In [13]:
topK = 5   # number of nearest training records that vote on the prediction
preds = []
all_count = 0
no_candidate_count = 0
for mall_id, wifi_dict1 in zip(test_mall_id, test):
    # .get() instead of [] so that a mall without training records does not
    # insert an empty entry into the defaultdicts.
    candidates = zip(target.get(mall_id, []), train.get(mall_id, []))
    # Distance from this evaluation row to every training record of the same mall.
    distances = [(real, mode1(wifi_dict1, wifi_dict2)) for real, wifi_dict2 in candidates]
    # Keep the topK closest records (ascending distance).
    nearest = sorted(distances, key=lambda x: x[1])[:topK]
    # Count how often each shop occurs among the nearest records.
    votes = {}
    for shop_id, _ in nearest:
        votes[shop_id] = votes.get(shop_id, 0) + 1
    if votes:
        # Most frequent shop wins; on a tie max() returns the first maximal key,
        # i.e. the shop whose record was closest (dicts keep insertion order).
        pred = max(votes, key=votes.get)
    else:
        # No training record for this mall: emit an empty prediction instead of
        # failing with IndexError part-way through a long run.
        pred = None
        no_candidate_count += 1
    preds.append(pred)
    # Progress report every 1000 rows.
    all_count += 1
    if all_count % 1000 == 0:
        print(all_count)
print(all_count)
if no_candidate_count:
    print('rows without any training candidate:', no_candidate_count)
result = pd.DataFrame({'row_id': test_row_id, 'shop_id': preds})
result.to_csv('wifi_kNN_with_public_wifi.csv', index=False)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000


KeyboardInterrupt: 