In [50]:
import pandas as pd


In [69]:
# Load the training set from a path relative to the current working directory.
data = pd.read_csv('data/FBlocation/train.csv')
# Show (number of rows, number of columns) of the loaded frame.
data.shape


(29118021, 6)

In [68]:
# Non-null count per column for the rows whose x coordinate is below 3.
data.loc[data['x'] < 3].count()

row_id      8698250
x           8698250
y           8698250
accuracy    8698250
time        8698250
place_id    8698250
day         8698250
weekday     8698250
hour        8698250
dtype: int64

## 基本数据处理
### 1、缩小数据范围

In [18]:
# Keep only the rows inside the window 2 < x < 2.5 and 1 < y < 1.5.
# `.copy()` makes the filtered result an independent frame, so adding columns
# to `data` afterwards does not raise SettingWithCopyWarning.
data = data.query('x<2.5&x>2&y<1.5&y>1').copy()

### 2、处理时间特征

In [53]:
# Interpret the raw `time` column as a number of seconds since the Unix epoch.
time_value = pd.to_datetime(data['time'],unit='s')
# Wrap the timestamps in a DatetimeIndex so calendar fields can be read from it.
date = pd.DatetimeIndex(time_value)
# Display the converted timestamps.
date

DatetimeIndex(['1970-01-06 10:45:02', '1970-01-03 03:49:15',
               '1970-01-04 17:37:28', '1970-01-09 03:43:07',
               '1970-01-06 11:08:50', '1970-01-03 01:27:45',
               '1970-01-08 17:13:49', '1970-01-05 06:30:02',
               '1970-01-02 22:13:04', '1970-01-05 15:07:40',
               ...
               '1970-01-02 05:57:27', '1970-01-07 02:44:50',
               '1970-01-06 22:33:58', '1970-01-01 11:47:22',
               '1970-01-07 08:40:17', '1970-01-05 15:02:20',
               '1970-01-02 10:51:20', '1970-01-09 12:55:58',
               '1970-01-09 20:29:35', '1970-01-02 04:34:02'],
              dtype='datetime64[ns]', name='time', length=29118021, freq=None)

In [24]:
# Inspect the weekday number of each timestamp (pandas uses Monday=0 ... Sunday=6).
date.weekday

Int64Index([3, 3, 2, 4, 5, 1, 1, 4, 6, 2,
            ...
            4, 3, 4, 4, 6, 4, 2, 4, 4, 3],
           dtype='int64', name='time', length=83197)

In [54]:
# Add the day of the month of each timestamp as a new column.
data['day'] = date.day

In [55]:
# Add the weekday number of each timestamp as a new column.
data['weekday'] = date.weekday

In [56]:
# Add the hour of the day of each timestamp as a new column.
data['hour'] = date.hour

In [57]:
# Display the frame to check the newly added time columns.
data

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
0,0,0.7941,9.0809,54,470702,8523065625,6,1,10
1,1,5.9567,4.7968,13,186555,1757726713,3,5,3
2,2,8.3078,7.0407,74,322648,1137537235,4,6,17
3,3,7.3665,2.5165,65,704587,6567393236,9,4,3
4,4,4.0961,1.1307,31,472130,7440663949,6,1,11
...,...,...,...,...,...,...,...,...,...
29118016,29118016,6.5133,1.1435,67,399740,8671361106,5,0,15
29118017,29118017,5.9186,4.4134,67,125480,9077887898,2,4,10
29118018,29118018,2.9993,6.3680,67,737758,2838334300,9,4,12
29118019,29118019,4.0637,8.0061,70,764975,1007355847,9,4,20


### 3、过滤掉签到较少的地点

In [58]:
# Number of rows (check-ins) per place_id. Selecting `row_id` before
# aggregating counts only that column instead of counting every column and
# then discarding all but one; the resulting Series is the same.
placecount = data.groupby('place_id')['row_id'].count()
# Preview the places that have more than 3 check-ins.
placecount[placecount > 3]

place_id
1000015801     78
1000017288     95
1000025138    563
1000052096    961
1000063498     60
             ... 
9999851158     60
9999855083    212
9999862567     63
9999916757    508
9999932225    218
Name: row_id, Length: 107814, dtype: int64

In [59]:
# Keep only the rows whose place_id has more than 3 entries in `placecount`.
data_final = data.loc[data['place_id'].isin(placecount[placecount > 3].index)]
data_final

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
0,0,0.7941,9.0809,54,470702,8523065625,6,1,10
1,1,5.9567,4.7968,13,186555,1757726713,3,5,3
2,2,8.3078,7.0407,74,322648,1137537235,4,6,17
3,3,7.3665,2.5165,65,704587,6567393236,9,4,3
4,4,4.0961,1.1307,31,472130,7440663949,6,1,11
...,...,...,...,...,...,...,...,...,...
29118016,29118016,6.5133,1.1435,67,399740,8671361106,5,0,15
29118017,29118017,5.9186,4.4134,67,125480,9077887898,2,4,10
29118018,29118018,2.9993,6.3680,67,737758,2838334300,9,4,12
29118019,29118019,4.0637,8.0061,70,764975,1007355847,9,4,20


### 4、筛选特征值和目标值

In [60]:
# Feature matrix: the two coordinates plus the derived time columns.
x = data_final.loc[:, ['x', 'y', 'day', 'weekday', 'hour']]
# Target vector: the place_id to predict.
y = data_final.loc[:, 'place_id']

In [61]:
# Preview the first rows of the feature matrix.
x.head()

Unnamed: 0,x,y,day,weekday,hour
0,0.7941,9.0809,6,1,10
1,5.9567,4.7968,3,5,3
2,8.3078,7.0407,4,6,17
3,7.3665,2.5165,9,4,3
4,4.0961,1.1307,6,1,11


In [66]:
# Pairwise correlation between the feature columns.
x.corr()

Unnamed: 0,x,y,day,weekday,hour
x,1.0,-0.00034,-0.000668,0.000204,0.000494
y,-0.00034,1.0,0.00065,-0.000817,-9.5e-05
day,-0.000668,0.00065,1.0,-0.173775,-0.038269
weekday,0.000204,-0.000817,-0.173775,1.0,0.002767
hour,0.000494,-9.5e-05,-0.038269,0.002767,1.0


In [62]:
# Preview the first target values.
y.head()

0    8523065625
1    1757726713
2    1137537235
3    6567393236
4    7440663949
Name: place_id, dtype: int64

## 数据集划分

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


In [64]:
# Split features and target into training and test parts.
# The fourth name must be `y_test` (it was misspelled `y_text`), because the
# evaluation code reads `y_test`. A fixed `random_state` makes the split
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [65]:

# 3) Feature engineering: standardization
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
# The test set is transformed with the statistics fitted on the training set.
x_test = transfer.transform(x_test)

# 4) KNN estimator wrapped in a grid search
# n_neighbors (k) is not fixed up front; the candidates below are compared
# with 3-fold cross-validation.
param_dict = {'n_neighbors': [1, 3, 5, 7, 9]}
estimator = GridSearchCV(KNeighborsClassifier(), param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5) Model evaluation
# Method 1: compare the predictions with the true labels element by element
y_predict = estimator.predict(x_test)
print('y_predict\n', y_predict)
print('直接比对真实值和预测值\n', y_test == y_predict)
# Method 2: compute the accuracy on the test set
score = estimator.score(x_test, y_test)
print('准确率为：\n', score)

# Inspect the tuning results
# Best parameters
print('最佳参数：\n', estimator.best_params_)
# Best cross-validation score
print('最佳结果:\n', estimator.best_score_)
# Best estimator
print('最佳估计器：\n', estimator.best_estimator_)
# Full cross-validation results
print('交叉验证结果:\n', estimator.cv_results_)



KeyboardInterrupt: 