In [19]:
import pandas as pd

In [20]:
# 1、获取数据
data = pd.read_csv("train.csv")

In [21]:
data

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949
...,...,...,...,...,...,...
29118016,29118016,6.5133,1.1435,67,399740,8671361106
29118017,29118017,5.9186,4.4134,67,125480,9077887898
29118018,29118018,2.9993,6.3680,67,737758,2838334300
29118019,29118019,4.0637,8.0061,70,764975,1007355847


In [22]:
# 2、基本的数据处理
# 1) 缩小数据范围
data = data.query("x < 2.5 & x > 2 & y <1.5 & y > 1.0")

In [23]:
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
112,112,2.236,1.3655,66,623174,7663031065
180,180,2.2003,1.2541,65,610195,2358558474
367,367,2.4108,1.3213,74,579667,6644108708
874,874,2.0822,1.1973,320,143566,3229876087
1022,1022,2.016,1.1659,65,207993,3244363975


In [24]:
# 2) 处理时间特征
time_value = pd.to_datetime(data["time"], unit = "s")

In [25]:
date = pd.DatetimeIndex(time_value)   

In [26]:
data["day"] = date.day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["day"] = date.day


In [27]:
data["weekday"] = date.weekday

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["weekday"] = date.weekday


In [28]:
data["hour"] = date.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["hour"] = date.hour


In [29]:
data

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
112,112,2.2360,1.3655,66,623174,7663031065,8,3,5
180,180,2.2003,1.2541,65,610195,2358558474,8,3,1
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.0160,1.1659,65,207993,3244363975,3,5,9
...,...,...,...,...,...,...,...,...,...
29115112,29115112,2.1889,1.2914,168,721885,4606837364,9,4,8
29115204,29115204,2.1193,1.4692,58,563389,2074133146,7,2,12
29115338,29115338,2.0007,1.4852,25,765986,6691588909,9,4,20
29115464,29115464,2.4132,1.4237,61,151918,7396159924,2,4,18


In [30]:
# 3) 过滤签到次数少的地点
place_count = data.groupby("place_id").count()["row_id"]

In [31]:
place_count[place_count > 3]

place_id
1014605271     28
1015645743      4
1017236154     31
1024951487      5
1028119817      4
             ... 
9936666116    140
9954155328      8
9980625005     16
9994257798     25
9996671132     18
Name: row_id, Length: 950, dtype: int64

In [32]:
data_final = data[data["place_id"].isin(place_count[place_count > 3].index.values)]

In [33]:
# 筛选特征值和目标值
x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
y = data_final["place_id"]

In [34]:
# 数据集划分
from sklearn.model_selection import train_test_split

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [37]:
# 3、特征工程：标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4、KNN算法预估器
# k值一般不适用偶数
estimator = KNeighborsClassifier()

# 参数准备
param_dict = {"n_neighbors": [3, 5, 7, 9]}
# 加入网格搜索和交叉验证
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)

estimator.fit(x_train, y_train)
# 5、模型评估
# (1)、直接比对真实值和预测值
y_predict = estimator.predict(x_test)
print(y_test == y_predict)
# (2)、计算准确率
# 求的是测试集的准确率
score = estimator.score(x_test, y_test)
print(score)

print(estimator.best_estimator_)
print(estimator.best_params_)
# 求的是交叉验证时验证集的最佳准确率
print(estimator.best_score_)
print(estimator.cv_results_)



6546590     False
21276964    False
23018876     True
4390870     False
5949016     False
            ...  
1093875      True
2024451      True
26055801    False
28485493     True
15300300    False
Name: place_id, Length: 20228, dtype: bool
0.3676092544987147
KNeighborsClassifier()
{'n_neighbors': 5}
0.33456381727794254
{'mean_fit_time': array([0.03057003, 0.03266398, 0.02920198, 0.02576232]), 'std_fit_time': array([0.00501919, 0.01009923, 0.00293669, 0.00084063]), 'mean_score_time': array([0.67286785, 0.80601621, 0.90528019, 0.88514527]), 'std_score_time': array([0.02866552, 0.07968677, 0.02581013, 0.02380979]), 'param_n_neighbors': masked_array(data=[3, 5, 7, 9],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}, {'n_neighbors': 9}], 'split0_test_score': array([0.31693692, 0.33300376, 0.33280601, 0.33142179]), 'split1_test_score': array([0.32506056, 0.33544273, 0.3353