In [1]:
import pandas as pd

In [2]:
# Load the training data from the local CSV file.
data = pd.read_csv(filepath_or_buffer="./FBlocation/train.csv")

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [4]:
# Basic preprocessing
# 1. Narrow the data down to a small spatial window (2 < x < 2.5, 1 < y < 1.5).
# Take an explicit copy: later cells add new columns to `data`, and assigning
# into the un-copied result of a row filter can trigger pandas'
# SettingWithCopyWarning.
data = data.query("x<2.5 & x>2 & y<1.5 & y>1").copy()

In [5]:
# Preview the first five rows that survived the spatial filter.
data.head(n=5)

Unnamed: 0,row_id,x,y,accuracy,time,place_id
112,112,2.236,1.3655,66,623174,7663031065
180,180,2.2003,1.2541,65,610195,2358558474
367,367,2.4108,1.3213,74,579667,6644108708
874,874,2.0822,1.1973,320,143566,3229876087
1022,1022,2.016,1.1659,65,207993,3244363975


In [6]:
# 2. Build time features.
# Interpret the integer `time` column as a count of seconds and turn it into
# pandas timestamps.
time_value = pd.to_datetime(arg=data["time"], unit="s")

In [7]:
# Wrap the timestamps in a DatetimeIndex to expose day / weekday / hour fields.
date = pd.DatetimeIndex(data=time_value)

In [8]:
# Day of the month, taken from the timestamp Series (shares `data`'s index).
data["day"] = time_value.dt.day

In [9]:
# Day of the week (Monday = 0), taken from the timestamp Series.
data["weekday"] = time_value.dt.weekday

In [10]:
# Hour of the day, taken from the timestamp Series.
data["hour"] = time_value.dt.hour

In [11]:
# Preview the frame with the new day / weekday / hour columns.
data.head(n=5)

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
180,180,2.2003,1.2541,65,610195,2358558474,8,3,1
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9


In [12]:
# 3. Filter out places with few check-ins.
# Count check-ins per place. Selecting `row_id` before aggregating means only
# that one column is counted, instead of counting every column and then
# discarding all but one; the resulting Series is the same.
place_count = data.groupby("place_id")["row_id"].count()

In [13]:
# Boolean-mask the counts to places with more than 3 check-ins and preview them.
place_count[place_count.gt(3)].head(n=5)

place_id
1014605271    28
1015645743     4
1017236154    31
1024951487     5
1028119817     4
Name: row_id, dtype: int64

In [14]:
# Keep only rows whose place_id belongs to a place with more than 3 check-ins:
# build the boolean membership mask first, then use it to index `data`.
data_final = data.loc[data["place_id"].isin(place_count[place_count.gt(3)].index)]

In [15]:
# Select the feature matrix and the target vector.
x = data_final.loc[:, ["x", "y", "accuracy", "day", "weekday", "hour"]]
y = data_final.loc[:, "place_id"]

In [16]:
# Preview the first five rows of the feature matrix.
x.head(n=5)

Unnamed: 0,x,y,accuracy,day,weekday,hour
112,2.236,1.3655,66,8,3,5
367,2.4108,1.3213,74,7,2,17
874,2.0822,1.1973,320,2,4,15
1022,2.016,1.1659,65,3,5,9
1045,2.3859,1.166,498,6,1,19


In [17]:
# Preview the first five target labels.
y.head(n=5)

112     7663031065
367     6644108708
874     3229876087
1022    3244363975
1045    6438240873
Name: place_id, dtype: int64

In [18]:
# 数据集划分
from sklearn.model_selection import train_test_split

In [19]:
# Fix the shuffle seed so the split (and every score computed from it) is
# reproducible across Restart & Run All; without `random_state` each run
# draws a different train/test partition.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [21]:
# Feature engineering: standardization.
# The scaler's statistics are learned from the training split only and then
# reused unchanged on the test split.
transfer = StandardScaler()
x_train = transfer.fit(x_train).transform(x_train)
x_test = transfer.transform(x_test)

# KNN classifier wrapped in a 3-fold grid search over the neighbour count.
param_dict = {"n_neighbors": [3, 5, 7, 9]}
estimator = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_dict,
    cv=3,
)
estimator.fit(x_train, y_train)

# Evaluate the refitted best model on the held-out split.
y_predict = estimator.predict(x_test)
score = estimator.score(x_test, y_test)

print("预测值为：", y_predict)
print("比对真实值和预测值：", y_test == y_predict)
print("准确率为：", score)

# Best hyper-parameters found by the search
print("最佳参数：", estimator.best_params_)
# Best mean cross-validation score
print("最佳结果：", estimator.best_score_)
# Estimator refitted with the best hyper-parameters
print("最佳估计器：", estimator.best_estimator_)
# Full cross-validation report
print("交叉验证结果：", estimator.cv_results_)



预测值为： [3157530019 6054985864 6438240873 ... 9764078387 1841689795 3974542317]
比对真实值和预测值： 12957121    False
768704      False
27375296     True
4471218     False
10575308     True
            ...  
8806972     False
1875223      True
19891370     True
21788485    False
12750702    False
Name: place_id, Length: 20228, dtype: bool
准确率为： 0.3672631995254103
最佳参数： {'n_neighbors': 5}
最佳结果： 0.33520651605529633
最佳估计器： KNeighborsClassifier()
交叉验证结果： {'mean_fit_time': array([0.06320937, 0.06100424, 0.06190825, 0.06120269]), 'std_fit_time': array([0.00278718, 0.00189133, 0.00161086, 0.00036386]), 'mean_score_time': array([0.83296545, 0.87193847, 0.92566093, 0.94990158]), 'std_score_time': array([0.00320089, 0.01312203, 0.00741148, 0.00107642]), 'param_n_neighbors': masked_array(data=[3, 5, 7, 9],
             mask=[False, False, False, False],
       fill_value=999999), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}, {'n_neighbors': 9}], 'split0_test_score': array([0.3230176