In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 项目描述

> 本次比赛的目的是预测一个人将要签到的地方。 为了本次比赛，Facebook创建了一个虚拟世界，其中包括10公里*10公里共100平方公里的约10万个地方。 对于给定的坐标集，您的任务将根据用户的位置，准确性和时间戳等预测用户下一次的签到位置。 数据被制作成类似于来自移动设备的位置数据。 请注意：您只能使用提供的数据进行预测。


website:https://www.kaggle.com/c/facebook-v-predicting-check-ins


- 1.获取数据集
- 2.基本数据处理
    - 2.1 缩小数据范围
    - 2.2 选择时间特征
    - 2.3 去掉签到较少的地方
    - 2.4 确定特征值和目标值
    - 2.5 分割数据集
- 3.特征工程 -- 特征预处理(标准化)
- 4.机器学习 -- knn+cv
- 5.模型评估



In [2]:
data = pd.read_csv("../data/FBlocation/train.csv")
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [3]:
data.describe()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0
mean,14559010.0,4.99977,5.001814,82.84912,417010.4,5493787000.0
std,8405649.0,2.857601,2.887505,114.7518,231176.1,2611088000.0
min,0.0,0.0,0.0,1.0,1.0,1000016000.0
25%,7279505.0,2.5347,2.4967,27.0,203057.0,3222911000.0
50%,14559010.0,5.0091,4.9883,62.0,433922.0,5518573000.0
75%,21838520.0,7.4614,7.5103,75.0,620491.0,7764307000.0
max,29118020.0,10.0,10.0,1033.0,786239.0,9999932000.0


In [5]:
data.shape

(29118021, 6)

In [6]:
# 2.基本数据处理
# 2.1 缩小数据范围,否则速度太慢演示
partial_data = data.query('x > 2 & x < 2.5 & y > 2 & y < 2.5')
partial_data.shape

(71664, 6)

In [7]:
partial_data['time'].head()

163     669737
310     234719
658     502343
1368    319822
1627    595084
Name: time, dtype: int64

In [9]:
#这时候 time 是一个 series,无法调用其内的属性,必须转成DatetimeIndex 属性,unit=s,意思是319822这个数据的单位是秒
time = pd.to_datetime(partial_data['time'], unit='s')
print(type(time))
time = pd.DatetimeIndex(time)
time

<class 'pandas.core.series.Series'>


DatetimeIndex(['1970-01-08 18:02:17', '1970-01-03 17:11:59',
               '1970-01-06 19:32:23', '1970-01-04 16:50:22',
               '1970-01-07 21:18:04', '1970-01-02 03:14:59',
               '1970-01-07 03:45:16', '1970-01-05 03:28:43',
               '1970-01-01 18:59:03', '1970-01-09 07:50:12',
               ...
               '1970-01-09 20:03:34', '1970-01-08 09:26:50',
               '1970-01-07 04:45:59', '1970-01-07 22:36:18',
               '1970-01-06 23:29:43', '1970-01-03 12:31:26',
               '1970-01-04 15:19:20', '1970-01-01 20:49:14',
               '1970-01-03 09:17:37', '1970-01-02 20:34:43'],
              dtype='datetime64[ns]', name='time', length=71664, freq=None)

In [10]:
partial_data['hour'] = time.hour
partial_data['day'] = time.day
partial_data['weekday'] = time.weekday
partial_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour,day,weekday
163,163,2.1663,2.3755,84,669737,3869813743,18,8,3
310,310,2.3695,2.2034,3,234719,2636621520,17,3,5
658,658,2.3236,2.1768,66,502343,7877745055,19,6,1
1368,1368,2.2613,2.3392,73,319822,9775192577,16,4,6
1627,1627,2.3331,2.0011,66,595084,6731326909,21,7,2


In [12]:
#删除那些访问次数太少的地方
place_count = partial_data.groupby('place_id').count()
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,hour,day,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1006234733,1,1,1,1,1,1,1,1
1008823061,4,4,4,4,4,4,4,4
1012580558,3,3,3,3,3,3,3,3
1025585791,21,21,21,21,21,21,21,21
1026507711,220,220,220,220,220,220,220,220


In [13]:
#只选择哪个出现次数大于 3 的地方
place_count = place_count.query('x>3')
# place_count = place_count[place_count['x']>3]
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,hour,day,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1008823061,4,4,4,4,4,4,4,4
1025585791,21,21,21,21,21,21,21,21
1026507711,220,220,220,220,220,220,220,220
1032417180,10,10,10,10,10,10,10,10
1040557418,123,123,123,123,123,123,123,123


In [17]:
partial_data['place_id'].isin(place_count.index).head()

163     True
310     True
658     True
1368    True
1627    True
Name: place_id, dtype: bool

In [18]:
partial_data = partial_data[partial_data['place_id'].isin(place_count.index)]
partial_data.shape

(69264, 9)

In [19]:
partial_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour,day,weekday
163,163,2.1663,2.3755,84,669737,3869813743,18,8,3
310,310,2.3695,2.2034,3,234719,2636621520,17,3,5
658,658,2.3236,2.1768,66,502343,7877745055,19,6,1
1368,1368,2.2613,2.3392,73,319822,9775192577,16,4,6
1627,1627,2.3331,2.0011,66,595084,6731326909,21,7,2


In [20]:
# 确认目标值和特征
x = partial_data[["x", "y", "accuracy", "hour", "day", "weekday"]]
y = partial_data['place_id']
x.head()

Unnamed: 0,x,y,accuracy,hour,day,weekday
163,2.1663,2.3755,84,18,8,3
310,2.3695,2.2034,3,17,3,5
658,2.3236,2.1768,66,19,6,1
1368,2.2613,2.3392,73,16,4,6
1627,2.3331,2.0011,66,21,7,2


In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, test_size = 0.25)
x_train.head()

Unnamed: 0,x,y,accuracy,hour,day,weekday
5932039,2.2994,2.1828,150,3,8,3
25485938,2.4097,2.1293,59,0,3,5
3337452,2.4564,2.3449,64,13,7,2
16219064,2.101,2.1215,33,7,5,0
13266403,2.4985,2.3149,68,16,4,6


In [47]:
#特征工程,标准化,结果是 numpy.array类型
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)


In [48]:
estimator = KNeighborsClassifier()
param_grid = {'n_neighbors': [3, 5, 7, 9]}
#n_jobs:使用 cpu 几个核
estimator = GridSearchCV(estimator=estimator, param_grid = param_grid, cv=10, n_jobs = 4)
estimator.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_neighbors': [3, 5, 7, 9]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [37]:
score_train = estimator.score(x_train, y_train)
score_train

0.5531108031108031

In [38]:
score_test = estimator.score(x_test, y_test)
score_test

0.37115962115962114

In [39]:
y_pred = estimator.predict(x_test)
y_pred

array([1207701286, 4372969211, 2636621520, ..., 4980449560, 3539133103,
       1553752228])

In [40]:
# 5.3 其他结果输出
print("最好的模型是:\n", estimator.best_estimator_)
print("最好的结果是:\n", estimator.best_score_)
print("所有的结果是:\n", estimator.cv_results_)

最好的模型是:
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
最好的结果是:
 0.3644991144991145
所有的结果是:
 {'mean_fit_time': array([0.07369885, 0.07049057, 0.06985605, 0.10535872]), 'std_fit_time': array([0.00479042, 0.00158252, 0.00209449, 0.02073307]), 'mean_score_time': array([0.24002621, 0.28908081, 0.30812411, 0.55429609]), 'std_score_time': array([0.02195756, 0.04970395, 0.01231399, 0.10705452]), 'param_n_neighbors': masked_array(data=[3, 5, 7, 9],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}, {'n_neighbors': 9}], 'split0_test_score': array([0.33834986, 0.34950425, 0.33711048, 0.33162181]), 'split1_test_score': array([0.33333333, 0.34301596, 0.33978842, 0.34068496]), 'split2_test_score': array([0.33758658, 0.3558148 , 0.35180459, 0.34524244]), 'split3_test_s