In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# 具体步骤：
# 1.获取数据集
# 2.基本数据处理
# 2.1 缩小数据范围
# 2.2 选择时间特征
# 2.3 去掉签到较少的地方
# 2.4 确定特征值和目标值
# 2.5 分割数据集
# 3.特征工程 -- 特征预处理(标准化)
# 4.机器学习 -- knn+cv
# 5.模型评估

In [2]:
# 1.获取数据集
data = pd.read_csv('./data/FBlocation/train.csv')

In [3]:
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [4]:
data.describe()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0
mean,14559010.0,4.99977,5.001814,82.84912,417010.4,5493787000.0
std,8405649.0,2.857601,2.887505,114.7518,231176.1,2611088000.0
min,0.0,0.0,0.0,1.0,1.0,1000016000.0
25%,7279505.0,2.5347,2.4967,27.0,203057.0,3222911000.0
50%,14559010.0,5.0091,4.9883,62.0,433922.0,5518573000.0
75%,21838520.0,7.4614,7.5103,75.0,620491.0,7764307000.0
max,29118020.0,10.0,10.0,1033.0,786239.0,9999932000.0


In [5]:
data.shape

(29118021, 6)

In [8]:
# 2.基本数据处理
# 2.1 缩小数据范围
partial_data = data.query('x>2.0 & x<2.5 & y>2.0 & y<2.5')

In [9]:
partial_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
163,163,2.1663,2.3755,84,669737,3869813743
310,310,2.3695,2.2034,3,234719,2636621520
658,658,2.3236,2.1768,66,502343,7877745055
1368,1368,2.2613,2.3392,73,319822,9775192577
1627,1627,2.3331,2.0011,66,595084,6731326909


In [10]:
partial_data.shape

(71664, 6)

In [11]:
# 2.2 选择时间特征
partial_data['time'].head()

163     669737
310     234719
658     502343
1368    319822
1627    595084
Name: time, dtype: int64

In [12]:
#  指定秒数
time = pd.to_datetime(partial_data['time'], unit='s')
# 脱敏
time.head()

163    1970-01-08 18:02:17
310    1970-01-03 17:11:59
658    1970-01-06 19:32:23
1368   1970-01-04 16:50:22
1627   1970-01-07 21:18:04
Name: time, dtype: datetime64[ns]

In [13]:
time = pd.DatetimeIndex(time)
time

DatetimeIndex(['1970-01-08 18:02:17', '1970-01-03 17:11:59',
               '1970-01-06 19:32:23', '1970-01-04 16:50:22',
               '1970-01-07 21:18:04', '1970-01-02 03:14:59',
               '1970-01-07 03:45:16', '1970-01-05 03:28:43',
               '1970-01-01 18:59:03', '1970-01-09 07:50:12',
               ...
               '1970-01-09 20:03:34', '1970-01-08 09:26:50',
               '1970-01-07 04:45:59', '1970-01-07 22:36:18',
               '1970-01-06 23:29:43', '1970-01-03 12:31:26',
               '1970-01-04 15:19:20', '1970-01-01 20:49:14',
               '1970-01-03 09:17:37', '1970-01-02 20:34:43'],
              dtype='datetime64[ns]', name='time', length=71664, freq=None)

In [14]:
time.hour

Int64Index([18, 17, 19, 16, 21,  3,  3,  3, 18,  7,
            ...
            20,  9,  4, 22, 23, 12, 15, 20,  9, 20],
           dtype='int64', name='time', length=71664)

In [22]:
partial_data['hour'] = time.hour
partial_data['day'] = time.day
partial_data['weekday'] = time.weekday

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
partial_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
163,163,2.1663,2.3755,84,669737,3869813743
310,310,2.3695,2.2034,3,234719,2636621520
658,658,2.3236,2.1768,66,502343,7877745055
1368,1368,2.2613,2.3392,73,319822,9775192577
1627,1627,2.3331,2.0011,66,595084,6731326909


In [16]:
# 2.3 去掉签到较少的地方
place_count = partial_data.groupby('place_id').count()

In [17]:
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1006234733,1,1,1,1,1
1008823061,4,4,4,4,4
1012580558,3,3,3,3,3
1025585791,21,21,21,21,21
1026507711,220,220,220,220,220


In [18]:
place_count = place_count[place_count['row_id']>3]
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1008823061,4,4,4,4,4
1025585791,21,21,21,21,21
1026507711,220,220,220,220,220
1032417180,10,10,10,10,10
1040557418,123,123,123,123,123


In [19]:
partial_data['place_id'].isin(place_count.index)

163         True
310         True
658         True
1368        True
1627        True
            ... 
29116142    True
29116267    True
29116295    True
29116475    True
29117203    True
Name: place_id, Length: 71664, dtype: bool

In [None]:
# 2.基本数据处理
# 2.1 缩小数据范围
# 2.2 选择时间特征
# 2.3 去掉签到较少的地方
# 2.4 确定特征值和目标值
# 2.5 分割数据集
# 3.特征工程 -- 特征预处理(标准化)
# 4.机器学习 -- knn+cv
# 5.模型评估