In [5]:
# 基础数据处理库
import pandas as pd
import numpy as np

# 数据集加载
from sklearn.datasets import load_iris, fetch_20newsgroups, fetch_california_housing

# 模型选择与数据分割
from sklearn.model_selection import train_test_split, GridSearchCV

# 特征工程 (预处理与特征提取)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

# 各类算法模型
from sklearn.neighbors import KNeighborsClassifier  # KNN
from sklearn.naive_bayes import MultinomialNB       # 朴素贝叶斯
from sklearn.tree import DecisionTreeClassifier, export_graphviz # 决策树
from sklearn.ensemble import RandomForestClassifier # 随机森林

# 模型评估指标
from sklearn.metrics import classification_report, roc_auc_score

In [8]:
# Load the iris dataset and separate the feature matrix from the class labels.
li = load_iris()
x = li.data
y = li.target

# Hold out 25% of the samples as the test set (fixed seed so the split is reproducible).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1
)

# Standardize the features so that distance-based models such as KNN are not
# dominated by large-valued columns. Only the features are scaled; the class
# labels in y are passed through untouched.
std = StandardScaler()

# Learn the per-feature mean/variance on the training set and scale it.
# The scaled result is a new array that is rebound to x_train.
x_train = std.fit_transform(x_train)
print(std.mean_)
print(std.var_)

# transform() only applies the statistics learned from the training set; it
# does not re-estimate them, so the values printed below match the ones above.
x_test = std.transform(x_test)
print('-' * 50)
print(std.mean_)
print(std.var_)

[5.83035714 3.025      3.79464286 1.20803571]
[0.69461416 0.174375   3.14050702 0.591989  ]
--------------------------------------------------
[5.83035714 3.025      3.79464286 1.20803571]
[0.69461416 0.174375   3.14050702 0.591989  ]


In [11]:
import pandas as pd

# KNN example: load the check-in data set.

# The path is relative to the notebook's working directory:
# "../" goes up one level, then into day14/data/FBlocation.
data = pd.read_csv("../day14/data/FBlocation/train.csv")

# Quick sanity checks on the freshly loaded frame.
print("数据加载成功！")
print(data.head())
print(data.shape)
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() -- doing so emits a stray "None" line after the report.
data.info()

# Keep only the check-ins inside a small spatial window (open intervals on
# x and y) so the rest of the notebook works on a much smaller frame.
data = data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75")

数据加载成功！
   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949
(29118021, 6)
<class 'pandas.DataFrame'>
RangeIndex: 29118021 entries, 0 to 29118020
Data columns (total 6 columns):
 #   Column    Dtype  
---  ------    -----  
 0   row_id    int64  
 1   x         float64
 2   y         float64
 3   accuracy  int64  
 4   time      int64  
 5   place_id  int64  
dtypes: float64(2), int64(4)
memory usage: 1.3 GB
None


In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns of `data`.
data.describe()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0
mean,14505690.0,1.122538,2.632309,82.482101,397551.263128,5129895000.0
std,8353805.0,0.077086,0.070144,113.613227,234601.097883,2357399000.0
min,600.0,1.0001,2.5001,1.0,119.0,1012024000.0
25%,7327816.0,1.0492,2.5738,25.0,174069.75,3312464000.0
50%,14430710.0,1.1233,2.6423,62.0,403387.5,5261906000.0
75%,21634630.0,1.1905,2.6878,75.0,602111.75,6766325000.0
max,29112150.0,1.2499,2.7499,1004.0,786218.0,9980711000.0


In [13]:
# Interpret the integer `time` column as a number of seconds since the Unix
# epoch and convert it to datetimes; the result keeps the row index of `data`.
time_value = pd.to_datetime(data.loc[:, 'time'], unit='s')
print(time_value.head(5))

600    1970-01-01 18:09:40
957    1970-01-10 02:11:10
4345   1970-01-05 15:08:02
4735   1970-01-06 23:03:03
5580   1970-01-09 11:26:50
Name: time, dtype: datetime64[s]


In [15]:
# Step 1: convert the epoch-second `time` column to datetimes.
timestamps = pd.to_datetime(data['time'], unit='s')

# Step 2 (the important part): wrap the result in a DatetimeIndex, so that
# .day / .hour / .weekday can be read directly, without the .dt accessor.
time_value = pd.DatetimeIndex(timestamps)

# Confirm the conversion produced the expected type.
print("转换成功，现在 time_value 的类型是：", type(time_value))

转换成功，现在 time_value 的类型是： <class 'pandas.DatetimeIndex'>


In [16]:
print('-' * 50)
print(type(data))

# Derive calendar features from the timestamp. Day of month, hour of day and
# day of week strongly influence where people check in (e.g. restaurants at
# meal times, cinemas in the evening), which is why they are added as features.
#
# assign() builds a new frame instead of mutating `data` in place (the original
# data.insert calls raised "already exists" when the cell was re-run), and
# errors='ignore' on drop() makes an immediate re-run harmless once `time` has
# already been removed. The new columns are appended after the existing ones
# in the order day, hour, weekday.
data = (
    data
    .assign(
        day=time_value.day,          # day of the month
        hour=time_value.hour,        # hour of the day (0-23)
        weekday=time_value.weekday,  # 0 = Monday ... 6 = Sunday
    )
    # The raw timestamp is superseded by the derived features above.
    .drop(columns=['time'], errors='ignore')
)
print('-' * 50)
data.head()

Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
600,600,1.2214,2.7023,17,6683426742,1,18,3
957,957,1.1832,2.6891,58,6683426742,10,2,5
4345,4345,1.1935,2.655,11,6889790653,5,15,0
4735,4735,1.1452,2.6074,49,6822359752,6,23,1
5580,5580,1.0089,2.7287,19,1527921905,9,11,4


In [17]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns of `data`.
data.describe()

Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
count,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0
mean,14505690.0,1.122538,2.632309,82.482101,5129895000.0,5.101863,11.485545,3.092377
std,8353805.0,0.077086,0.070144,113.613227,2357399000.0,2.709287,6.932195,1.680218
min,600.0,1.0001,2.5001,1.0,1012024000.0,1.0,0.0,0.0
25%,7327816.0,1.0492,2.5738,25.0,3312464000.0,3.0,6.0,2.0
50%,14430710.0,1.1233,2.6423,62.0,5261906000.0,5.0,12.0,3.0
75%,21634630.0,1.1905,2.6878,75.0,6766325000.0,7.0,17.0,4.0
max,29112150.0,1.2499,2.7499,1004.0,9980711000.0,10.0,23.0,6.0


In [19]:
# Group the check-ins by place and count the non-null entries of every other
# column; each row of the result is one place_id.
place_counts = data.groupby(by='place_id').count()
place_counts

Unnamed: 0_level_0,row_id,x,y,accuracy,day,hour,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1012023972,1,1,1,1,1,1,1
1057182134,1,1,1,1,1,1,1
1059958036,3,3,3,3,3,3,3
1085266789,1,1,1,1,1,1,1
1097200869,1044,1044,1044,1044,1044,1044,1044
...,...,...,...,...,...,...,...
9904182060,1,1,1,1,1,1,1
9915093501,1,1,1,1,1,1,1
9946198589,1,1,1,1,1,1,1
9950190890,1,1,1,1,1,1,1


In [20]:
# Distribution of the per-place counts taken from the `x` column of place_counts.
place_counts['x'].describe()

count     805.000000
mean       22.000000
std        88.955632
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max      1044.000000
Name: x, dtype: float64

In [22]:
# Group the check-ins by place_id and count them.
# count() returns the number of non-null values per column, so the `row_id`
# column of the result is the number of check-ins recorded for each place.
checkins_by_place = data.groupby('place_id')
place_count = checkins_by_place.count()

# Print the first rows to confirm place_count was built.
print(place_count.head())

            row_id     x     y  accuracy   day  hour  weekday
place_id                                                     
1012023972       1     1     1         1     1     1        1
1057182134       1     1     1         1     1     1        1
1059958036       3     3     3         3     3     3        3
1085266789       1     1     1         1     1     1        1
1097200869    1044  1044  1044      1044  1044  1044     1044


In [23]:
# Keep only the places with more than 3 check-ins; places visited 1-3 times
# are treated as noise (too rarely visited to be worth recommending).
frequent_mask = place_count['row_id'] > 3

# reset_index() turns the place_id index back into an ordinary column and
# replaces it with a fresh 0, 1, 2, ... index.
tf = place_count.loc[frequent_mask].reset_index()

# The remaining check-in places.
tf

Unnamed: 0,place_id,row_id,x,y,accuracy,day,hour,weekday
0,1097200869,1044,1044,1044,1044,1044,1044,1044
1,1228935308,120,120,120,120,120,120,120
2,1267801529,58,58,58,58,58,58,58
3,1278040507,15,15,15,15,15,15,15
4,1285051622,21,21,21,21,21,21,21
...,...,...,...,...,...,...,...,...
234,9741307878,5,5,5,5,5,5,5
235,9753855529,21,21,21,21,21,21,21
236,9806043737,6,6,6,6,6,6,6
237,9809476069,23,23,23,23,23,23,23


In [24]:
# Keep only the check-ins whose place_id is listed in tf.
kept_place_ids = tf['place_id']
data = data.loc[data['place_id'].isin(kept_place_ids)]
data.shape

(16918, 8)

In [25]:
# Target: the place that was checked in to.
y = data['place_id']

# Features: everything except the target and row_id. row_id is only a record
# identifier, so it would act as noise for the model.
x = data.drop(columns=['place_id', 'row_id'])

print(x.shape)
print(x.columns)

(16918, 6)
Index(['x', 'y', 'accuracy', 'day', 'hour', 'weekday'], dtype='str')


In [26]:
# Hold out 25% of the check-ins as the test set (fixed seed so the split is reproducible).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1
)

# Standardize the features for KNN. Only the features are scaled; the target
# (place_id) is a class label and is left as-is.
std = StandardScaler()

# Estimate the per-feature mean/variance from the training set only, then
# scale the training set with them. The result is a new object rebound to x_train.
std.fit(x_train)
x_train = std.transform(x_train)
print(std.mean_)
print(std.var_)

# The test set is scaled with the statistics learned from the training set;
# transform() does not recompute them, so the values printed below are unchanged.
x_test = std.transform(x_test)
print('-' * 50)
print(std.mean_)
print(std.var_)

[ 1.12295735  2.63237278 81.34938525  5.10064628 11.44293821  3.10135561]
[5.98489138e-03 4.86857391e-03 1.19597480e+04 7.32837915e+00
 4.83742660e+01 2.81838404e+00]
--------------------------------------------------
[ 1.12295735  2.63237278 81.34938525  5.10064628 11.44293821  3.10135561]
[5.98489138e-03 4.86857391e-03 1.19597480e+04 7.32837915e+00
 4.83742660e+01 2.81838404e+00]


In [35]:
# Build the classifier. n_neighbors is a hyperparameter (8 here); changing it
# changes how good the result is.
# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE(review): for KNN, fit() mainly stores the training features and targets;
# the neighbour search happens at predict time.
knn = KNeighborsClassifier(n_neighbors=8).fit(x_train, y_train)

# Predict the check-in place for every test sample and show the first ten.
y_predict = knn.predict(x_test)
print("预测的目标签到位置为：", y_predict[:10])

# Accuracy on the held-out test set is the evaluation metric.
test_accuracy = knn.score(x_test, y_test)
print("预测的准确率：", test_accuracy)

预测的目标签到位置为： [1913341282 1097200869 6097504486 9632980559 6424972551 1097200869
 8048985799 6683426742 1435128522 3312463746]
预测的准确率： 0.48156028368794324


In [28]:
# First ten true labels of the test set (positional slice), for comparison
# with the first ten predictions.
print(y_test.iloc[:10])

16751286    1893548673
12423167    1097200869
7517023     6097504486
4400015     9632980559
26212472    6424972551
7089828     4022692381
10935607    2327054745
25025511    3533177779
27755137    1435128522
19678934    3312463746
Name: place_id, dtype: int64


In [37]:
# Grid search over KNN hyperparameters.
# Candidate values: every combination of the two lists below is evaluated.
param = {
    "n_neighbors": [3, 5, 10, 12, 15],
    'weights': ['uniform', 'distance'],
}

# cv=3 is 3-fold cross-validation: each candidate is trained on two folds and
# validated on the remaining one.
gc = GridSearchCV(estimator=knn, param_grid=param, cv=3)

# Only the training set is passed in; the search itself splits it further
# into training and validation folds.
gc.fit(x_train, y_train)

# Accuracy on the held-out test set (shown for reference).
print("在测试集上准确率：", gc.score(x_test, y_test))

# Best mean cross-validation score found.
print("在交叉验证当中最好的结果：", gc.best_score_)

# Best estimator, which shows the hyperparameters that were selected.
print("选择最好的模型是：", gc.best_estimator_)

# Full per-candidate, per-fold cross-validation results.
print("每个超参数每次交叉验证的结果：")
gc.cv_results_

{'mean_fit_time': array([0.00670743, 0.0063796 , 0.00694116, 0.00712578, 0.0067416 ,
        0.00680598, 0.02310999, 0.03037413, 0.0290885 , 0.02550809]),
 'std_fit_time': array([0.0002956 , 0.00016718, 0.00013742, 0.00015378, 0.00011064,
        0.00034947, 0.02384658, 0.01590135, 0.00697614, 0.01221116]),
 'mean_score_time': array([0.02433324, 0.03313017, 0.03058306, 0.04263504, 0.03964782,
        0.05705516, 0.12318905, 0.2932082 , 0.2242101 , 0.3838915 ]),
 'std_score_time': array([0.00260341, 0.00029843, 0.00063427, 0.00138743, 0.00083127,
        0.00137836, 0.06844405, 0.08823569, 0.02522227, 0.02121694]),
 'param_n_neighbors': masked_array(data=[3, 3, 5, 5, 10, 10, 12, 12, 15, 15],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value=999999),
 'param_weights': masked_array(data=['uniform', 'distance', 'uniform', 'distance',
                    'uniform', 'distance', 'uniform', 'distance',
            