In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
csv_data = pd.read_csv("iBeacon_RSSI_Labeled.csv")

In [4]:
csv_data.location.value_counts()

K04    34
J04    32
J06    29
J07    27
I07    27
I06    27
I08    26
K05    25
Q05    24
O05    24
J03    24
O04    24
S01    23
K03    23
K06    22
J02    22
L06    22
S02    21
I02    21
M06    20
L04    20
S06    20
S05    20
M04    19
J05    19
I05    19
I03    19
Q03    18
Q04    18
I04    18
       ..
J15     8
U05     8
P06     8
I09     8
V15     8
O02     8
T15     7
P15     7
Q01     6
J10     6
D13     6
P02     6
T03     6
L01     6
R06     6
K01     6
U15     5
I15     5
F08     4
T01     4
S08     4
G15     4
Q02     4
D14     4
Q06     4
E15     4
L08     3
S15     3
O01     2
L09     2
Name: location, Length: 105, dtype: int64

In [5]:
csv_data.describe()

Unnamed: 0,b3001,b3002,b3003,b3004,b3005,b3006,b3007,b3008,b3009,b3010,b3011,b3012,b3013
count,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0
mean,-197.825352,-156.623944,-175.533099,-164.534507,-178.378169,-175.06338,-195.637324,-191.970423,-197.14507,-197.442254,-197.748592,-197.233803,-196.065493
std,16.259105,60.217747,49.452958,56.523261,47.175799,49.596627,22.88098,30.733742,19.160207,17.741632,16.852535,18.541088,22.053924
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
50%,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
75%,-200.0,-78.0,-200.0,-80.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
max,-67.0,-59.0,-56.0,-56.0,-60.0,-62.0,-58.0,-56.0,-55.0,-61.0,-59.0,-60.0,-59.0


In [6]:
#把时间转化成时间戳
import time
for i in range(len(csv_data['date'])):
    dt = csv_data.loc[i,'date']
    timeArray = time.strptime(dt, "%m-%d-%Y %H:%M:%S")#Y 必须大写
    #转换成时间戳
    timestamp = time.mktime(timeArray)
    csv_data.loc[i,'date']=timestamp
csv_data.describe()

Unnamed: 0,date,b3001,b3002,b3003,b3004,b3005,b3006,b3007,b3008,b3009,b3010,b3011,b3012,b3013
count,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0
mean,1470867000.0,-197.825352,-156.623944,-175.533099,-164.534507,-178.378169,-175.06338,-195.637324,-191.970423,-197.14507,-197.442254,-197.748592,-197.233803,-196.065493
std,6710595.0,16.259105,60.217747,49.452958,56.523261,47.175799,49.596627,22.88098,30.733742,19.160207,17.741632,16.852535,18.541088,22.053924
min,1461030000.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,1464142000.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
50%,1474690000.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
75%,1476759000.0,-200.0,-78.0,-200.0,-80.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
max,1476761000.0,-67.0,-59.0,-56.0,-56.0,-60.0,-62.0,-58.0,-56.0,-55.0,-61.0,-59.0,-60.0,-59.0


In [7]:
train_data = csv_data.iloc[:, 1:]
train_target = csv_data.location
print(train_data)
print(train_target)

            date  b3001  b3002  b3003  b3004  b3005  b3006  b3007  b3008  \
0     1476760521   -200   -200   -200   -200   -200    -78   -200   -200   
1     1476760519   -200   -200   -200   -200   -200    -78   -200   -200   
2     1476760517   -200   -200   -200   -200   -200    -77   -200   -200   
3     1476760515   -200   -200   -200   -200   -200    -77   -200   -200   
4     1476760513   -200   -200   -200   -200   -200    -77   -200   -200   
5     1476760511   -200   -200    -82   -200   -200   -200   -200   -200   
6     1476760509   -200   -200    -80   -200   -200    -77   -200   -200   
7     1476760507   -200   -200    -86   -200   -200   -200   -200   -200   
8     1476760505   -200   -200   -200    -75   -200   -200   -200   -200   
9     1476760503   -200   -200   -200    -75   -200   -200   -200   -200   
10    1476760501   -200   -200   -200    -80   -200   -200   -200   -200   
11    1476760499   -200   -200   -200    -78   -200   -200   -200   -200   
12    147676

In [8]:
x1_train, x1_test, y1_train, y1_test = train_test_split(train_data, train_target, test_size=0.3)

In [9]:
# 决策树算法
dt = DecisionTreeClassifier()
dt.fit(x1_train, y1_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
s1_dt = dt.score(x1_train, y1_train)
print("训练集准确率：", s1_dt)

训练集准确率： 0.9285714285714286


In [11]:
s2_dt = dt.score(x1_test, y1_test)
print("测试集准确率：", s2_dt)

测试集准确率： 0.4084507042253521


In [12]:
# 随机森林算法
rf = RandomForestClassifier(n_estimators = 20, oob_score = True)
rf.fit(x1_train, y1_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [13]:
s1_rf = rf.score(x1_train, y1_train)
print("训练集准确率：", s1_rf)

训练集准确率： 0.9245472837022133


In [14]:
s2_rf = rf.score(x1_test, y1_test)
print("测试集准确率：", s2_rf)

测试集准确率： 0.39436619718309857


In [15]:
#求最高准确率
s1_rf_ = []
s2_rf_ = []
for i in range(0, 50):
    rf = RandomForestClassifier(n_estimators = 20)
    rf.fit(x1_train, y1_train)
    s1_rf = rf.score(x1_train, y1_train)
    s1_rf_.append(s1_rf)
    s2_rf = rf.score(x1_test, y1_test)
    s2_rf_.append(s2_rf)
print("训练集准确率：", max(s1_rf_))
print("测试集准确率：", max(s2_rf_))

训练集准确率： 0.9285714285714286
测试集准确率： 0.43427230046948356


In [20]:
# SVM算法
# rbf核函数，设置数据权重
svc = SVC(kernel='rbf', class_weight='balanced')
# 训练模型
clf = svc.fit(x1_train, y1_train)



In [21]:
s1_svc = svc.score(x1_train, y1_train)
print("训练集准确率：", s1_svc)

训练集准确率： 0.7173038229376257


In [22]:
s2_svc = svc.score(x1_test, y1_test)
print("测试集准确率：", s2_svc)

测试集准确率： 0.10093896713615023


In [23]:
# 调参优化
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
svc = SVC(kernel='rbf', class_weight='balanced',)
c_range = np.logspace(-5, 15, 11, base=2)
gamma_range = np.logspace(-9, 3, 13, base=2)
# 网格搜索交叉验证的参数范围，cv=3,3折交叉
param_grid = [{'kernel': ['rbf'], 'C': c_range, 'gamma': gamma_range}]# 参数列表，表明参数范围
grid = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1)# 寻找最优参数，传递给模型
# 训练模型
clf = grid.fit(x1_train, y1_train)
# 计算测试集精度
score = grid.score(x1_test, y1_test)
print('精度为%s' % score)



精度为0.5727699530516432


In [24]:
score_train = grid.score(x1_train, y1_train)
print("训练集准确率：", score_train)

训练集准确率： 0.9416498993963782


In [25]:
score_test = grid.score(x1_test, y1_test)
print("测试集准确率：", score_test)

测试集准确率： 0.5727699530516432
