In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import math
from math import radians, cos, sin, asin, sqrt
import sys
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier

In [2]:
# Bottom-left corner of the study area (longitude, latitude in decimal degrees)
lb_lon = 121.20120490000001
lb_lat = 31.28175691

# Top-right corner of the study area (longitude, latitude in decimal degrees)
rt_lon = 121.2183295
rt_lat = 31.29339344

# Side length of one grid cell, in metres (it divides haversine() distances);
# the bottom-left cell has grid coordinates (0, 0)
grid_len = 20

In [35]:
# 根据两点经纬度计算两点间距离
def haversine(lon1, lat1, lon2, lat2): # 经度1，纬度1，经度2，纬度2 （十进制度数）  
    """ 
    Calculate the great circle distance between two points  
    on the earth (specified in decimal degrees) 
    """  
    # 将十进制度数转化为弧度  
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])  
  
    # haversine公式  
    dlon = lon2 - lon1   
    dlat = lat2 - lat1   
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2  
    c = 2 * asin(sqrt(a))   
    r = 6371 # 地球平均半径，单位为公里  
    return c * r * 1000  

In [4]:
# Largest east-west distance across the area, in metres
# (bottom-left corner to the point with the top-right longitude on the same latitude)
max_distance_x = haversine(lb_lon, lb_lat, rt_lon, lb_lat)
# X grid coordinate of the top-right cell; uses grid_len rather than a
# repeated literal so it cannot drift from the configured cell size
int(max_distance_x / grid_len)

81

In [5]:
# Largest north-south distance across the area, in metres
# (bottom-left corner to the point with the top-right latitude on the same longitude)
max_distance_y = haversine(lb_lon, lb_lat, lb_lon, rt_lat)
# Y grid coordinate of the top-right cell; uses grid_len rather than a
# repeated literal so it cannot drift from the configured cell size
int(max_distance_y / grid_len)

64

In [6]:
# Rasterise the map: convert a longitude/latitude into grid coordinates, where
# the bottom-left cell is (0, 0) and the top-right cell is (81, 64), then
# flatten the grid coordinates into a grid ID in 0 .. 82 * 65 - 1 = 5329
# (5330 cells; bottom-left is ID 0, top-right is ID 5329).
def ll_to_gridID(lon, lat):
    """
    Map a longitude/latitude (decimal degrees) to the ID of the grid cell
    that contains it.

    The grid origin is (lb_lon, lb_lat) and each cell is grid_len metres
    wide; IDs are laid out row by row as ``X + Y * 82``.

    Points outside the grid are assigned to the nearest edge cell, so the
    result is always a valid ID in 0 .. 5329.
    """
    n_cols = 82  # cells per row (X runs 0 .. 81)
    n_rows = 65  # number of rows (Y runs 0 .. 64)

    # haversine() returns an unsigned distance, so a point west (or south) of
    # the origin would otherwise be mirrored into the grid; pin it to the
    # first column (or row) instead.
    if lon <= lb_lon:
        X = 0
    else:
        X = int(haversine(lon, lb_lat, lb_lon, lb_lat) / grid_len)
        # An X beyond the last column would spill into the next row's IDs.
        X = min(X, n_cols - 1)

    if lat <= lb_lat:
        Y = 0
    else:
        Y = int(haversine(lb_lon, lat, lb_lon, lb_lat) / grid_len)
        Y = min(Y, n_rows - 1)

    # Flatten the grid coordinates row by row.
    return X + Y * n_cols

In [7]:
# Convert a grid ID into the longitude/latitude of the centre of that cell
def gridID_to_ll(gridID):
    """
    Return ``[lon, lat]`` for the centre of the grid cell with the given ID.

    The ID is decoded as ``X = gridID % 82`` and ``Y = int(gridID / 82)``.
    Cell edges and cell centres together split the longitude span between
    lb_lon and rt_lon into 82 * 2 = 164 equal steps and the latitude span
    between lb_lat and rt_lat into 65 * 2 = 130 equal steps; the centre of
    cell X (or Y) sits on the odd step ``2 * X + 1`` (or ``2 * Y + 1``).
    """
    # Grid ID -> grid coordinates
    col = gridID % 82
    row = int(gridID / 82)

    # Size of one half-cell step in degrees
    lon_step = (rt_lon - lb_lon) / 164
    lat_step = (rt_lat - lb_lat) / 130

    # Longitude of the centre of column `col`
    centre_lon = lb_lon + (2 * col + 1) * lon_step
    # Latitude of the centre of row `row`
    centre_lat = lb_lat + (2 * row + 1) * lat_step

    return [centre_lon, centre_lat]

In [8]:
# Join the two tables: match RNCID_i / CellID_i in data_2g against RNCID / CellID
# in 2g_gongcan so that each base station's latitude and longitude is added to data_2g
def merge_data_gongcan():
    """
    Read the two raw CSV files and return data_2g with Lat_i / Lon_i columns
    added for each of the seven RNCID_i / CellID_i pairs (i = 1 .. 7).

    Missing RSSI_i values are replaced by ``-sys.maxsize - 1``; every other
    missing value is replaced by -1.
    """
    mr_table = pd.read_csv('../raw_data/data_2g.csv')
    station_table = pd.read_csv('../raw_data/2g_gongcan.csv')

    for suffix in map(str, range(1, 8)):
        join_keys = ['RNCID_' + suffix, 'CellID_' + suffix]
        # Relabel the station table's four columns positionally so its key
        # columns line up with slot `suffix` of data_2g
        # (assumes the file's column order is RNCID, CellID, Lat, Lon -- TODO confirm).
        station_table.columns = join_keys + ['Lat_' + suffix, 'Lon_' + suffix]
        mr_table = mr_table.merge(station_table, how='left', on=join_keys)

    # Missing signal strengths get a very large negative sentinel.
    missing_rssi = -sys.maxsize - 1
    for suffix in map(str, range(1, 8)):
        rssi_column = 'RSSI_' + suffix
        mr_table[rssi_column] = mr_table[rssi_column].fillna(missing_rssi)

    # Everything still missing becomes -1.
    return mr_table.fillna(-1)

In [9]:
# Label each MR record with the grid ID of its GPS position
def add_gridID(data):
    """
    Add a 'GridID' column computed from each row's Longitude and Latitude.

    The column is written onto the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    def row_to_grid(row):
        return ll_to_gridID(row.Longitude, row.Latitude)

    data['GridID'] = data.apply(row_to_grid, axis=1)
    return data

In [10]:
# Randomly hold out part of the records as a test set (20% by default) and
# train on the rest
def data_train_test_split(data, test_size=0.2, random_state=None):
    """
    Split the labelled data into training and test arrays.

    Parameters
    ----------
    data : DataFrame holding Lon_i, Lat_i, RSSI_i for i = 1 .. 7 and a
        'GridID' column.
    test_size : forwarded to sklearn's train_test_split; the default 0.2
        keeps the original 80% / 20% split.
    random_state : forwarded to train_test_split; the default None gives a
        different split on every call, pass an int for a reproducible one.

    Returns
    -------
    (X_train, X_test, y_train, y_test) where the X arrays hold the 21
    feature columns and the y arrays hold the grid IDs.
    """
    # Feature order: Lon_1, Lat_1, RSSI_1, Lon_2, Lat_2, RSSI_2, ...
    feature_columns = [prefix + str(slot)
                       for slot in range(1, 8)
                       for prefix in ('Lon_', 'Lat_', 'RSSI_')]

    # .values turns the selection into plain numpy arrays
    X = data[feature_columns].values
    y = data['GridID'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [11]:
# Gaussian Naive Bayes classifier (GaussianNB)
def gaussian_nb(X_train, X_test, y_train):
    """Fit a default GaussianNB on the training data and return its predictions for X_test."""
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [12]:
# K-nearest-neighbours classifier (KNeighborsClassifier)
def k_neighbors_classifier(X_train, X_test, y_train):
    """Fit a KNeighborsClassifier with 2 neighbours and return its predictions for X_test."""
    model = KNeighborsClassifier(n_neighbors=2)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [13]:
# Decision tree classifier (DecisionTreeClassifier)
def decision_tree_classifier(X_train, X_test, y_train):
    """Fit an entropy-criterion decision tree (max depth 50) and return its predictions for X_test."""
    model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=50)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [14]:
# Random forest (RandomForestClassifier)
def random_forest_classifier(X_train, X_test, y_train):
    """Fit a random forest of 45 trees (max depth 3) and return its predictions for X_test."""
    model = RandomForestClassifier(max_depth=3, n_estimators=45)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [15]:
# AdaBoost (AdaBoostClassifier)
def ada_boost_classifier(X_train, X_test, y_train):
    """Fit AdaBoost with 150 estimators and learning rate 0.1 and return its predictions for X_test."""
    model = AdaBoostClassifier(n_estimators=150, learning_rate=0.1)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [16]:
# Bagging meta-estimator (BaggingClassifier)
def bagging_classifier(X_train, X_test, y_train):
    """Fit a BaggingClassifier with 40 estimators and return its predictions for X_test."""
    model = BaggingClassifier(n_estimators=40)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [17]:
# GBDT, gradient tree boosting (GradientBoostingClassifier)
def gradient_boosting_classifier(X_train, X_test, y_train):
    """Fit gradient boosting with 40 estimators (max depth 10) and return its predictions for X_test."""
    model = GradientBoostingClassifier(n_estimators=40, max_depth=10)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [42]:
# Measure how far each predicted position is from the true position
# (haversine distance between the two grid-cell centres), sort the errors in
# ascending order, and return their mean
def calculate_error_and_sort(y_pred, y_test):
    """
    Return the mean positioning error, in metres, between predicted and
    true grid IDs.

    Both sequences of grid IDs are converted to cell-centre coordinates
    with gridID_to_ll, paired up position by position, and compared with
    haversine. If the sequences differ in length, the extra entries of the
    longer one are ignored. Only the mean is returned; the sorted errors
    stay local to this function.
    """
    # Grid IDs -> [lon, lat] of the cell centres
    predicted_ll = [gridID_to_ll(grid_id) for grid_id in y_pred]
    actual_ll = [gridID_to_ll(grid_id) for grid_id in y_test]

    # One distance per (prediction, truth) pair; zip stops at the shorter list
    distances = []
    for (pred_lon, pred_lat), (true_lon, true_lat) in zip(predicted_ll, actual_ll):
        distances.append(haversine(pred_lon, pred_lat, true_lon, true_lat))

    # Ascending order
    distances.sort()

    return np.mean(distances)

In [19]:
# Repeat the random train/test split and prediction several times (10 by default)
def train_classifier_and_predict_and_error(data, classifier=None, n_rounds=10,
                                           return_errors=False):
    """
    Repeatedly split the data, train a classifier and predict the test set.

    Parameters
    ----------
    data : labelled DataFrame accepted by data_train_test_split.
    classifier : callable ``(X_train, X_test, y_train) -> y_pred``.
        Defaults to gaussian_nb. The other wrappers in this notebook
        (k_neighbors_classifier, decision_tree_classifier,
        random_forest_classifier, ada_boost_classifier, bagging_classifier,
        gradient_boosting_classifier) share that signature and can be
        passed instead.
    n_rounds : number of independent split / train / predict rounds.
    return_errors : when True, also return the mean error of every round.

    Returns
    -------
    (y_pred, y_test) from the LAST round, as before. With
    ``return_errors=True`` the result is (y_pred, y_test, round_errors),
    where round_errors lists the value of calculate_error_and_sort for
    each round in order.

    Raises
    ------
    ValueError if n_rounds is smaller than 1 (there would be nothing to return).
    """
    if classifier is None:
        # Resolved at call time so the default follows the notebook's current definition.
        classifier = gaussian_nb
    if n_rounds < 1:
        raise ValueError('n_rounds must be at least 1')

    round_errors = []
    for _ in range(n_rounds):
        # Fresh random split for every round
        X_train, X_test, y_train, y_test = data_train_test_split(data)
        y_pred = classifier(X_train, X_test, y_train)

        # The per-round error is only worth computing when the caller wants it.
        if return_errors:
            round_errors.append(calculate_error_and_sort(y_pred, y_test))

    if return_errors:
        return y_pred, y_test, round_errors
    # Predictions and true labels of the final round
    return y_pred, y_test

In [20]:
def main():
    """
    Run the whole pipeline and return the mean positioning error.

    Steps: merge the two raw tables, label every record with its grid ID,
    train and predict with the default classifier, then measure the mean
    distance between predicted and true grid cells.
    """
    # Merge the two tables
    data = merge_data_gongcan()

    # Convert each record's GPS position into a grid ID used as the label
    data = add_gridID(data)

    # Train the classifier and predict the test set
    # (the function defined in this notebook is named
    # train_classifier_and_predict_and_error)
    y_pred_gnb, y_test = train_classifier_and_predict_and_error(data)

    # Hand the result back so the calling cell can display it
    return calculate_error_and_sort(y_pred_gnb, y_test)

In [39]:
# Run the pipeline defined in main()
main()

548.794955672
