In [124]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import math
from math import radians, cos, sin, asin, sqrt
import sys
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [46]:
# Longitude / latitude of the bottom-left corner of the study area (decimal degrees)
lb_lon = 121.20120490000001
lb_lat = 31.28175691

# Longitude / latitude of the top-right corner of the study area (decimal degrees)
rt_lon = 121.2183295
rt_lat = 31.29339344

# Edge length of one grid cell, in the same unit haversine() returns (metres);
# the bottom-left cell has grid coordinates (0, 0)
grid_len = 20

In [47]:
# Distance between two points given their longitude / latitude
def haversine(lon1, lat1, lon2, lat2):
    """
    Return the great-circle distance, in metres, between two points on the
    earth using the haversine formula.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Non-negative distance in metres on a sphere of radius 6371 km.
    """
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make asin() raise ValueError; clamp to the valid domain.
    # For a <= 1 this leaves the value, and therefore the result, untouched.
    c = 2 * asin(sqrt(min(1.0, a)))
    r = 6371  # mean earth radius, in kilometres
    return c * r * 1000  # kilometres -> metres

In [48]:
# Maximum east-west extent of the study area in metres (both points are placed at latitude 0)
max_distance_x = haversine(lb_lon, 0, rt_lon, 0)
# Grid x-coordinate of the top-right cell; uses grid_len rather than a repeated literal 20
int(max_distance_x / grid_len)

95

In [49]:
# Maximum north-south extent of the study area in metres (both points are placed at longitude 0)
max_distance_y = haversine(0, lb_lat, 0, rt_lat)
# Grid y-coordinate of the top-right cell; uses grid_len rather than a repeated literal 20
int(max_distance_y / grid_len)

64

In [187]:
# Rasterise the map: convert a longitude / latitude pair into grid coordinates, where the
# bottom-left cell is (0, 0) and, for this notebook's bounding box, the top-right cell is (95, 64).
# The grid coordinates are then flattened into a grid ID: 96 columns x 65 rows = 6240 cells,
# numbered from 0 (bottom-left) to 6239 (top-right).
def ll_to_gridID(lon, lat):
    """
    Map a longitude / latitude pair to the ID of the grid cell containing it.

    The cell column (X) and row (Y) are the whole number of ``grid_len``-sized
    steps between the point and the bottom-left corner (``lb_lon``, ``lb_lat``),
    measured with ``haversine`` along latitude 0 and longitude 0 respectively.
    The ID is ``X + Y * n_cols`` where ``n_cols`` is the number of columns
    spanned by the bounding box (``lb_lon`` .. ``rt_lon``).

    Parameters
    ----------
    lon, lat : float
        Position in decimal degrees.

    Returns
    -------
    int
        Grid ID; 0 for the bottom-left cell.

    Notes
    -----
    Points outside the bounding box are not validated. ``haversine`` returns an
    unsigned distance, so a point west or south of the origin is mirrored into
    the grid, and a point east of ``rt_lon`` yields an ID that belongs to
    another row.
    """
    # Longitude / latitude -> grid coordinates
    X = int(haversine(lon, 0, lb_lon, 0) / grid_len)
    Y = int(haversine(0, lat, 0, lb_lat) / grid_len)

    # Row stride: index of the right-most column plus one. Derived from the
    # bounding box so it stays correct if grid_len or the corners change
    # (it evaluates to 96 for the notebook's current constants).
    n_cols = int(haversine(lb_lon, 0, rt_lon, 0) / grid_len) + 1

    # Grid coordinates -> grid ID
    gridID = X + Y * n_cols

    return gridID

In [188]:
# Merge the two tables: match RNCID_i / CellID_i in data_2g against RNCID / CellID in gongcan_2g
# so that each base station's latitude / longitude is added to data_2g
def merge_data_gongcan(data_path='../raw_data/data_2g.csv',
                       gongcan_path='../raw_data/2g_gongcan.csv'):
    """
    Load the measurement table and the base-station table and join them.

    For each of the seven station slots ``i`` (1..7) the station table is
    left-joined onto the measurement table on ``RNCID_i`` / ``CellID_i``,
    adding ``Lat_i`` and ``Lon_i`` columns.

    Parameters
    ----------
    data_path : str, optional
        CSV with the measurement records; must contain ``RNCID_i``,
        ``CellID_i`` and ``RSSI_i`` for i = 1..7.
    gongcan_path : str, optional
        CSV with the station parameters. Its columns are renamed
        positionally, so it must have exactly four columns, assumed to be in
        the order RNCID, CellID, Lat, Lon.

    Returns
    -------
    pandas.DataFrame
        The joined table with no missing values: missing ``RSSI_i`` entries
        are replaced by ``-sys.maxsize - 1`` and every other missing entry
        by ``-1``.
    """
    data_2g = pd.read_csv(data_path)
    gongcan_2g = pd.read_csv(gongcan_path)

    for i in range(1, 8):
        suffix = str(i)
        key_cols = ['RNCID_' + suffix, 'CellID_' + suffix]
        # Rename the station table's columns so they line up with slot i of data_2g
        gongcan_2g.columns = key_cols + ['Lat_' + suffix, 'Lon_' + suffix]
        data_2g = pd.merge(data_2g, gongcan_2g, how='left', on=key_cols)

    # Missing RSSI_1 .. RSSI_7 readings get the sentinel -sys.maxsize - 1
    for j in range(1, 8):
        rssi_col = 'RSSI_' + str(j)
        data_2g[rssi_col] = data_2g[rssi_col].fillna(-sys.maxsize - 1)

    # Every remaining missing value becomes -1
    data_2g = data_2g.fillna(-1)
    return data_2g

In [189]:
# Add a grid ID to each MR record based on its GPS position
def add_gridID(data):
    """
    Add a ``GridID`` column computed from each row's ``Longitude`` / ``Latitude``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Longitude`` and ``Latitude`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    # Walk the two columns directly instead of DataFrame.apply(axis=1): this
    # avoids building a Series per row and always yields a plain list, so the
    # assignment also works when the frame has no rows.
    data['GridID'] = [
        ll_to_gridID(lon, lat)
        for lon, lat in zip(data['Longitude'], data['Latitude'])
    ]
    return data

In [190]:
# Randomly pick 80% of the records as the training set and keep the remaining 20% as the test set
def data_train_test_split(data, test_size=0.2, random_state=None):
    """
    Split the merged table into train / test feature matrices and labels.

    Features are, for each station slot i = 1..7 and in this order,
    ``Lon_i``, ``Lat_i``, ``RSSI_i`` (21 columns); the label is ``GridID``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the 21 feature columns and ``GridID``.
    test_size : float, optional
        Passed to ``train_test_split``; defaults to 0.2.
    random_state : optional
        Passed to ``train_test_split``. The default ``None`` keeps the split
        different on every call; pass a fixed value to make it repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Feature columns, grouped per station slot
    feature_cols = [
        prefix + '_' + str(i)
        for i in range(1, 8)
        for prefix in ('Lon', 'Lat', 'RSSI')
    ]

    # Sample features and labels, taken from the DataFrame via .values
    X = data[feature_cols].values
    y = data['GridID'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [204]:
# Gaussian Naive Bayes classifier (GaussianNB)
def gaussiannb(X_train, X_test, y_train):
    """
    Fit a fresh GaussianNB on the training data and predict the test samples.

    Returns whatever ``predict`` yields for ``X_test``.
    """
    fitted = GaussianNB().fit(X_train, y_train)
    return fitted.predict(X_test)

In [205]:
def main():
    """
    Run the pipeline end to end and return the GaussianNB predictions.

    Steps: merge the two tables, label every record with its grid ID,
    split into train / test sets, then fit and predict with ``gaussiannb``.
    The test labels produced by the split are not used here.
    """
    # Merge the two tables, then derive the grid ID label from each record's GPS position
    labelled = add_gridID(merge_data_gongcan())

    # Random 80% / 20% train / test split; the test labels are discarded
    X_train, X_test, y_train, _ = data_train_test_split(labelled)

    # Predictions of the Gaussian Naive Bayes classifier on the test samples
    return gaussiannb(X_train, X_test, y_train)

In [206]:
# Run the whole pipeline; main() returns the classifier's predictions for the test samples
y_pred_gnb = main()