In [307]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import math
from math import radians, cos, sin, asin, sqrt
import sys
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier

In [231]:
# Longitude / latitude of the bottom-left corner of the map
lb_lon, lb_lat = 121.20120490000001, 31.28175691

# Longitude / latitude of the top-right corner of the map
rt_lon, rt_lat = 121.2183295, 31.29339344

# Edge length of one grid cell (the bottom-left cell has grid coordinate (0, 0))
grid_len = 20

In [232]:
# Great-circle distance between two longitude/latitude points
def haversine(lon1, lat1, lon2, lat2):
    """
    Return the great-circle distance, in metres, between two points on the earth.

    All four arguments are decimal degrees, in the order
    (longitude 1, latitude 1, longitude 2, latitude 2).
    """
    # Convert every coordinate from decimal degrees to radians
    lam1, phi1, lam2, phi2 = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    # Haversine formula
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = sin(delta_phi/2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam/2)**2
    central_angle = 2 * asin(sqrt(hav))

    mean_earth_radius_km = 6371
    # Kilometres -> metres
    return central_angle * mean_earth_radius_km * 1000

In [233]:
# Maximum horizontal extent of the map in metres, measured along the bottom edge
max_distance_x = haversine(lb_lon, lb_lat, rt_lon, lb_lat)
# Horizontal grid coordinate of the top-right cell. Divide by grid_len rather
# than a literal 20 so the value follows the configured cell size.
int(max_distance_x / grid_len)

81

In [234]:
# Maximum vertical extent of the map in metres, measured along the left edge
max_distance_y = haversine(lb_lon, lb_lat, lb_lon, rt_lat)
# Vertical grid coordinate of the top-right cell. Divide by grid_len rather
# than a literal 20 so the value follows the configured cell size.
int(max_distance_y / grid_len)

64

In [235]:
# Rasterise the map: turn a longitude/latitude pair into grid coordinates
# (bottom-left cell is (0, 0), top-right cell is (81, 64)), then flatten the
# grid coordinates into a grid ID in 0 .. 82 * 65 - 1 = 5329 (5330 cells in
# total; the bottom-left cell has ID 0 and the top-right cell has ID 5329).
def ll_to_gridID(lon, lat, grid_cols=82):
    """
    Convert a longitude/latitude pair into a flattened grid ID.

    Parameters
    ----------
    lon, lat : float
        Position in decimal degrees.
    grid_cols : int, optional
        Number of cells per grid row, used as the row stride when flattening.
        Defaults to 82, the column count of the map configured in this notebook.

    Returns
    -------
    int
        ``X + Y * grid_cols`` where X and Y are the horizontal and vertical
        grid coordinates of the cell that contains the point.

    Notes
    -----
    Reads the module-level ``lb_lon``, ``lb_lat`` and ``grid_len``.
    ``haversine`` returns a non-negative distance, so a point west or south of
    the bottom-left corner is indistinguishable from its mirror image, and X is
    not clamped, so a point east of the map aliases into the next row. Callers
    are expected to pass points that lie inside the map's bounding box.
    """
    # Distance from the left edge / bottom edge, in whole cells
    X = int(haversine(lon, lb_lat, lb_lon, lb_lat) / grid_len)
    Y = int(haversine(lb_lon, lat, lb_lon, lb_lat) / grid_len)

    # Row-major flattening of the grid coordinates
    return X + Y * grid_cols

In [None]:
def gridID_to_ll(gridID, grid_cols=82, grid_rows=65):
    """
    Convert a flattened grid ID back into the longitude/latitude of its cell centre.

    Parameters
    ----------
    gridID : int
        Flattened grid ID (``X + Y * grid_cols``), expected to be non-negative.
    grid_cols : int, optional
        Number of cells per grid row. Defaults to 82.
    grid_rows : int, optional
        Number of grid rows. Defaults to 65.

    Returns
    -------
    tuple
        ``(lon, lat)`` of the cell centre in decimal degrees.

    Notes
    -----
    Reads the module-level ``lb_lon``, ``lb_lat``, ``rt_lon`` and ``rt_lat``.
    The bounding box is split into ``grid_cols`` x ``grid_rows`` equal steps in
    degrees, whereas ``ll_to_gridID`` measures cells in fixed ``grid_len``-metre
    steps, so the returned centre is an approximation of the true cell centre.
    """
    # Flattened ID -> grid coordinates (inverse of gridID = X + Y * grid_cols)
    Y, X = divmod(gridID, grid_cols)

    # Cell edges and cell centres together cut the width into 2 * grid_cols
    # equal parts and the height into 2 * grid_rows equal parts.
    # Longitude difference of one part:
    delta_lon = (rt_lon - lb_lon) / (2 * grid_cols)
    # Latitude difference of one part:
    delta_lat = (rt_lat - lb_lat) / (2 * grid_rows)

    # Centre longitude of the cells whose horizontal grid coordinate is X
    lon = lb_lon + (1 + 2 * X) * delta_lon
    # Centre latitude of the cells whose vertical grid coordinate is Y
    lat = lb_lat + (1 + 2 * Y) * delta_lat

    return (lon, lat)

In [236]:
# Join the two tables: match RNCID_i / CellID_i of data_2g against RNCID / CellID
# of gongcan_2g so that each base station's latitude/longitude is added to data_2g
def merge_data_gongcan():
    """
    Load the two raw CSV files and attach base-station coordinates to each record.

    For every slot 1..7 the station table is left-joined onto the record table
    on (RNCID_<slot>, CellID_<slot>), adding Lat_<slot> and Lon_<slot> columns.
    Missing RSSI_<slot> values are then replaced by ``-sys.maxsize - 1`` and
    every other missing value by -1.

    Returns
    -------
    pandas.DataFrame
        The merged table with no missing values left.
    """
    mr_table = pd.read_csv('../raw_data/data_2g.csv')
    station_table = pd.read_csv('../raw_data/2g_gongcan.csv')

    slots = [str(n) for n in range(1, 8)]

    for slot in slots:
        key_cols = ['RNCID_' + slot, 'CellID_' + slot]
        # Rename the station columns positionally so they line up with this slot.
        # NOTE(review): assumes the station CSV has exactly four columns in the
        # order RNCID, CellID, Lat, Lon - confirm against the file.
        station_table.columns = key_cols + ['Lat_' + slot, 'Lon_' + slot]
        mr_table = mr_table.merge(station_table, how='left', on=key_cols)

    # Missing signal strengths get a sentinel far below any real value
    missing_rssi = -sys.maxsize - 1
    for slot in slots:
        rssi_col = 'RSSI_' + slot
        mr_table[rssi_col] = mr_table[rssi_col].fillna(missing_rssi)

    # Everything still missing (unmatched stations, empty slots) becomes -1
    return mr_table.fillna(-1)

In [237]:
# Attach a grid ID to every MR record based on its GPS position
def add_gridID(data):
    """
    Add a ``GridID`` column computed from the ``Longitude`` / ``Latitude`` columns.

    The frame is modified in place and also returned.

    The IDs are built by iterating over the two coordinate columns directly
    instead of ``DataFrame.apply(axis=1)``: this avoids constructing a Series
    per row and also works for a frame with no rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Longitude`` and ``Latitude`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object, with the ``GridID`` column set.
    """
    data['GridID'] = [
        ll_to_gridID(lon, lat)
        for lon, lat in zip(data['Longitude'], data['Latitude'])
    ]
    return data

In [238]:
# Randomly pick 80% of the records as the training set and keep the remaining
# 20% as the test set (both the ratio and the seed can be overridden)
def data_train_test_split(data, test_size=0.2, random_state=None):
    """
    Split the merged records into training and test feature/label arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Lon_i``, ``Lat_i``, ``RSSI_i`` for i = 1..7 and ``GridID``.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.2.
    random_state : optional
        Forwarded to ``train_test_split``. Defaults to ``None`` (a different
        split on every call); pass a fixed value to make the split repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by ``train_test_split``.
    """
    # Feature columns in the order Lon_1, Lat_1, RSSI_1, Lon_2, ..., RSSI_7
    feature_columns = [
        prefix + str(slot)
        for slot in range(1, 8)
        for prefix in ('Lon_', 'Lat_', 'RSSI_')
    ]

    # Sample features and labels (.values yields NumPy arrays, not lists)
    X = data[feature_columns].values
    y = data['GridID'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Gaussian Naive Bayes classifier (GaussianNB)
def gaussian_nb(X_train, X_test, y_train):
    """Fit a default GaussianNB on the training split and return its predictions for X_test."""
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
# K-nearest-neighbours classifier (KNeighborsClassifier)
def k_neighbors_classifier(X_train, X_test, y_train):
    """Fit a 2-nearest-neighbours classifier on the training split and return its predictions for X_test."""
    model = KNeighborsClassifier(n_neighbors=2)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
# Decision tree classifier (DecisionTreeClassifier)
def decision_tree_classifier(X_train, X_test, y_train):
    """Fit an entropy-criterion decision tree (max depth 50) and return its predictions for X_test."""
    model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=50)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
# Random forest (RandomForestClassifier)
def random_forest_classifier(X_train, X_test, y_train):
    """Fit a random forest of 45 trees (max depth 3) and return its predictions for X_test."""
    model = RandomForestClassifier(max_depth=3, n_estimators=45)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
# AdaBoost (AdaBoostClassifier)
def ada_boost_classifier(X_train, X_test, y_train):
    """Fit an AdaBoost ensemble (150 estimators, learning rate 0.1) and return its predictions for X_test."""
    model = AdaBoostClassifier(n_estimators=150, learning_rate=0.1)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
# Bagging meta-estimator (BaggingClassifier)
def bagging_classifier(X_train, X_test, y_train):
    """Fit a bagging ensemble of 40 estimators and return its predictions for X_test."""
    model = BaggingClassifier(n_estimators=40)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
# GBDT, gradient tree boosting (GradientBoostingClassifier)
def gradient_boosting_classifier(X_train, X_test, y_train):
    """Fit a gradient-boosting ensemble (40 estimators, max depth 10) and return its predictions for X_test."""
    model = GradientBoostingClassifier(n_estimators=40, max_depth=10)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
def train_classifier_and_predict(data):
    """
    Split the data once, then train every classifier and predict the test split.

    Returns a 7-tuple of prediction arrays in this order: Gaussian naive Bayes,
    k-nearest neighbours, decision tree, random forest, AdaBoost, bagging,
    gradient boosting.

    The true test labels produced by the split are not returned, so callers
    cannot score these predictions from the return value alone.
    """
    # Randomly pick 80% of the records for training, the remaining 20% for testing
    X_train, X_test, y_train, _y_test = data_train_test_split(data)

    # One wrapper per classifier, in the order of the returned tuple
    classifier_wrappers = (
        gaussian_nb,
        k_neighbors_classifier,
        decision_tree_classifier,
        random_forest_classifier,
        ada_boost_classifier,
        bagging_classifier,
        gradient_boosting_classifier,
    )

    return tuple(
        fit_and_predict(X_train, X_test, y_train)
        for fit_and_predict in classifier_wrappers
    )

In [None]:
def main():
    """
    Run the full pipeline and return the 7-tuple of per-classifier predictions.

    Steps: merge the two raw tables, derive each record's grid ID from its GPS
    position as the label, then train every classifier and predict the test split.
    """
    merged = merge_data_gongcan()
    labelled = add_gridID(merged)
    return train_classifier_and_predict(labelled)

In [None]:
# Run the whole pipeline once and keep each classifier's test-split predictions
y_pred_gnb, y_pred_knc, y_pred_dtc, y_pred_rfc, y_pred_abc, y_pred_bc, y_pred_gbc = main()

In [None]:
# Display the Gaussian naive Bayes predictions (grid IDs) for the test split
y_pred_gnb

In [302]:
# Display the k-nearest-neighbours predictions (grid IDs) for the test split
y_pred_knc

array([1021, 3417,   64, ..., 3493,  291, 4813])

In [303]:
# Display the decision-tree predictions (grid IDs) for the test split
y_pred_dtc

array([1021, 3417, 2884, ..., 3493, 1548, 4896])

In [304]:
# Display the random-forest predictions (grid IDs) for the test split
y_pred_rfc

array([2352, 3417, 1020, ..., 3417, 3417, 3417])

In [305]:
# Display the AdaBoost predictions (grid IDs) for the test split
y_pred_abc

array([3417, 3417, 1020, ..., 1020, 1020, 1020])

In [306]:
# Display the bagging-ensemble predictions (grid IDs) for the test split
y_pred_bc

array([1021, 3417, 2719, ..., 3493, 1466, 4813])

In [None]:
# Display the gradient-boosting predictions (grid IDs) for the test split
y_pred_gbc