# 土地经纬度信息获取

本程序目的在于方便进行大批量地理位置信息转换，从地址信息转换成经纬度。具体信息可参考LBS百度云。

操作方式如下：

1. 原始数据中提取所需要的地址信息，行政区信息（县、省等）
2. 把相关地址信息和行政区信息导入到info_data中
3. 运行程序，输出两个结果，第一个为地理位置信息的dataframe,第二个则为出错信息的list.

错误与获取信息失败的处理：

* 请求出错（网络异常等）的记录用 "-" 表示；接口返回无效地址（无法解析出坐标）的记录用 "" 表示
* "-"可以进行再次尝试,fail_list2 中存储了"-"
* ""可以进行再次尝试,fail_list 中存储了""

## 程序解释

* 函数 baidu_gis 负责查询前的预处理：遵循百度 LBS 云 API 接口的相关规定，对查询串进行 URL 编码，并用密钥（sk）计算 sn 签名（中文编码是一个急需解决的大问题）
* 源数据存储在 hdf5 格式中，通过 pandas 的支持函数读写。相关问题可以查看 hdf5 文件说明


In [2]:
import pandas as pd
import numpy as np
import sys 
# from pandas import Series,DataFrame
import urllib
import hashlib
import requests
import math
import re
import json
import copy

In [None]:
## Load the two source workbooks (fm first, then sm) from the data folder
path = "F:\\DATAbase\\land\\HDF\\"
fm_df, sm_df = (
    pd.read_excel(path + workbook)
    for workbook in ('fm_df.xlsx', 'sm_df.xlsx')
)

## GIS函数

In [2]:
### Conversion function
def baidu_gis(queryStr, sk='DTtxldoesco94o9YZT3RuGlKarBGr7Xv',
              host='http://api.map.baidu.com'):
    '''
    Turn unsigned Baidu API query strings into signed request URLs.

    Each query string is the path-and-query part of a GET request, e.g.
    ``/geocoder/v2/?address=百度大厦&output=json&ak=yourak``. For every
    query the function:

    1. percent-encodes it, leaving the reserved characters
       ``/:=&?#+!$,;'@()*[]`` untouched;
    2. appends the secret key ``sk`` to the encoded text;
    3. takes the MD5 hex digest of ``quote_plus`` of that text as ``sn``;
    4. returns ``host + encoded query + "&sn=" + sn``.

    Parameters
    ----------
    queryStr : iterable of str, or a single str
        Unsigned query strings. A single string is treated as one query
        rather than being iterated character by character.
    sk : str, optional
        Secret key appended before hashing.
        NOTE(review): the default is a credential committed to source;
        prefer passing the key in from configuration.
    host : str, optional
        Scheme and host prepended to every signed query.

    Returns
    -------
    list of str
        One signed URL per input query, in input order.
    '''
    # Imported here because a bare ``import urllib`` does not guarantee
    # that the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import quote, quote_plus

    if isinstance(queryStr, str):
        queryStr = [queryStr]

    api_link = []
    for ele in queryStr:
        # Encode the query; characters listed in ``safe`` stay as they are.
        encodedStr = quote(ele, safe="/:=&?#+!$,;'@()*[]")
        # The secret key is appended directly to the encoded query.
        rawStr = encodedStr + sk
        sn = hashlib.md5(quote_plus(rawStr).encode('utf-8')).hexdigest()
        # Send exactly the encoded text that was signed, so the signature
        # always matches the query the server receives.
        api_link.append(host + encodedStr + "&sn=" + sn)

    return api_link


# In[ ]:

# Query-string fragments used by convert(). 'preffix' and 'suffix' wrap the
# whole query; every other key is a column name convert() accepts and maps
# to the request parameter that carries that column's value.
dict_convert={
    # Key spelling kept as-is because convert() looks it up by this name.
    'preffix':'/geocoder/v2/?',
    'suffix':'&output=json&ak=9gTAEoFWvBoKHl3u3dFp5ff7',
    'title': u'title=',
    'address': u'&address=',
    'province': u'&province=',
    'city': u'&city=',
    # Parameter name was misspelled '&disctrict='.
    'district': u'&district=',
}
key_id_list=['title','address','province','city','district']


def convert(data):
    '''
    Build one unsigned geocoder query string per row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column name must be a key of ``dict_convert``. Columns are
        appended to the query in the frame's column order.

    Returns
    -------
    pandas.Series
        Strings of the form ``preffix + <param=value ...> + suffix``,
        indexed like ``data``.

    Missing values become an empty parameter value, so one absent field
    no longer blanks the entire query for that row. Non-string values are
    converted with ``str``.
    '''
    query_info = pd.Series('', index=data.index, dtype=object)
    for ele in data.columns:
        column = data[ele]
        # Stringify first, then blank out the cells that were missing;
        # this avoids the literal text "nan"/"None" in the query.
        values = column.astype(str).where(column.notna(), '')
        query_info = query_info + dict_convert[ele] + values

    return dict_convert['preffix'] + query_info + dict_convert['suffix']


In [None]:
## Address conversion: choose the data set and map its columns to GIS field names
col_name_sm=[u'省',u'市',u'县',u'地址']
col_name_fm=[u'行政区',u'项目位置']
gis_name_sm=['province','city','district','address']
gis_name_fm=['district','address']

# Surrounding whitespace is dropped so that "sm " still selects the sm table.
# Any answer other than 'sm' falls through to the fm table.
flag=input('whether sm or fm').strip()
if flag=='sm':
    # rename() returns a new frame, so the source table is left untouched and
    # no in-place mutation is attempted on a column-subset selection.
    data_geo=sm_df[col_name_sm].rename(columns=dict(zip(col_name_sm,gis_name_sm)))
else:
    data_geo=fm_df[col_name_fm].rename(columns=dict(zip(col_name_fm,gis_name_fm)))


In [6]:
# Build one unsigned query string per row of the selected table, then sign
# each of them into a full request URL.
query_info=convert(data_geo)
geo_link=baidu_gis(query_info)

In [None]:
import queue
import threading
import time
import copy, sys 

# Shared state for the worker threads.
# exitFlag is polled by process_data(); tn sizes the work queue and limits
# how many links are enqueued.
exitFlag, tn = 0, 20
print('we are dealing with: %s' % flag)

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
}
# GIS_result collects one (lat, lng, confidence) tuple per processed link;
# fail_link collects the links whose request raised.
req, GIS_result, fail_link = [], [], []
class myThread(threading.Thread):
    """Thread that hands its queue and request header to process_data()."""

    def __init__(self, threadID, name, q):
        super().__init__()
        self.threadID, self.name, self.q = threadID, name, q
        self.header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
        }

    def run(self):
        """Announce the thread, then run the shared worker loop."""
        print("Starting " + self.name)
        process_data(self.name, self.q, self.header)

def _fetch_location(url, header, retries=2, timeout=10):
    '''
    Request one geocoder URL and return ``(lat, lng, confidence)``.

    A response whose ``status`` is 211 is requested again, up to
    ``retries`` times with a one-second pause before each attempt.
    NOTE(review): 211 is presumably a transient/signature failure code -
    confirm against the API status table.

    If the final response has no usable result/location/confidence the
    function returns ``("", "", "")``. Network errors, timeouts, non-JSON
    bodies and a missing ``status`` key propagate to the caller.
    '''
    content = requests.get(url, headers=header, timeout=timeout).json()
    for _ in range(retries):
        if content[u'status'] != 211:
            break
        print("error occur")
        time.sleep(1)
        content = requests.get(url, headers=header, timeout=timeout).json()
        print(content)

    try:
        result = content['result']
        location = result['location']
        return (location['lat'], location['lng'], result['confidence'])
    except (KeyError, TypeError):
        # The service answered but did not resolve the address.
        return ("", "", "")


def process_data(threadName, q, header):
    '''
    Worker loop: take links from ``q`` and record their coordinates.

    Runs until the module-level ``exitFlag`` becomes truthy. For each link
    one tuple is appended to ``GIS_result``:

    * ``(lat, lng, confidence)`` when the service resolved the address;
    * ``("", "", "")`` when the response carried no usable result;
    * ``("-", "-", "-")`` when the request itself failed, in which case
      the link is also appended to ``fail_link``.

    Parameters
    ----------
    threadName : str
        Label used in the progress message.
    q : queue.Queue
        Queue of request URLs.
    header : dict
        HTTP headers sent with every request.
    '''
    while not exitFlag:
        handled = False
        # The lock is held for the whole fetch on purpose: results are
        # appended in the order links leave the queue, and later code lines
        # the result rows up with the input rows by position.
        with queueLock:
            if not q.empty():
                data = q.get()
                try:
                    temp_result = _fetch_location(data, header)
                except Exception:
                    # Worker boundary: a failed request must not kill the
                    # thread; the link is kept so it can be retried later.
                    fail_link.append(data)
                    temp_result = ("-", "-", "-")
                GIS_result.append(temp_result)
                handled = True

        if handled:
            print("%s processing:  %s" % (threadName, temp_result))
        time.sleep(1)

threadList = ["T1", "T2", "T3","T4"]
queueLock = threading.Lock()
fm_url=[]

# The queue holds at most tn items, and only the first tn links are enqueued
# below. NOTE(review): confirm whether limiting the run to tn links is intended.
workQueue = queue.Queue(tn)
threads = []
threadID = 1

# Create and start the worker threads
for tName in threadList:
    print("create thread")
    thread = myThread(threadID, tName, workQueue)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue; the lock is released even if put() raises
with queueLock:
    for word in geo_link[:tn]:
        workQueue.put(word)

# Wait for the queue to drain. Sleeping between checks keeps the main thread
# from spinning and competing with the workers for CPU time.
while not workQueue.empty():
    time.sleep(0.1)

# Tell the workers it is time to exit
exitFlag = 1

# Wait for every worker to finish its current item
for t in threads:
    t.join()
print("Exiting Main Thread")

# Turn the collected tuples into a table, one row per processed link
GIS_df=pd.DataFrame(GIS_result,columns=['lat','lng','confidence'])
print(GIS_df)

# Rows whose latitude is "" are the links the service could not resolve
print("the num of empty GIS is : ")
print(len(GIS_df[GIS_df['lat']==""]))

# Put the query fields and the coordinates side by side (aligned on index)
GIS_ind=pd.concat([data_geo,GIS_df],axis=1)



if flag=='sm':
    ## attach the coordinates to the selected sm columns
    cand_col_sm=['mindex',u'县',u'地址',u'剩余年限',u'面积',u'转让费',u'容积率']
    sm_GIS=pd.concat([sm_df[cand_col_sm],GIS_df],axis=1)

else:
    ## attach the coordinates to the selected fm columns; the result is
    ## stored as fm_GIS (it was previously written to sm_GIS by mistake)
    cand_col_fm=['mindex',u'行政区',u'项目位置',u'土地使用年限',u'面积(公顷)',u'成交价格(万元)',u'约定容积率上限']
    fm_GIS=pd.concat([fm_df[cand_col_fm],GIS_df],axis=1)

In [None]:
# Rebuild the side-by-side table of query fields and coordinates; the bare
# name on the last line makes the notebook display it.
GIS_ind=pd.concat([data_geo,GIS_df],axis=1)
GIS_ind

## 保存数据

In [None]:
# ##------------------
# ##  save the data 
# ##------------------
# path = "F:\\DATAbase\\land\\HDF\\"
# with pd.HDFStore(path+'LandDATA.h5',  mode='w') as store:
#     print store.keys
#     store.put('sm_GIS',  sm_GIS,  format='f',append=False)
#     store.put('fm_GIS', fm_GIS, format='f',append=False)

# Write the combined table once (the same call used to be issued twice).
# sheet_name is passed by keyword because newer pandas releases deprecate
# passing it positionally.
# NOTE(review): the file and sheet are named "fm" even when the sm table was
# selected - confirm whether the name should follow the chosen data set.
GIS_ind.to_excel(path+'GIS_fm.xlsx',sheet_name='fm')