# 使用 MongoDB 整理 OpenStreetMap 北京市区的相关数据

## 1. 通过示例程序从原始数据中提取样本

In [None]:
"""通过示例程序，从原始数据中提取除了mp.osm中提取出样本程序，k可以控制程序的抽样距离，当k=1时，样本等于原始数据"""
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "map.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample.osm"

k =1# Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield every completed element whose tag is listed in *tags*.

    The first parse event is consumed to get hold of the root element; after
    each yielded element the root is cleared, so the consumer must finish
    using an element before asking for the next one.

    Args:
        osm_file: File name or file object passed to ``ET.iterparse``.
        tags: Element tags to yield.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(events)
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        # Runs when the consumer requests the next element.
        root.clear()


# Copy every k-th top-level element of OSM_FILE into SAMPLE_FILE, wrapped in a
# minimal <osm> document.
with open(SAMPLE_FILE, 'w') as sample:
    sample.write('<?xml version="1.0" encoding="utf-8"?>\n')
    sample.write('<osm>\n  ')

    for index, elem in enumerate(get_element(OSM_FILE)):
        if index % k != 0:
            continue
        # Serialise before the generator advances (it clears the tree then).
        serialised = ET.tostring(elem).decode("utf-8")
        sample.write(serialised)

    sample.write('</osm>')



# 读取样本内容，并且把样本内容转换成json格式，准备数据入库


In [25]:
"""本程序的内容主要是把样本sample.osm中的XML内容读取并且转换成为json格式，准备通过mongoimport导入到相应的mongoDB数据库中，
在程序中，首先对所有的系统标签进行了分析，通过count_tag(file_in)函数，对系统所有node和way下面的标签进行了统计分析，从统计分析的结果，
抽取了其中典型的标签作为json数据模型中的标签。首先是node,way,relation标签所共有的数据构建出如下格式的json
{ 
 id:
 version:
 type:"node"
 created{ uid:  ,
          user:  ,
          timestamp:    ,
          changeset:    ,

 }
 pos: [lat, lon]
 
 name:
 place:
 amenity:
 cuisine:
 power:
 highway:
 railway:
 public_transport:
 address:{street:     ,
          postcode:     ,
          housenumber
          }
 
 
 
}
{ 
 id:
 version:
 type:"way"
 created{ uid:  ,
          user:  ,
          timestamp:    ,
          changeset:    ,

 }
node_refs=[nd: ,nd2: ,.....]
highway:    ,
name:       ,
railway:    ,
bridge      ,
public_transport         ,
}
{
 id:
 version:
 type:"relation"
 created{ uid:  ,
          user:  ,
          timestamp:    ,
          changeset:    ,
          }
 member{ ref:      , role:    , type:     }

}"""
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

mapping = { "Str":"Street",
           "St":"Street",
           "Lu": "Road",
           "Rd":"Road",
           "Da Jie":"Street",
           "Jie":"Street",
           "Dajie":"Road",
           "jie":"Street",
           "Dalu":"Road"
            }

"""count_tag(file_in)用来计算在xml文件中出现的tag的文件的内容"""

def count_tag(file_in):
    tags={}
    for _, element in ET.iterparse(file_in):
        if element.tag=="node" or element.tag=="way":
            if element.find("tag")!=None:
                for tag in element.iter("tag"):
                    key=tag.attrib["k"]
                    if key in tags:
                        tags[key]=tags[key]+1
                    else:
                        tags[key]=1
                    
    return tags
"""在street字段中出现的缩写问题，通过mapping字典的定义，对street字段中的问题进行修复"""
def repair_street(street):
    if (street.find("Str")>=0) and(street.find("Street")<0):
        street=street.replace("Str",mapping["Str"])
    if (street.find("st")>=0) and(street.find("Street")<0):
        street=street.replace("st",mapping["Str"])
    if street.find("Lu")>=0:
        street=street.replace("Lu",mapping["Lu"])
    if street.find("Rd")>=0:
        street=street.replace("Rd",mapping["Rd"])
    if street.find("Da Jie")>=0:
        street=street.replace("Da Jie",mapping["Da Jie"])
    if street.find("Jie")>=0:
        street=street.replace("Jie",mapping["Jie"])
    if street.find("jie")>=0:
        street=street.replace("jie",mapping["jie"])
    if street.find("Da lu")>=0:
        street=street.replace("Da lu",mapping["Da lu"])
    return street
"""repair_housenumber对设施编号进行清洗，如果编号是数字，则返回编号，否则返回None"""
def repair_housenumber(num):
    if num.isdigit():
       return num
    else:
       return None
# Tag keys copied verbatim onto the shaped document, per element type.
_NODE_TAG_KEYS = ("name", "place", "amenity", "cuisine", "power", "highway",
                  "tourism", "railway", "public_transport")
_WAY_TAG_KEYS = ("highway", "name", "railway", "bridge", "public_transport",
                 "amenity")


def _copy_node_tags(element, node):
    """Copy the selected <tag> children of a node element into *node*."""
    for tag in element.iter("tag"):
        key = tag.get("k")
        value = tag.get("v")
        if key is None or value is None:
            continue  # malformed tag: nothing usable to copy
        if "addr" in key:
            # Create the sub-document once and reuse it; assigning a fresh
            # dict for every addr tag would discard fields stored earlier.
            address = node.setdefault("address", {})
            if key == "addr:street":
                address["street"] = repair_street(value)
            elif key == "addr:postcode":
                # Postcodes are only kept when they are 6 characters long.
                if len(value) == 6:
                    address["postcode"] = value
            elif key == "addr:housenumber":
                number = repair_housenumber(value)
                if number is not None:
                    address["housenumber"] = number
        elif key in _NODE_TAG_KEYS:
            node[key] = value


def _copy_way_tags(element, node):
    """Store the node references and selected <tag> values of a way."""
    node["node_refs"] = [nd.attrib["ref"] for nd in element.iter("nd")]
    for tag in element.iter("tag"):
        key = tag.get("k")
        value = tag.get("v")
        if value is not None and key in _WAY_TAG_KEYS:
            node[key] = value


def _copy_relation_members(element, node):
    """Store the <member> children of a relation."""
    members = [
        {
            "ref": member.get("ref"),
            # An empty or missing role is stored as None.
            "role": member.get("role") or None,
            "type": member.get("type"),
        }
        for member in element.iter("member")
    ]
    # "member" keeps its historical shape (a single dict holding the last
    # member, or {} when there is none); "members" lists every member.
    node["member"] = dict(members[-1]) if members else {}
    node["members"] = members


def shape_element(element):
    """Convert a node, way or relation element into a JSON-ready dict.

    Args:
        element: An ElementTree element.

    Returns:
        ``{}`` for any element other than node/way/relation. Otherwise a dict
        with ``id``, ``version``, ``created`` (changeset, uid, user,
        timestamp) and ``type``, plus:

        * node: ``pos`` as ``[lat, lon]`` floats (``[]`` when either
          attribute is missing), the tag values listed in
          ``_NODE_TAG_KEYS`` and an ``address`` sub-document built from the
          ``addr:street``, ``addr:postcode`` and ``addr:housenumber`` tags.
        * way: ``node_refs`` and the tag values listed in ``_WAY_TAG_KEYS``.
        * relation: ``member`` (the last member, kept for compatibility) and
          ``members`` (all members, in document order).
    """
    if element.tag not in ("node", "way", "relation"):
        return {}

    node = {
        "id": element.get("id"),
        "version": element.get("version"),
        "created": {
            "changeset": element.get("changeset"),
            "uid": element.get("uid"),
            "user": element.get("user"),
            "timestamp": element.get("timestamp"),
        },
        "type": element.tag,
    }

    if element.tag == "node":
        lat = element.get("lat")
        lon = element.get("lon")
        if lat is not None and lon is not None:
            node["pos"] = [float(lat), float(lon)]
        else:
            node["pos"] = []
        _copy_node_tags(element, node)
    elif element.tag == "way":
        _copy_way_tags(element, node)
    else:
        _copy_relation_members(element, node)

    return node
    
def process_map(file_in, pretty = False):
    """Shape every element of an OSM file and dump the results as JSON.

    Each non-empty result of ``shape_element`` is written as one JSON
    document, followed by a newline, to ``<file_in>.json``.

    Args:
        file_in: Path of the OSM XML file to read.
        pretty: When true, each document is indented by two spaces.

    Returns:
        The list of shaped documents, in the order they were written.
    """
    out_path = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    shaped = []
    with codecs.open(out_path, "w") as sink:
        for _, elem in ET.iterparse(file_in):
            doc = shape_element(elem)
            if not doc:
                continue  # element types that shape_element does not handle
            shaped.append(doc)
            sink.write(json.dumps(doc, indent=indent) + "\n")
    return shaped
def test():
    """Run the pipeline on sample.osm and pretty-print one shaped document."""
    # Tag-key frequencies; print them when choosing which keys to model.
    tag_counts = count_tag('sample.osm')
    # print(tag_counts)
    # print(len(tag_counts))

    documents = process_map('sample.osm', True)
    pprint.pprint(documents[100])
test()

city
suburb
suburb
suburb
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
village
town
village
village
yes
neighbourhood
neighbourhood
village
neighbourhood
{'created': {'changeset': '12068223',
             'timestamp': '2012-06-30T17:41:38Z',
             'uid': '376715',
             'user': 'R438'},
 'id': '30993637',
 'pos': [39.9581211, 116.4804559],
 'type': 'node',
 'version': '11'}
