In [55]:
import os
import io
import zipfile

import numpy as np
import pandas as pd
import geopandas as gpd

In [56]:
# Build one POI table from every .xlsx workbook inside the archive, derive
# three category-level columns from 'type', and export the result as CSV.
zip_path = 'Chengdu Amap POI GCJ02 2017.zip'

# Columns read from each workbook; any other columns in the sheets are ignored.
POI_COLUMNS = ['name', 'type', 'address', 'lng', 'lat', 'province', 'city', 'district']

# Per-workbook frames, concatenated once after the loop.
frames = []

with zipfile.ZipFile(zip_path) as zf:
    file_list = [name for name in zf.namelist() if name.endswith('.xlsx')]

    # Fail early with a clear message: pd.concat on an empty list only
    # reports "No objects to concatenate".
    if not file_list:
        raise ValueError(f'No .xlsx files found in {zip_path!r}')

    for file_name in file_list:
        # Open each archive member in a context manager so its handle is
        # closed as soon as the workbook has been parsed.
        with zf.open(file_name) as member:
            data = pd.read_excel(member, header=0, usecols=POI_COLUMNS)

        frames.append(data)

data_all = pd.concat(frames, axis=0, ignore_index=True)
data_all = data_all.rename(columns={'lng': 'GCJ02_lng', 'lat': 'GCJ02_lat'})


# Some points belong to multiple categories, separated by "|".
data_all['POI_type_merge'] = data_all['type'].str.split('|')
# .str.len() counts the list elements and, unlike .apply(len), does not raise
# on rows whose 'type' is missing.
print('\n Number of points belonging to multiple categories: \n',
      data_all['POI_type_merge'].str.len().value_counts())


# Only use the first category; its levels are separated by ";".
data_all['POI_type_merge'] = data_all['POI_type_merge'].str.get(0).str.split(';')


# Number of levels found in the first category of each point.
print('\n Number of points belonging to multiple category levels: \n',
      data_all['POI_type_merge'].str.len().value_counts())


# One column per category level: POI_type_1, POI_type_2, POI_type_3.
for level in range(3):
    data_all[f'POI_type_{level + 1}'] = data_all['POI_type_merge'].str.get(level)

# The intermediate list column is not part of the exported table.
data_all = data_all.drop(['POI_type_merge'], axis=1)
data_all.to_csv('Chengdu_Amap_POI_GCJ02_2017.csv', index=False)


 Number of points belonging to multiple categories: 
 POI_type_merge
1    339879
2      3924
3        61
Name: count, dtype: int64

 Number of points belonging to multiple category levels: 
 POI_type_merge
3    343864
Name: count, dtype: int64


In [38]:
# Distinct raw 'type' strings, re-indexed from 0, each split on "|" into a
# list of its parts.
poi_type = (
    data_all['type']
    .drop_duplicates()
    .reset_index(drop=True)
    .str.split('|')
)