In [77]:
%matplotlib notebook

import os
import sys
from operator import itemgetter
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from ipyleaflet import (Map, GeoJSON)

sns.set()

from sklearn.cluster import DBSCAN

import matplotlib.dates as mdates
from matplotlib.colors import rgb2hex
import json

def get_geojson(features):
    """Wrap ``features`` in a GeoJSON ``FeatureCollection`` mapping.

    The list is stored by reference (not copied) under the ``'features'`` key.
    """
    collection = dict(type='FeatureCollection')
    collection['features'] = features
    return collection

def save_geojson(features, directory, file_name):
    """Write ``features`` as a GeoJSON FeatureCollection to ``<directory>/<file_name>.geojson``.

    Parameters
    ----------
    features : list
        GeoJSON feature dicts; must be serializable by ``json.dump``.
    directory : str
        Destination directory. It is created (including parents) when missing.
        An empty string writes relative to the current working directory.
    file_name : str
        File name without extension; ``'.geojson'`` is appended.

    The written path is printed once the file has been saved.
    """
    # exist_ok=True removes the check-then-create race of the exists()/makedirs() pair.
    # os.makedirs('') raises, so an empty directory is simply skipped.
    if directory:
        os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, file_name + '.geojson')
    geojson = {
        'type': 'FeatureCollection',
        'features': features
    }
    with open(path, 'w') as outfile:
        json.dump(geojson, outfile, indent=4)
    print('Saved to ' + path)

def to_geojson(df, groupby, lat, lng, cols, dumps=True):
    """Convert the rows of ``df`` into GeoJSON point features, coloured per group.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per point.
    groupby :
        Passed straight to ``df.groupby``; every resulting group gets one colour.
    lat, lng :
        Column labels holding the latitude and longitude of each row.
    cols : iterable
        Column labels copied into each feature's ``properties``. Values are
        converted with ``str()``.
    dumps : bool, default True
        When true, return the collection as a JSON string; otherwise return
        the FeatureCollection dict itself.

    Notes
    -----
    Colours are sampled from matplotlib's ``Spectral`` colormap, one per group,
    and handed out in random order without replacement using NumPy's global
    random state. Seed ``np.random`` beforehand for reproducible colours.
    """

    def get_features(row, color):
        # Build a single GeoJSON Feature for one row.
        properties = {col: str(row[col]) for col in cols}
        # Only the first three colour components are used for the hex string.
        properties['marker-color'] = rgb2hex(color[:3])
        return {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                # GeoJSON positions are ordered longitude first, then latitude.
                'coordinates': [row[lng], row[lat]]
            },
            'properties': properties
        }

    clusters = df.groupby(groupby)

    features = []
    colors = plt.cm.Spectral(np.linspace(0, 1, len(clusters)))
    for _, group in clusters:
        # Pick one of the remaining colours at random, then remove it so that
        # no two groups share a colour.
        i = np.random.randint(colors.shape[0])
        color = colors[i]
        # Iterate rows explicitly rather than appending from inside
        # DataFrame.apply: apply is meant for computing results, and some
        # pandas versions may invoke the function twice on the first row,
        # which would emit a duplicate feature.
        features.extend(get_features(row, color) for _, row in group.iterrows())
        colors = np.delete(colors, i, 0)

    if dumps:
        return json.dumps(get_geojson(features))
    return get_geojson(features)

In [78]:
# Put the parent of the current working directory on the import path (at most once).
nb_dir = os.path.normpath(os.path.join(os.getcwd(), '..'))
os.listdir(nb_dir)  # return value is discarded; the call only fails if the directory cannot be listed
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

# Database file location, relative to the working directory.
f = '../data/UTSEUS-shanghai-dianping.db'

In [82]:
import sqlite3

# Load venues in the '美食' category (or one of its direct children) together with
# their tags; the joins yield one row per (venue, category, tag) combination.
# NOTE(review): the connection is left open here, as in the original cell — close it
# once no later cell needs `conn` / `cursor`.
conn = sqlite3.connect(f)
cursor = conn.cursor()
cursor.execute("""
SELECT tag, longitude, latitude, c.category, avg_rating, avg_price, v.business_id, name, city
FROM venues v
LEFT OUTER JOIN venue_categories vc ON (v.business_id = vc.business_id)
    LEFT OUTER JOIN categories c ON (vc.category = c.category)
        LEFT OUTER JOIN venues_tags vt ON (v.business_id = vt.business_id)
WHERE
 (c.parent = '美食' OR c.category = '美食')
    AND longitude > 115
    AND longitude < 123
    AND avg_price > 0
""")
# Labels follow the SELECT order ('tag' is exposed as 'cn_tag'). Passing them to the
# constructor keeps an empty result set valid: assigning `.columns` afterwards raises
# a length-mismatch ValueError when no rows come back.
venues = pd.DataFrame(
    cursor.fetchall(),
    columns=['cn_tag', 'longitude', 'latitude', 'category', 'avg_rating', 'avg_price', 'business_id', 'name', 'city'],
)

In [83]:
# Preview the first five rows of the raw query result.
venues.head(n=5)

Unnamed: 0,cn_tag,longitude,latitude,category,avg_rating,avg_price,business_id,name,city
0,囊包肉,121.49044,31.288305,新疆菜,30.0,81.0,500000,阿凡提美食娱乐城(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海
1,大盘鸡,121.49044,31.288305,新疆菜,30.0,81.0,500000,阿凡提美食娱乐城(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海
2,羊肉串,121.49044,31.288305,新疆菜,30.0,81.0,500000,阿凡提美食娱乐城(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海
3,菌类,121.445274,31.246317,云南菜,30.0,71.0,500146,傣家村(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海
4,菌菇类,121.445274,31.246317,云南菜,30.0,71.0,500146,傣家村(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海


In [84]:
# Collect every tag of a venue into one list, keyed by business_id.
grouped = venues['cn_tag'].groupby(venues['business_id']).apply(list)

In [87]:
# Remove the per-tag column, then keep the first row for each distinct combination of
# the remaining columns. Dropping 'cn_tag' by name (tolerating its absence) instead of
# slicing the columns by position keeps this cell safe to re-run: the positional
# version would exclude 'longitude' from the duplicate key and then raise KeyError.
venues = venues.drop('cn_tag', axis=1, errors='ignore').drop_duplicates()

In [88]:
# Attach each venue's tag list (indexed by business_id) back onto the de-duplicated rows.
result = venues.merge(grouped.reset_index(), on='business_id')

In [93]:
# Only the last expression of a cell is rendered, so the row count computed on the
# first line is evaluated but never displayed.
len(result)
result.head(n=5)

Unnamed: 0,longitude,latitude,category,avg_rating,avg_price,business_id,name,city,cn_tag
0,121.49044,31.288305,新疆菜,30.0,81.0,500000,阿凡提美食娱乐城(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海,"[囊包肉, 大盘鸡, 羊肉串]"
1,121.445274,31.246317,云南菜,30.0,71.0,500146,傣家村(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海,"[菌类, 菌菇类, 菠萝饭]"
2,121.46014,31.221214,湘菜,35.0,95.0,500156,滴水洞(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海,"[剁椒鱼头, 孜然排骨, 酸豆角肉末]"
3,121.499664,31.239138,自助餐,30.0,296.0,500164,东方明珠旋转餐厅(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海,"[三文鱼, 法式蜗牛, 风景]"
4,121.44383,31.201677,自助餐,35.0,217.0,500201,富豪环球白玉兰西餐厅(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...),上海,"[三文鱼, 牛排, 甜品]"


In [94]:
# Render the merged frame as a list of per-row dicts.
# NOTE(review): this prints every row of `result`; consider previewing a slice instead.
result.to_dict('records')

[{'avg_price': 81.0,
  'avg_rating': 30.0,
  'business_id': 500000,
  'category': '新疆菜',
  'city': '上海',
  'cn_tag': ['囊包肉', '大盘鸡', '羊肉串'],
  'latitude': 31.288305,
  'longitude': 121.49044,
  'name': '阿凡提美食娱乐城(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...)'},
 {'avg_price': 71.0,
  'avg_rating': 30.0,
  'business_id': 500146,
  'category': '云南菜',
  'city': '上海',
  'cn_tag': ['菌类', '菌菇类', '菠萝饭'],
  'latitude': 31.246317,
  'longitude': 121.445274,
  'name': '傣家村(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...)'},
 {'avg_price': 95.0,
  'avg_rating': 35.0,
  'business_id': 500156,
  'category': '湘菜',
  'city': '上海',
  'cn_tag': ['剁椒鱼头', '孜然排骨', '酸豆角肉末'],
  'latitude': 31.221214,
  'longitude': 121.46014,
  'name': '滴水洞(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...)'},
 {'avg_price': 296.0,
  'avg_rating': 30.0,
  'business_id': 500164,
  'category': '自助餐',
  'city': '上海',
  'cn_tag': ['三文鱼', '法式蜗牛', '风景'],
  'latitude': 31.239138,
  'longitude': 121.499664,
  'name': '东方明珠旋转餐厅(这是一条测试商户数据，仅用于测试开发，开发完成后请申请正式数据...)'},
 {'av