In [1]:
%matplotlib notebook

import os
import sys
from operator import itemgetter
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
#from ipyleaflet import (Map, GeoJSON)

sns.set()

from sklearn.cluster import DBSCAN

import matplotlib.dates as mdates
from matplotlib.colors import rgb2hex
import json

def get_geojson(features):
    return {
        'type': 'FeatureCollection',
        'features': features
    }

def save_geojson(features, directory, file_name):
    if not os.path.exists(directory): os.makedirs(directory)
    f = os.path.join(directory, file_name + '.geojson')
    geojson = {
      'type': 'FeatureCollection',
      'features': features
    }
    with open(f, 'w') as outfile:
        json.dump(geojson, outfile, indent = 4)
    print('Saved to ' + f)

def to_geojson(df, groupby, lat, lng, cols, dumps=True):

    def get_features(row, color):
        properties = { k: str(v) for k,v in zip(cols,[row[col] for col in cols]) }
        properties['marker-color'] = rgb2hex(color[:3])
        return {
            'type': 'Feature',
                'geometry': {
                'type': 'Point',
                'coordinates': [row[lng], row[lat]]
            },
            'properties': properties
        }

    clusters = df.groupby(groupby)

    features = []
    colors = plt.cm.Spectral(np.linspace(0, 1, len(clusters)))
    for name, group in clusters:
        i = np.random.randint(colors.shape[0])
        color = colors[i]
        group.apply(lambda row: features.append(get_features(row, color)), axis=1)
        colors = np.delete(colors, i, 0)

    if dumps:
        return json.dumps(get_geojson(features))
    return get_geojson(features)

In [8]:
nb_dir = os.path.normpath(os.path.join(os.getcwd(), '..'))
os.listdir(nb_dir)
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

f = '../data/UTSEUS-shanghai-flickr.sqlite'

In [9]:
import sqlite3
conn = sqlite3.connect(f)
cursor = conn.cursor()
cursor.execute("""
SELECT * FROM photos
""")
photos = pd.DataFrame(cursor.fetchall())

In [10]:
photos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,12786061,31.239682,121.497266,23804952@N00,15,22726052.0,2004,9,25,Needle in the Sky,sky 2004 architecture shanghai pearltower dscp8,"Pearl TV Tower, Shanghai",http://farm1.staticflickr.com/9/12786061_a6b55...
1,21048909,31.23438,121.494541,40264825@N00,16,22726050.0,2005,3,22,Skywards,holga cityscape shanghai,Holga Shanghai Places,http://farm1.staticflickr.com/16/21048909_3a5c...
2,21048962,31.23438,121.494541,40264825@N00,16,22726050.0,2005,6,23,Bund2,holga cityscape shanghai,Holga Shanghai Places,http://farm1.staticflickr.com/16/21048962_98b0...
3,21048995,31.23438,121.494541,40264825@N00,16,22726050.0,2005,6,23,RoomWithaView,holga cityscape shanghai hotelrooms,Holga Shanghai Places\n\nCamera: Holga 120N\n,http://farm1.staticflickr.com/15/21048995_8cd6...
4,21049047,31.23438,121.494541,40264825@N00,16,22726050.0,2004,11,30,"Bund, Early Morning (Shanghai)",holga cityscape shanghai,Camera: Holga 120N,http://farm1.staticflickr.com/15/21049047_51f8...


In [19]:
# Don't forget to remove "sample" to get full dataset
data_dbscan = photos[[1, 2, 10]].sample(500)
data_dbscan.columns = ['latitude', 'longitude', 'tags']
data_dbscan.head()
X = data_dbscan[['latitude', 'longitude']].values

In [20]:
kms_per_radian = 6371.0088

def compute_dbscan(meters):
    epsilon = (meters * 0.001) / kms_per_radian
    db = DBSCAN(eps=epsilon, algorithm='ball_tree', metric='haversine').fit(np.radians(X))
    return db.labels_

computings = map(compute_dbscan, range(100, 300))
n_computings = map(lambda x: (len(set(x)), x), computings)
labels = max(n_computings, key=itemgetter(0))[1]

for label in set(labels):
    df = data_dbscan[(labels == label)]
    data_dbscan.loc[df.index, 'cluster_num'] = int(label)

In [21]:
data_dbscan.sample(10)

Unnamed: 0,latitude,longitude,tags,cluster_num
42647,31.2,121.5,square lofi squareformat iphoneography instagr...,12.0
25759,31.224041,121.448621,,-1.0
21046,31.240282,121.437388,square squareformat iphoneography instagramapp...,-1.0
17672,31.2187,121.412529,square nashville squareformat iphoneography in...,-1.0
35901,31.237388,121.48668,,1.0
34799,31.206388,121.467222,kids astrid 上海 ilona eplin 日月光中心 富邦华一银行 富邦華一銀行,-1.0
14993,31.24292,121.440306,china temple dragon buddha jade chine boudha,-1.0
37995,31.247709,121.472618,shanghai,-1.0
34674,31.214966,121.419088,china food yum shanghai chinese eat noodle res...,-1.0
31149,31.238659,121.498661,tower skyline night lights bottle shanghai jin...,-1.0


In [25]:
def compute_middle_point(df):
    tags = df['tags'].str.cat(sep=' ')
    return df.assign(tags=tags)[:1]

final_data = data_dbscan.groupby('cluster_num').apply(lambda df: compute_middle_point(df))
final_data

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,tags,cluster_num
cluster_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1.0,6499,31.237793,121.481194,street travel portrait color 50mm shanghai str...,-1.0
0.0,8435,31.209607,121.463788,square squareformat inkwell iphoneography inst...,0.0
1.0,5708,31.239383,121.485511,china shanghai thebund shanghaishi shanghái sq...,1.0
2.0,34231,31.219683,121.471625,square squareformat iphoneography instagramapp...,2.0
3.0,43209,31.230655,121.47378,1dx asia canontse17mmf4l china chinese eos1dx ...,3.0
4.0,14124,31.238881,121.498019,square squareformat inkwell iphoneography inst...,4.0
5.0,3711,31.224986,121.440977,china shanghai square squareformat juno iphone...,5.0
6.0,5632,31.239022,121.489713,china shanghai 中国 上海 puxi 浦西 shanghai 中国 上海 外灘...,6.0
7.0,7531,31.236305,121.471641,world mms china shanghai sony bund silkfactory...,7.0
8.0,8814,31.222513,121.470152,china holiday shanghai egg eat brunch xintiand...,8.0


In [23]:
blacklist = [
    "chine",
    "shanghai",
    "travel",
    "square",
    "city",
    "china",
    "cn",
    "squareformat",
    "asia",
    "uploaded:by=instagram",
    "shanghaishi",
    "iphoneography",
    "chinese",
    "instagramapp",
    "internations",
    "street",
    u'上海',
    u'中国',
  ]

In [26]:
for word in blacklist:
    final_data['tags'] = final_data['tags'].apply(lambda tags: tags.replace(word, ''))
final_data

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,tags,cluster_num
cluster_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1.0,6499,31.237793,121.481194,portrait color 50mm photography portrait ol...,-1.0
0.0,8435,31.209607,121.463788,format inkwell four:venue=50482debe4b0ac40...,0.0
1.0,5708,31.239383,121.485511,thebund shi shanghái format old houses ...,1.0
2.0,34231,31.219683,121.471625,format,2.0
3.0,43209,31.230655,121.47378,1dx canontse17mmf4l se eos1dx photography w...,3.0
4.0,14124,31.238881,121.498019,format inkwell four:venue=4bee4c152c082d7f...,4.0
5.0,3711,31.224986,121.440977,format juno format format,5.0
6.0,5632,31.239022,121.489713,puxi 浦西 外灘 浦東 river lights nightshot ...,6.0
7.0,7531,31.236305,121.471641,world mms sony bund silkfactory nex7 kitc...,7.0
8.0,8814,31.222513,121.470152,holiday egg eat brunch xintiandi benedict ka...,8.0


In [42]:
documents = final_data.to_dict('records')
for document in documents:
    document['tags'] = document['tags'].split(' ')
    document['tags'] = list(filter(None, document['tags']))
#documents[0]

In [43]:
import pymongo
client = pymongo.MongoClient('localhost', 27017)
db = client.explorify
flickr = db.flickr

In [None]:
flickr.insert_many(documents)

In [44]:
flickr.find().next()

{'_id': ObjectId('5a349c42c7553bc405031317'),
 'cluster_num': -1.0,
 'latitude': 31.242725,
 'longitude': 121.482391,
 'tags': ['architecture',
  'golden',
  'artist',
  'hipster',
  'historic',
  'era',
  '1933',
  'laochangfang',
  '2016',
  'art',
  'shi',
  'bobby',
  'zucco',
  'bobbyzucco',
  'pedrozucco',
  'graffiti',
  'arte',
  'calle',
  'rue',
  'format',
  'juno',
  'geotagged',
  'kmtoin',
  '20150912',
  '媽咪泰料',
  'geo:lat=31216154',
  'geo:lon=121470016',
  'marathon',
  '马拉松',
  'dukong',
  '杜空',
  '2015国际马拉松',
  'internationalmarathon2015',
  'circus',
  'format',
  'food',
  'life',
  'yantai',
  'urban',
  'sonyrx1',
  'format',
  'lark',
  'four:venue=5023c823e4b0862bdc3f9579',
  'architecture',
  'renzopiano',
  'piecebypiece',
  'winter',
  'water',
  'river',
  'canal',
  'ancient',
  'historic',
  'rivertown',
  'zhujiajiao',
  'wintersun',
  'blue',
  'highway',
  'futuristic',
  'iphone5sbackcamera415mmf22',
  'library',
  'libraries',
  'conferences',
  '201

In [306]:
cluster={}
for index, row in data_dbscan.sample(n=100).iterrows():
    if row['cluster_num'] != -1.0:
        if row['cluster_num'] not in cluster:
            cluster[row['cluster_num']] = [[row['latitude'],row['longitude']]]
        else:
            cluster[row['cluster_num']] += [[row['latitude'],row['longitude']]]
        

In [307]:
for num,coord in cluster.items():
    for co in coord:
        print num,co

1.0 [31.240483000000001, 121.48523]
1.0 [31.236618, 121.484241]
1.0 [31.239722, 121.48611099999999]
1.0 [31.240542999999999, 121.485868]
1.0 [31.238038, 121.486069]
1.0 [31.240141000000001, 121.48592499999999]
1.0 [31.240119, 121.486277]
1.0 [31.239722, 121.48611099999999]
2.0 [31.222249999999999, 121.471947]
2.0 [31.221488000000001, 121.46981700000001]
2.0 [31.223561, 121.469964]
2.0 [31.223247000000001, 121.470552]
3.0 [31.216691000000001, 121.455917]
4.0 [31.240911000000001, 121.49896099999999]
4.0 [31.240911000000001, 121.49896099999999]
5.0 [31.230277999999998, 121.470556]
6.0 [31.232886000000001, 121.464277]
6.0 [31.233004999999999, 121.463791]
6.0 [31.233332999999998, 121.46466599999999]
8.0 [31.238710000000001, 121.49055]
10.0 [31.236666, 121.503055]
10.0 [31.236663, 121.503078]
11.0 [31.209605, 121.464088]
11.0 [31.210132999999999, 121.463944]
11.0 [31.209804999999999, 121.465386]
12.0 [31.264709, 121.48102400000001]
12.0 [31.264709, 121.48102400000001]
13.0 [31.22560199999999

In [308]:
tags={}
conn = sqlite3.connect(f)
c = conn.cursor()
for num,coord in cluster.items():
    for co in coord:
        c.execute('''SELECT tags FROM photos WHERE latitude == :lat AND longitude == :long''',{'lat': co[0],'long': co[1]})
        conn.commit()
        rows = c.fetchall()
        if rows[0][0] != None:
            if num not in tags:
                tags[num] = rows[0]
            else:
                tags[num] += rows[0]

In [309]:
blacklist = [
    "chine",
    "shanghai",
    "travel",
    "square",
    "city",
    "china",
    "cn",
    "squareformat",
    "asia",
    "uploaded:by=instagram",
    "shanghaishi",
    "iphoneography",
    "chinese",
    "instagramapp",
    "internations",
    "street",
    u'上海',
    u'中国',
  ]
import operator

In [310]:
finaltag={}
for num,tags in tags.items():
    wordcount={}
    for word in tags[0].split():
        is_blacklisted = False
        for blword in blacklist:
            if blword == word:
                is_blacklisted = True
        if is_blacklisted == False:
            if word not in wordcount:
                wordcount[word] = 1
            else:
                wordcount[word] += 1
    sorted_word = sorted(wordcount.items(), key=operator.itemgetter(1))
    popular_tags = sorted_word[-5:]
    for popular_tag in popular_tags:
        print popular_tag[0],num
        if num not in finaltag:
                finaltag[num] = [popular_tag[0]]
        else:
            finaltag[num] += [popular_tag[0]]

downtown 1.0
shopping 1.0
streetlife 1.0
cloudy 1.0
morning 1.0
ancientchinesearmies 2.0
matchlock 2.0
神机营 2.0
ming 2.0
night 4.0
cityscape 4.0
tower 4.0
wayuphigh 4.0
cityatnight 4.0
lofi 5.0
上海木鴨梨露天餐廳 6.0
geo:lat=31232885 6.0
20150822 6.0
kmtoin 6.0
geotagged 6.0
brannan 11.0
foursquare:venue=4c330e3366e40f473255c78b 11.0
1977 12.0
2016上海培训及杭州 16.0
lorianderik 19.0
lori 19.0
erik 19.0
chinaspree 19.0


In [311]:
print finaltag

{1.0: [u'downtown', u'shopping', u'streetlife', u'cloudy', u'morning'], 2.0: [u'ancientchinesearmies', u'matchlock', u'\u795e\u673a\u8425', u'ming'], 4.0: [u'night', u'cityscape', u'tower', u'wayuphigh', u'cityatnight'], 5.0: [u'lofi'], 6.0: [u'\u4e0a\u6d77\u6728\u9d28\u68a8\u9732\u5929\u9910\u5ef3', u'geo:lat=31232885', u'20150822', u'kmtoin', u'geotagged'], 11.0: [u'brannan', u'foursquare:venue=4c330e3366e40f473255c78b'], 12.0: [u'1977'], 16.0: [u'2016\u4e0a\u6d77\u57f9\u8bad\u53ca\u676d\u5dde'], 19.0: [u'lorianderik', u'lori', u'erik', u'chinaspree']}


In [312]:
import pymongo
client = pymongo.MongoClient('localhost', 27017)
db = client.explorify
flickr = db.flickr

In [313]:
dico_tags =[]
for key in finaltag:
    dico_tag = {}
    dico_tag[str(int(key))] = finaltag[key]
    dico_tags.append(dico_tag)
print dico_tags
flickr.insert_many(dico_tags)

[{'1': [u'downtown', u'shopping', u'streetlife', u'cloudy', u'morning']}, {'2': [u'ancientchinesearmies', u'matchlock', u'\u795e\u673a\u8425', u'ming']}, {'4': [u'night', u'cityscape', u'tower', u'wayuphigh', u'cityatnight']}, {'5': [u'lofi']}, {'6': [u'\u4e0a\u6d77\u6728\u9d28\u68a8\u9732\u5929\u9910\u5ef3', u'geo:lat=31232885', u'20150822', u'kmtoin', u'geotagged']}, {'11': [u'brannan', u'foursquare:venue=4c330e3366e40f473255c78b']}, {'12': [u'1977']}, {'16': [u'2016\u4e0a\u6d77\u57f9\u8bad\u53ca\u676d\u5dde']}, {'19': [u'lorianderik', u'lori', u'erik', u'chinaspree']}]


<pymongo.results.InsertManyResult at 0x11fda1710>