In [1]:
import sys, os

import flickrapi
import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

# The project packages live one directory up; extend the path before importing them.
sys.path.append('..')
import nationalparks.clusters as clusters
import nationalparks.secrets as secrets
import nationalparks.parks as parks
import nationalparks as usnp
import scrapper.fetch_images as fetch_images
from update_database import update_clusters

In [2]:
# Park unit code to analyse; the photo records printed further down carry the same
# value in their 'parkunit' field.
parkunit = 'acad'
# Park object from the project-local `nationalparks` package (imported as `usnp`);
# every later cell queries photos and clusters through it.
my_park = usnp.Park(parkunit)

In [3]:
# The first record in the output below has 'labels': 29, so the argument is
# presumably a cluster label — TODO confirm against Park.get_top_photos.
my_park.get_top_photos(29)

[{'_id': '5f01fc64f726a9c76d7fa221',
  'context': 0,
  'dateupload': 1468982546,
  'farm': 9,
  'iconfarm': 5,
  'iconserver': 4120,
  'in_park': True,
  'owner': '52387106@N04',
  'ownername': 'Tone2b',
  'parkunit': 'acad',
  'secret': 'b6523519b3',
  'server': 8741,
  'tags': 'd600 nikon 2470 evening twilight acadia maine water harbor',
  'title': 'Bass Harbor Lighthouse',
  'views': 232,
  'latitude': 44.22186,
  'longitude': -68.33485300000001,
  'core': True,
  'labels': 29,
  'url': 'https://farm9.staticflickr.com/8741/28423359245_b6523519b3.jpg'},
 {'_id': '5f01fc64f726a9c76d7f8858',
  'context': 0,
  'dateupload': 1556640633,
  'farm': 66,
  'iconfarm': 0,
  'iconserver': 0,
  'in_park': True,
  'owner': '89117042@N07',
  'ownername': 'rcribb1',
  'parkunit': 'acad',
  'secret': '7b15f74bf3',
  'server': 65535,
  'tags': nan,
  'title': '128. Bass Harbor Lighthouse, Bass Harbor, Maine',
  'views': 20,
  'latitude': 44.222233,
  'longitude': -68.33761700000001,
  'core': True,


In [None]:
# IPython help lookup (leading `?`) for `usnp.db.photos.aggregate`; it defines nothing
# that later cells use.
# NOTE(review): leftover introspection — consider deleting before sharing the notebook.
?usnp.db.photos.aggregate

In [5]:
# Show 20 (tag, score) pairs from my_park.tf_idf(1) — presumably the scores for cluster 1.
# The sort is ascending, so these are the LOWEST-scoring tags.
# NOTE(review): pass reverse=True if the most distinctive tags are what is wanted.
sorted(my_park.tf_idf(1).items(), key=lambda pair: pair[1])[:20]

[('fall', 0.03655152080372692),
 ('autumn', 0.03988356551787882),
 ('landscape', 0.04210492866064675),
 ('harbor', 0.05129329438755048),
 ('green', 0.05599869926695494),
 ('mdi', 0.07321311422679666),
 ('travel', 0.08192402300165631),
 ('geotagged', 0.08399804890043243),
 ('panorama', 0.08605825203720759),
 ('summer', 0.09309647339341044),
 ('nps', 0.09693550322406656),
 ('hiking', 0.09810142501210997),
 ('square', 0.09853509898683993),
 ('trees', 0.09934584055137563),
 ('forest', 0.10877251186858954),
 ('nikon', 0.11197170339413627),
 ('clouds', 0.1180501672926751),
 ('outdoors', 0.1180501672926751),
 ('red', 0.11859991900558392),
 ('sky', 0.12412863119121392)]

In [None]:
# Preview the first rows of the park's photo table. Later cells pass the same
# get_photos() result to invert_document_frequency, which reads its 'labels' and
# 'tags' columns.
my_park.get_photos().head()

In [None]:
def term_frequency(df, method='term frequency', K=0.5, max_share=0.007):
    """Weight the tags of a set of photos by how often each tag occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``tags`` column of space-separated tag strings. Rows whose
        ``tags`` value is null are ignored.
    method : str
        Weighting scheme applied to the raw counts ``f`` that survive filtering:

        * ``'term frequency'``         -> ``f / sum(f)``
        * ``'log normalization'``      -> ``log(1 + f)``
        * ``'double normalization'``   -> ``0.5 + 0.5 * f / max(f)``
        * ``'double normalization K'`` -> ``K + (1 - K) * f / max(f)``

        Any other value returns the raw (filtered) counts unchanged.
    K : float
        Smoothing constant, used only by ``'double normalization K'``.
    max_share : float
        Tags whose share of all tag occurrences is ``>= max_share`` are
        discarded before weighting. Pass a value above 1 to keep every tag.

    Returns
    -------
    dict
        Maps tag -> weight. Empty when ``df`` has no tags or when the share
        filter removes all of them.
    """
    counts = {}
    for tags in df['tags'].dropna():
        for tag in tags.split(' '):
            # Repeated, leading or trailing spaces yield '' tokens; they are not tags.
            if tag:
                counts[tag] = counts.get(tag, 0) + 1

    # Drop the dominant tags, measured as a share of all tag occurrences.
    total = sum(counts.values())
    counts = {tag: n for tag, n in counts.items() if n / float(total) < max_share}

    # Nothing left to weight. Returning here also keeps max() below from
    # failing on an empty sequence.
    if not counts:
        return counts

    if method == 'term frequency':
        kept_total = sum(counts.values())
        return {tag: n / kept_total for tag, n in counts.items()}

    if method == 'log normalization':
        return {tag: np.log(1 + n) for tag, n in counts.items()}

    if method in ('double normalization', 'double normalization K'):
        # Plain double normalization is the K variant with K fixed at 0.5.
        smoothing = K if method == 'double normalization K' else 0.5
        max_f = float(max(counts.values()))
        return {tag: smoothing + (1 - smoothing) * n / max_f
                for tag, n in counts.items()}

    # Unrecognised method: hand back the raw counts.
    return counts

def invert_document_frequency(df_all):
    """Compute the raw inverse document frequency of every tag.

    Each distinct value of the ``labels`` column is treated as one document.
    For every tag the result holds ``N / n_t``, where ``N`` is the number of
    distinct labels and ``n_t`` the number of labels with at least one photo
    carrying that tag. No logarithm is applied here.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Needs a ``labels`` column and a ``tags`` column of space-separated
        tag strings. Rows with null ``tags`` contribute no tags.

    Returns
    -------
    dict
        Maps tag -> ``N / n_t`` as a float.
    """
    n_documents = df_all['labels'].nunique()

    # Only rows that actually carry tags can contribute to a label's vocabulary.
    tagged = df_all[df_all['tags'].notnull()]

    label_hits = {}
    # groupby leaves out rows whose label is missing.
    for _, label_tags in tagged.groupby('labels', sort=False)['tags']:
        vocabulary = {tag for tags in label_tags for tag in tags.split(' ')}
        for word in vocabulary:
            label_hits[word] = label_hits.get(word, 0) + 1

    return {word: n_documents / float(hits) for word, hits in label_hits.items()}

def tf_idf(tf, idf):
    """Combine term-frequency weights with inverse document frequencies.

    Parameters
    ----------
    tf : dict
        Maps tag -> term-frequency weight.
    idf : dict
        Maps tag -> raw inverse document frequency; the natural logarithm is
        taken here. Must contain every key of ``tf``.

    Returns
    -------
    dict
        Maps each tag of ``tf`` to ``tf[tag] * log(idf[tag])``.

    Raises
    ------
    KeyError
        If a tag in ``tf`` has no entry in ``idf``.
    """
    return {tag: weight * np.log(float(idf[tag])) for tag, weight in tf.items()}

In [None]:
# Cluster 2: weight its tags (double normalization) against the IDF of the whole park.
df_photos = my_park.get_cluster_photos(2)
df_photos_all = my_park.get_photos()
tf = term_frequency(df_photos, method='double normalization')
idf = invert_document_frequency(df_photos_all)
tfidf = tf_idf(tf, idf)

# Ascending sort: the 15 pairs shown are the lowest-scoring tags.
# NOTE(review): use reverse=True if the most distinctive tags are wanted.
sorted(tfidf.items(), key=lambda pair: pair[1])[:15]

In [None]:
# Cluster 3: weight its tags (double normalization) against the IDF of the whole park.
df_photos = my_park.get_cluster_photos(3)
df_photos_all = my_park.get_photos()
tf = term_frequency(df_photos, method='double normalization')
idf = invert_document_frequency(df_photos_all)
tfidf = tf_idf(tf, idf)

# Ascending sort: the 15 pairs shown are the lowest-scoring tags.
# NOTE(review): use reverse=True if the most distinctive tags are wanted.
sorted(tfidf.items(), key=lambda pair: pair[1])[:15]

In [None]:
# Cluster 20: weight its tags (double normalization) against the IDF of the whole park.
df_photos = my_park.get_cluster_photos(20)
df_photos_all = my_park.get_photos()
tf = term_frequency(df_photos, method='double normalization')
idf = invert_document_frequency(df_photos_all)
tfidf = tf_idf(tf, idf)

# Ascending sort: the 15 pairs shown are the lowest-scoring tags.
# NOTE(review): use reverse=True if the most distinctive tags are wanted.
sorted(tfidf.items(), key=lambda pair: pair[1])[:15]