Taking inspiration from http://charlesleifer.com/blog/using-python-and-k-means-to-find-the-dominant-colors-in-images/,
I'm trying to do some image analysis on the Instagram posts I collected.

There's a handy library at https://github.com/neocortex/paletti/blob/master/paletti/paletti.py that offers several ways of calculating a palette. For now I'll just try its k-means method, which uses scikit-learn.

In [1]:
import numpy as np
import pandas as pd
import pymongo
from pymongo import MongoClient
from bson.objectid import ObjectId
import requests
from io import BytesIO
from pprint import pprint
from collections import namedtuple

from sklearn.cluster import KMeans
from skimage import io, color
from PIL import ImageStat
from PIL import Image
import colorsys

# Client for the MongoDB server at 127.0.0.1, port 11122.
client = MongoClient("mongodb://127.0.0.1:11122")

In [2]:
# Print a single post (first when sorted by created_time, descending)
# to see what a stored entry looks like.
db = client['instagram-london']
coll = db['posts']
cursor = coll.find().sort('created_time', pymongo.DESCENDING).limit(1)
for post in cursor:
    pprint(post)

{'_id': ObjectId('57f7e4447d2f910a32fe3e0b'),
 'attribution': None,
 'caption': {'created_time': '1475790904',
             'from': {'full_name': 'wendy_jm',
                      'id': '7664364',
                      'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/12328524_1563743330622492_1808689803_a.jpg',
                      'username': 'wendy_jm'},
             'id': '17864314741052633',
             'text': 'Lots of lols 😂 @i.am_they.are_it.is #thecomedystore'},
 'comments': {'count': 0},
 'created_time': '1475790904',
 'filter': 'Clarendon',
 'id': '1355354802003762529_7664364',
 'images': {'low_resolution': {'height': 320,
                               'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/14474141_892824824182402_8982149330433474560_n.jpg?ig_cache_key=MTM1NTM1NDgwMjAwMzc2MjUyOQ%3D%3D.2',
                               'width': 320},
            'standard_resolution': {'height': 640,
                                   

There are multiple aspects of the images that could be interesting to visualize. The two functions below extract the brightness (by converting each image to greyscale and taking the mean of that with ImageStat) and the top colors (using k-means, optionally converting them to HSV for better sorting, as per http://www.alanzucconi.com/2015/09/30/colour-sorting/).

In [2]:
def brightness(im_file):
    """Return the mean greyscale value of an image.

    The image is opened with PIL and converted to single-band greyscale
    (mode 'L'); the mean of that band is returned as-is. The value is on
    the greyscale range (expected to be 0-255; ``stat.extrema`` shows the
    actual range of a given image), so divide by 255 to get a brightness
    fraction.
    """
    greyscale = Image.open(im_file).convert('L')
    band_means = ImageStat.Stat(greyscale).mean
    return band_means[0]

In [3]:
def kmeans_palette(fname, k=5, RSIZE=200, color='RGB'):
    """Extract a color palette from an image using k-means clustering.

    Args:
        fname: Path or file-like object accepted by ``Image.open``.
        k: Number of clusters, i.e. palette colors, to extract.
        RSIZE: The image is shrunk in place to fit within RSIZE x RSIZE
            pixels before clustering.
        color: ``'HSV'`` returns every color as an ``[h, s, v]`` list
            produced by ``colorsys.rgb_to_hsv`` from the 0-1 scaled
            cluster centers; any other value returns the RGB cluster
            centers unchanged (squeezed numpy array).

    Returns:
        A ``(colors, percent)`` tuple: the k cluster centers and a float
        array with the fraction of pixels assigned to each cluster, in the
        same order as the centers.
    """
    # Force three channels so greyscale, palette and RGBA images are
    # handled like plain RGB ones; a 2-D greyscale array would otherwise
    # break the three-way shape unpacking below.
    origimg = Image.open(fname).convert('RGB')
    # Image.ANTIALIAS was an alias of LANCZOS and was removed in Pillow 10.
    origimg.thumbnail((RSIZE, RSIZE), Image.LANCZOS)
    img = np.array(origimg)
    h, w, d = img.shape  # numpy image arrays are (rows, columns, channels)
    imarr = np.reshape(img, (w * h, d)).astype('float64')
    # Perform k-means clustering
    kmeans = KMeans(n_clusters=k).fit(imarr)
    labels = kmeans.predict(imarr)
    maincolors = kmeans.cluster_centers_
    rgbColors = maincolors.squeeze()
    # Fraction of pixels per cluster. np.bincount replaces
    # np.histogram(..., normed=True): the `normed` argument no longer
    # exists in current NumPy, and bincount stays aligned with the cluster
    # index even if a cluster ends up without pixels.
    percent = np.bincount(labels, minlength=len(maincolors)).astype('float64')
    percent /= percent.sum()

    if color == 'HSV':
        # colorsys expects RGB components in the 0-1 range.
        outputColors = [list(colorsys.rgb_to_hsv(r / 255, g / 255, b / 255))
                        for r, g, b in maincolors]
    else:
        outputColors = rgbColors
    return outputColors, percent

In [11]:
# Brightness of one sample image, downloaded straight into memory.
sample_response = requests.get('https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/14240783_286379521748291_1428124432_n.jpg?ig_cache_key=MTM0MTU1Mzg2MDIyOTQxNjMwMw%3D%3D.2.l')
brightness(BytesIO(sample_response.content))

124.838251953125

In [7]:
# Check the (min, max) greyscale values of a sample image.
response = requests.get('https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/14240783_286379521748291_1428124432_n.jpg?ig_cache_key=MTM0MTU1Mzg2MDIyOTQxNjMwMw%3D%3D.2.l')
image = BytesIO(response.content)
grayScaleImg = Image.open(image).convert('L')
stat = ImageStat.Stat(grayScaleImg)
print(stat.extrema)

[(0, 255)]


In [8]:
# Per-band mean from ImageStat; the greyscale image has a single band.
print(stat.mean)

[124.838251953125]


In [4]:
# Print four posts (sorted by created_time, descending), projecting only
# the standard-resolution image URL (plus the default _id).
db = client['instagram-london']
coll = db['posts']
url_only = {"images.standard_resolution.url": 1}
cursor = coll.find({}, url_only).sort('created_time', pymongo.DESCENDING).limit(4)
for post in cursor:
    pprint(post)

{'_id': ObjectId('57f7e4447d2f910a32fe3e0b'),
 'images': {'standard_resolution': {'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/14474141_892824824182402_8982149330433474560_n.jpg?ig_cache_key=MTM1NTM1NDgwMjAwMzc2MjUyOQ%3D%3D.2'}}}
{'_id': ObjectId('57f7e4437d2f910a32fe3d3d'),
 'images': {'standard_resolution': {'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/14550153_1667636423474968_1482873201661837312_n.jpg?ig_cache_key=MTM1NTM1NDc4MjcwOTg1NzM0MA%3D%3D.2.l'}}}
{'_id': ObjectId('57f7e4447d2f910a32fe3e0c'),
 'images': {'standard_resolution': {'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/14482690_1777502685856533_1908154128803037184_n.jpg?ig_cache_key=MTM1NTM1NDc4NTI5MzIxNTg4NQ%3D%3D.2'}}}
{'_id': ObjectId('57f7e4437d2f910a32fe3c96'),
 'images': {'standard_resolution': {'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/14582419_1644625489164268_1359390817432633344_n.jpg?ig_cache_ke

In [18]:
# Materialize the same four-post query into a list so the documents can
# be reused after the cursor is exhausted.
cursor = coll.find({}, {"images.standard_resolution.url": 1}).sort('created_time', pymongo.DESCENDING).limit(4)
items = list(cursor)

In [23]:
# Download the image of every collected post and measure its brightness.
brightnessArray = []
for post in items:
    image_url = post['images']['standard_resolution']['url']
    image_bytes = requests.get(image_url).content
    brightnessArray.append(brightness(BytesIO(image_bytes)))
print(brightnessArray)

[52.679462890625, 108.6855712890625, 106.79782470703125, 103.95446533203125]


In [58]:
# Try the palette extraction (HSV output) on the first collected post.
first_url = items[0]['images']['standard_resolution']['url']
first_image = BytesIO(requests.get(first_url).content)
test1 = kmeans_palette(first_image, color='HSV')
print(test1)

([[0.6550723601806312, 0.64783948218361875, 0.10522475585906628], [0.6817250102704665, 0.20594010909495555, 0.82245636394269495], [0.91883548971868545, 0.53705494241243235, 0.53873392440252144], [0.66183724670677047, 0.70183932031599505, 0.27737353023863853], [0.65734228474001843, 0.056792185627654687, 0.93010976639459608]], array([ 0.563675,  0.133625,  0.072625,  0.1776  ,  0.052475]))


In [5]:
def insertColorClusterInfo(cursorLimited):
    """Compute and store the HSV color palette of every post in a cursor.

    For each document the standard-resolution image is downloaded,
    clustered with ``kmeans_palette(..., color='HSV')`` and the result is
    written back to ``db.posts20160911`` as ``images_mainColors`` and
    ``images_mainColorPercentages`` -- the collection and field names the
    other cells of this notebook write and query.

    Args:
        cursorLimited: Iterable of post documents that each contain
            ``_id`` and ``images.standard_resolution.url``.
    """
    for item in cursorLimited:
        response = requests.get(item['images']['standard_resolution']['url'])
        # Skip posts whose image could not be fetched instead of trying to
        # decode an error page as an image.
        if response.status_code != 200:
            continue
        colors, percentages = kmeans_palette(BytesIO(response.content), color='HSV')
        db.posts20160911.update_one(
            {'_id': item['_id']},
            # BSON cannot encode numpy arrays, so convert to lists first.
            {'$set': {'images_mainColors': list(colors),
                      'images_mainColorPercentages': list(percentages)}},
            upsert=False)

In [92]:
# Cursor over at most five documents of the posts20160911 collection.
cursor2 = db.posts20160911.find().limit(5)

In [93]:
# For each post: download the image, extract its HSV palette, print the
# result and store palette and per-color pixel fractions on the document.
for post in cursor2:
    response = requests.get(post['images']['standard_resolution']['url'])
    if response.status_code != 200:
        continue
    palette, fractions = kmeans_palette(BytesIO(response.content), color='HSV')
    post_id = post['_id']
    print(palette, fractions, post_id, type(post_id))
    palette_fields = {
        'images_mainColors': list(palette),
        'images_mainColorPercentages': list(fractions),
    }
    db.posts20160911.update_one({'_id': ObjectId(post_id)}, {'$set': palette_fields}, upsert=False)

[[0.62143042949151328, 0.31349037597969009, 0.18597114778317975], [0.16018416383706358, 0.0040837494739007591, 0.99750557523853989], [0.59102611452724618, 0.55885558681773406, 0.85678518242740342], [0.16303286834912534, 0.099098552460445397, 0.75075658546246482], [0.59681112496629984, 0.10617156633886707, 0.43968443627449227]] [ 0.32155   0.501725  0.0314    0.0494    0.095925] 57f7ea3b7d2f910a321e5485 <class 'bson.objectid.ObjectId'>
[[0.89488476675846607, 6.322461757022352e-05, 0.99830105095996102], [0.68513513513514102, 0.13694405033617763, 0.3864592645568829], [0.7258362293657763, 0.058367879917338994, 0.626807318672151], [0.68086735810249233, 0.3705487448920145, 0.14092965857672518], [0.7857754315879818, 0.025339382874492528, 0.85546174305684219]] [ 0.9211    0.008225  0.024675  0.0143    0.0317  ] 57f7ea3c7d2f910a321e5c31 <class 'bson.objectid.ObjectId'>
[[0.081902626034616302, 0.012313158322373926, 0.97241820388744016], [0.060973153781635071, 0.39639530075636331, 0.4859626391711

In [115]:
# Count the documents with this _id (expected: 1).
# Cursor.count() was removed in PyMongo 4; Collection.count_documents()
# is its replacement.
a = db.posts20160911.count_documents({'_id': ObjectId('57f7ea3b7d2f910a321e5485')})
print(a)

1


In [108]:
# Cursor over the posts that do not have an images_mainColors field yet.
cursor2Limited = db.posts20160911.find({'images_mainColors': {'$exists': False}})

I tried the following, but it runs into a problem: the cursor times out after 10 minutes or so...
```
i = 0
for item in cursor2Limited:
    r = requests.get(item['images']['standard_resolution']['url'])
    if(r.status_code == 200): 
        clustered = kmeans_palette(BytesIO(requests.get(item['images']['standard_resolution']['url']).content), color='HSV')
        colors = clustered[0]
        percentages = clustered[1]
        id = item['_id']
        i += 1
        if(i % 300 == 0):
            print("i: {}, id: {}".format(str(i), str(id)))
        db.posts20160911.update_one({'_id': ObjectId(id)},{'$set': {'images_mainColors': list(colors), 'images_mainColorPercentages': list(percentages)}}
          , upsert=False)
```

In [8]:
# Process every post that has no palette stored yet.
# NOTE(review): batch_size(30) is presumably there so the cursor is
# refreshed often enough not to time out while images are downloaded and
# clustered -- confirm against the server's cursor timeout.
i = 0
for item in db.posts20160911.find({'images_mainColors': {'$exists': False}}).batch_size(30):
    r = requests.get(item['images']['standard_resolution']['url'])
    if r.status_code != 200:
        continue
    # Reuse the response that was just status-checked rather than
    # downloading the same image a second time.
    colors, percentages = kmeans_palette(BytesIO(r.content), color='HSV')
    post_id = item['_id']
    i += 1
    # Progress report every 300 processed posts.
    if i % 300 == 0:
        print("i: {}, id: {}".format(i, post_id))
    db.posts20160911.update_one(
        {'_id': post_id},
        {'$set': {'images_mainColors': list(colors),
                  'images_mainColorPercentages': list(percentages)}},
        upsert=False)

In [113]:
# (Re)create the client for the server at 127.0.0.1:11122 and select the
# 'instagram-london' database.
client = MongoClient("mongodb://127.0.0.1:11122")
db = client['instagram-london']